From 179be588c25dccaa963df9c9c104fc6229435483 Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Fri, 18 Mar 2011 22:37:00 +0000 Subject: Initial checkin. git-svn-id: http://leveldb.googlecode.com/svn/trunk@2 62dab493-f737-651d-591e-8d6aee1b9529 --- AUTHORS | 8 + Android.mk | 64 +++ Application.mk | 6 + LICENSE | 27 + Makefile | 134 +++++ README | 51 ++ TODO | 23 + db/builder.cc | 97 ++++ db/builder.h | 36 ++ db/corruption_test.cc | 366 +++++++++++++ db/db_bench.cc | 376 ++++++++++++++ db/db_impl.cc | 1195 +++++++++++++++++++++++++++++++++++++++++++ db/db_impl.h | 192 +++++++ db/db_iter.cc | 412 +++++++++++++++ db/db_iter.h | 26 + db/db_test.cc | 963 ++++++++++++++++++++++++++++++++++ db/dbformat.cc | 152 ++++++ db/dbformat.h | 198 +++++++ db/dbformat_test.cc | 127 +++++ db/filename.cc | 154 ++++++ db/filename.h | 92 ++++ db/filename_test.cc | 156 ++++++ db/log_format.h | 35 ++ db/log_reader.cc | 172 +++++++ db/log_reader.h | 75 +++ db/log_test.cc | 361 +++++++++++++ db/log_writer.cc | 101 ++++ db/log_writer.h | 48 ++ db/memtable.cc | 109 ++++ db/memtable.h | 69 +++ db/repair.cc | 396 ++++++++++++++ db/skiplist.h | 378 ++++++++++++++ db/skiplist_test.cc | 378 ++++++++++++++ db/snapshot.h | 66 +++ db/table_cache.cc | 94 ++++ db/table_cache.h | 49 ++ db/version_edit.cc | 282 ++++++++++ db/version_edit.h | 118 +++++ db/version_edit_test.cc | 50 ++ db/version_set.cc | 1003 ++++++++++++++++++++++++++++++++++++ db/version_set.h | 290 +++++++++++ db/write_batch.cc | 164 ++++++ db/write_batch_internal.h | 73 +++ db/write_batch_test.cc | 110 ++++ doc/doc.css | 89 ++++ doc/impl.html | 222 ++++++++ doc/index.html | 508 ++++++++++++++++++ doc/log_format.txt | 72 +++ doc/table_format.txt | 61 +++ include/cache.h | 99 ++++ include/comparator.h | 61 +++ include/db.h | 137 +++++ include/env.h | 293 +++++++++++ include/iterator.h | 95 ++++ include/options.h | 203 ++++++++ include/slice.h | 104 ++++ include/status.h | 86 ++++ include/table.h | 67 +++ include/table_builder.h | 86 ++++ include/write_batch.h | 49 ++ leveldb.gyp | 329 ++++++++++++ port/README | 10 + port/port.h | 21 + port/port_android.cc | 65 +++ port/port_android.h | 131 +++++ port/port_chromium.cc | 83 +++ port/port_chromium.h | 104 ++++ port/port_example.h | 119 +++++ port/port_posix.cc | 50 ++ port/port_posix.h | 108 ++++ port/sha1_portable.cc | 298 +++++++++++ port/sha1_portable.h | 25 + port/sha1_test.cc | 55 ++ table/block.cc | 261 ++++++++++ table/block.h | 43 ++ table/block_builder.cc | 109 ++++ table/block_builder.h | 57 +++ table/format.cc | 131 +++++ table/format.h | 103 ++++ table/iterator.cc | 68 +++ table/iterator_wrapper.h | 64 +++ table/merger.cc | 143 ++++++ table/merger.h | 26 + table/table.cc | 175 +++++++ table/table_builder.cc | 224 ++++++++ table/table_test.cc | 808 +++++++++++++++++++++++++++++ table/two_level_iterator.cc | 182 +++++++ table/two_level_iterator.h | 34 ++ util/arena.cc | 68 +++ util/arena.h | 68 +++ util/arena_test.cc | 68 +++ util/cache.cc | 253 +++++++++ util/cache_test.cc | 169 ++++++ util/coding.cc | 194 +++++++ util/coding.h | 104 ++++ util/coding_test.cc | 173 +++++++ util/comparator.cc | 72 +++ util/crc32c.cc | 332 ++++++++++++ util/crc32c.h | 45 ++ util/crc32c_test.cc | 86 ++++ util/env.cc | 77 +++ util/env_chromium.cc | 608 ++++++++++++++++++++++ util/env_posix.cc | 609 ++++++++++++++++++++++ util/env_test.cc | 102 ++++ util/hash.cc | 45 ++ util/hash.h | 19 + util/histogram.cc | 128 +++++ util/histogram.h | 41 ++ util/logging.cc | 81 +++ util/logging.h | 47 ++ 
util/mutexlock.h | 39 ++ util/options.cc | 29 ++ util/random.h | 59 +++ util/status.cc | 59 +++ util/testharness.cc | 65 +++ util/testharness.h | 129 +++++ util/testutil.cc | 51 ++ util/testutil.h | 53 ++ 118 files changed, 19207 insertions(+) create mode 100644 AUTHORS create mode 100644 Android.mk create mode 100644 Application.mk create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README create mode 100644 TODO create mode 100644 db/builder.cc create mode 100644 db/builder.h create mode 100644 db/corruption_test.cc create mode 100644 db/db_bench.cc create mode 100644 db/db_impl.cc create mode 100644 db/db_impl.h create mode 100644 db/db_iter.cc create mode 100644 db/db_iter.h create mode 100644 db/db_test.cc create mode 100644 db/dbformat.cc create mode 100644 db/dbformat.h create mode 100644 db/dbformat_test.cc create mode 100644 db/filename.cc create mode 100644 db/filename.h create mode 100644 db/filename_test.cc create mode 100644 db/log_format.h create mode 100644 db/log_reader.cc create mode 100644 db/log_reader.h create mode 100644 db/log_test.cc create mode 100644 db/log_writer.cc create mode 100644 db/log_writer.h create mode 100644 db/memtable.cc create mode 100644 db/memtable.h create mode 100644 db/repair.cc create mode 100644 db/skiplist.h create mode 100644 db/skiplist_test.cc create mode 100644 db/snapshot.h create mode 100644 db/table_cache.cc create mode 100644 db/table_cache.h create mode 100644 db/version_edit.cc create mode 100644 db/version_edit.h create mode 100644 db/version_edit_test.cc create mode 100644 db/version_set.cc create mode 100644 db/version_set.h create mode 100644 db/write_batch.cc create mode 100644 db/write_batch_internal.h create mode 100644 db/write_batch_test.cc create mode 100644 doc/doc.css create mode 100644 doc/impl.html create mode 100644 doc/index.html create mode 100644 doc/log_format.txt create mode 100644 doc/table_format.txt create mode 100644 include/cache.h create mode 100644 include/comparator.h create mode 100644 include/db.h create mode 100644 include/env.h create mode 100644 include/iterator.h create mode 100644 include/options.h create mode 100644 include/slice.h create mode 100644 include/status.h create mode 100644 include/table.h create mode 100644 include/table_builder.h create mode 100644 include/write_batch.h create mode 100644 leveldb.gyp create mode 100644 port/README create mode 100644 port/port.h create mode 100644 port/port_android.cc create mode 100644 port/port_android.h create mode 100644 port/port_chromium.cc create mode 100644 port/port_chromium.h create mode 100644 port/port_example.h create mode 100644 port/port_posix.cc create mode 100644 port/port_posix.h create mode 100644 port/sha1_portable.cc create mode 100644 port/sha1_portable.h create mode 100644 port/sha1_test.cc create mode 100644 table/block.cc create mode 100644 table/block.h create mode 100644 table/block_builder.cc create mode 100644 table/block_builder.h create mode 100644 table/format.cc create mode 100644 table/format.h create mode 100644 table/iterator.cc create mode 100644 table/iterator_wrapper.h create mode 100644 table/merger.cc create mode 100644 table/merger.h create mode 100644 table/table.cc create mode 100644 table/table_builder.cc create mode 100644 table/table_test.cc create mode 100644 table/two_level_iterator.cc create mode 100644 table/two_level_iterator.h create mode 100644 util/arena.cc create mode 100644 util/arena.h create mode 100644 util/arena_test.cc create mode 100644 util/cache.cc create mode 
100644 util/cache_test.cc create mode 100644 util/coding.cc create mode 100644 util/coding.h create mode 100644 util/coding_test.cc create mode 100644 util/comparator.cc create mode 100644 util/crc32c.cc create mode 100644 util/crc32c.h create mode 100644 util/crc32c_test.cc create mode 100644 util/env.cc create mode 100644 util/env_chromium.cc create mode 100644 util/env_posix.cc create mode 100644 util/env_test.cc create mode 100644 util/hash.cc create mode 100644 util/hash.h create mode 100644 util/histogram.cc create mode 100644 util/histogram.h create mode 100644 util/logging.cc create mode 100644 util/logging.h create mode 100644 util/mutexlock.h create mode 100644 util/options.cc create mode 100644 util/random.h create mode 100644 util/status.cc create mode 100644 util/testharness.cc create mode 100644 util/testharness.h create mode 100644 util/testutil.cc create mode 100644 util/testutil.h diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..27a9407 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,8 @@ +# Names should be added to this file like so: +# Name or Organization + +Google Inc. + +# Initial version authors: +Jeffrey Dean +Sanjay Ghemawat diff --git a/Android.mk b/Android.mk new file mode 100644 index 0000000..fa4a3de --- /dev/null +++ b/Android.mk @@ -0,0 +1,64 @@ +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. + +# INSTRUCTIONS +# After you've downloaded and installed the Android NDK from: +# http://developer.android.com/sdk/ndk/index.html +# 1. In the same directory as this file, Android.mk, type: +# $ ln -s leveldb ../jni +# (The Android NDK will only build native projects in +# subdirectories named "jni".) +# 2. $ cd .. +# 3. Execute ndk-build: +# $ $(ANDROID_NDK_DIR)/ndk-build + +LOCAL_PATH := $(call my-dir) + +include $(CLEAR_VARS) +LOCAL_MODULE := leveldb +# Build flags: +# - LEVELDB_PLATFORM_ANDROID to use the correct port header: port_android.h +LOCAL_CFLAGS := -DLEVELDB_PLATFORM_ANDROID -std=gnu++0x +LOCAL_C_INCLUDES := $(LOCAL_PATH)/../../ +LOCAL_CPP_EXTENSION := .cc + +LOCAL_SRC_FILES := ./db/builder.cc \ +./db/db_bench.cc \ +./db/db_impl.cc \ +./db/db_iter.cc \ +./db/filename.cc \ +./db/dbformat.cc \ +./db/log_reader.cc \ +./db/log_writer.cc \ +./db/memtable.cc \ +./db/repair.cc \ +./db/table_cache.cc \ +./db/version_edit.cc \ +./db/version_set.cc \ +./db/write_batch.cc \ +./port/port_android.cc \ +./table/block.cc \ +./table/block_builder.cc \ +./table/format.cc \ +./table/iterator.cc \ +./table/merger.cc \ +./table/table.cc \ +./table/table_builder.cc \ +./table/two_level_iterator.cc \ +./util/arena.cc \ +./util/cache.cc \ +./util/coding.cc \ +./util/comparator.cc \ +./util/crc32c.cc \ +./util/env.cc \ +./util/env_posix.cc \ +./util/hash.cc \ +./util/histogram.cc \ +./util/logging.cc \ +./util/options.cc \ +./util/status.cc \ +./util/testharness.cc \ +./util/testutil.cc + +include $(BUILD_SHARED_LIBRARY) diff --git a/Application.mk b/Application.mk new file mode 100644 index 0000000..9360a38 --- /dev/null +++ b/Application.mk @@ -0,0 +1,6 @@ +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. 
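# A worked example of the steps documented in Android.mk above, run from
# the directory that contains Android.mk (paths are illustrative):
#
#   $ ln -s leveldb ../jni
#   $ cd ..
#   $ $(ANDROID_NDK_DIR)/ndk-build
#
# With the APP_ABI setting below, ndk-build should place libleveldb.so
# under ../libs/armeabi-v7a/ (standard NDK output layout; an assumption,
# not stated in this checkin).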
+ +APP_ABI := armeabi-v7a +APP_STL := gnustl_static diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8e80208 --- /dev/null +++ b/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011 The LevelDB Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a60b4de --- /dev/null +++ b/Makefile @@ -0,0 +1,134 @@ +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. + +CC = g++ + +# Uncomment one of the following to switch between debug and opt mode +#OPT = -O2 -DNDEBUG +OPT = -g2 + +CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. 
-std=c++0x $(OPT) + +LDFLAGS=-lpthread + +LIBOBJECTS = \ + ./db/builder.o \ + ./db/db_impl.o \ + ./db/db_iter.o \ + ./db/filename.o \ + ./db/dbformat.o \ + ./db/log_reader.o \ + ./db/log_writer.o \ + ./db/memtable.o \ + ./db/repair.o \ + ./db/table_cache.o \ + ./db/version_edit.o \ + ./db/version_set.o \ + ./db/write_batch.o \ + ./port/port_posix.o \ + ./port/sha1_portable.o \ + ./table/block.o \ + ./table/block_builder.o \ + ./table/format.o \ + ./table/iterator.o \ + ./table/merger.o \ + ./table/table.o \ + ./table/table_builder.o \ + ./table/two_level_iterator.o \ + ./util/arena.o \ + ./util/cache.o \ + ./util/coding.o \ + ./util/comparator.o \ + ./util/crc32c.o \ + ./util/env.o \ + ./util/env_posix.o \ + ./util/hash.o \ + ./util/histogram.o \ + ./util/logging.o \ + ./util/options.o \ + ./util/status.o + +TESTUTIL = ./util/testutil.o +TESTHARNESS = ./util/testharness.o $(TESTUTIL) + +TESTS = \ + arena_test \ + cache_test \ + coding_test \ + corruption_test \ + crc32c_test \ + db_test \ + dbformat_test \ + env_test \ + filename_test \ + log_test \ + sha1_test \ + skiplist_test \ + table_test \ + version_edit_test \ + write_batch_test + +PROGRAMS = db_bench $(TESTS) + +all: $(PROGRAMS) + +check: $(TESTS) + for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done + +clean: + rm -f $(PROGRAMS) */*.o + +db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + +arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +sha1_test: port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +.cc.o: + $(CC) $(CFLAGS) $< -o $@ + +# TODO(gabor): 
dependencies for .o files +# TODO(gabor): Build library diff --git a/README b/README new file mode 100644 index 0000000..c97e43c --- /dev/null +++ b/README @@ -0,0 +1,51 @@ +leveldb: A key-value store +Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) + +The code under this directory implements a system for maintaining a +persistent key/value store. + +See doc/index.html for more explanation. +See doc/db_layout.txt for a brief overview of the implementation. + +The public interface is in include/*.h. Callers should not include or +rely on the details of any other header files in this package. Those +internal APIs may be changed without warning. + +Guide to header files: + +include/db.h + Main interface to the DB: Start here + +include/options.h + Control over the behavior of an entire database, and also + control over the behavior of individual reads and writes. + +include/comparator.h + Abstraction for user-specified comparison function. If you want + just bytewise comparison of keys, you can use the default comparator, + but clients can write their own comparator implementations if they + want custom ordering (e.g. to handle different character + encodings, etc.) + +include/iterator.h + Interface for iterating over data. You can get an iterator + from a DB object. + +include/write_batch.h + Interface for atomically applying multiple updates to a database. + +include/slice.h + A simple module for maintaining a pointer and a length into some + other byte array. + +include/status.h + Status is returned from many of the public interfaces and is used + to report success and various kinds of errors. + +include/env.h + Abstraction of the OS environment. A posix implementation of + this interface is in util/env_posix.cc + +include/table.h +include/table_builder.h + Lower-level modules that most clients probably won't use directly diff --git a/TODO b/TODO new file mode 100644 index 0000000..7d60b5a --- /dev/null +++ b/TODO @@ -0,0 +1,23 @@ +Before adding to chrome +----------------------- +- multi-threaded test/benchmark +- Allow missing crc32c in Table format? + +Maybe afterwards +---------------- + +ss +- Stats +- Speed up backwards scan (avoid three passes over data) + +db +- Maybe implement DB::BulkDeleteForRange(start_key, end_key) + that would blow away files whose ranges are entirely contained + within [start_key..end_key]? For Chrome, deletion of obsolete + object stores, etc. can be done in the background anyway, so + probably not that important. + +api changes? +- Efficient large value reading and writing + +Faster Get implementation diff --git a/db/builder.cc b/db/builder.cc new file mode 100644 index 0000000..f3d0fe2 --- /dev/null +++ b/db/builder.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
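// The README's header guide above is easier to absorb next to a concrete
// client. A minimal sketch (illustrative only, not part of this checkin),
// assembled from the calls the tests in this patch exercise:
//
//   #include <string>
//   #include "include/db.h"
//   #include "include/write_batch.h"
//
//   leveldb::Options options;
//   options.create_if_missing = true;
//   leveldb::DB* db = NULL;
//   leveldb::Status s = leveldb::DB::Open(options, "/tmp/exampledb", &db);
//
//   // Single write.
//   if (s.ok()) s = db->Put(leveldb::WriteOptions(), "key1", "value1");
//
//   // Atomic multi-update via WriteBatch.
//   leveldb::WriteBatch batch;
//   batch.Delete("key1");
//   batch.Put("key2", "value2");
//   if (s.ok()) s = db->Write(leveldb::WriteOptions(), &batch);
//
//   // Point lookup, then a full scan.
//   std::string value;
//   if (s.ok()) s = db->Get(leveldb::ReadOptions(), "key2", &value);
//   leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
//     // it->key() and it->value() are Slices into internal storage.
//   }
//   delete it;
//   delete db;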
+ +#include "db/builder.h" + +#include "db/filename.h" +#include "db/dbformat.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "include/db.h" +#include "include/env.h" +#include "include/iterator.h" + +namespace leveldb { + +Status BuildTable(const std::string& dbname, + Env* env, + const Options& options, + TableCache* table_cache, + Iterator* iter, + FileMetaData* meta, + VersionEdit* edit) { + Status s; + meta->file_size = 0; + iter->SeekToFirst(); + + std::string fname = TableFileName(dbname, meta->number); + if (iter->Valid()) { + WritableFile* file; + s = env->NewWritableFile(fname, &file); + if (!s.ok()) { + return s; + } + + TableBuilder* builder = new TableBuilder(options, file); + meta->smallest.DecodeFrom(iter->key()); + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + meta->largest.DecodeFrom(key); + if (ExtractValueType(key) == kTypeLargeValueRef) { + if (iter->value().size() != LargeValueRef::ByteSize()) { + s = Status::Corruption("invalid indirect reference hash value (L0)"); + break; + } + edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()), + meta->number, + iter->key()); + } + builder->Add(key, iter->value()); + } + + // Finish and check for builder errors + if (s.ok()) { + s = builder->Finish(); + if (s.ok()) { + meta->file_size = builder->FileSize(); + assert(meta->file_size > 0); + } + } else { + builder->Abandon(); + } + delete builder; + + // Finish and check for file errors + if (s.ok()) { + s = file->Sync(); + } + if (s.ok()) { + s = file->Close(); + } + delete file; + file = NULL; + + if (s.ok()) { + // Verify that the table is usable + Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number); + s = it->status(); + delete it; + } + } + + // Check for input iterator errors + if (!iter->status().ok()) { + s = iter->status(); + } + + if (s.ok() && meta->file_size > 0) { + edit->AddFile(0, meta->number, meta->file_size, + meta->smallest, meta->largest); + } else { + env->DeleteFile(fname); + } + return s; +} + +} diff --git a/db/builder.h b/db/builder.h new file mode 100644 index 0000000..2d8afdf --- /dev/null +++ b/db/builder.h @@ -0,0 +1,36 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_BUILDER_H_ +#define STORAGE_LEVELDB_DB_BUILDER_H_ + +#include "include/status.h" + +namespace leveldb { + +struct Options; +struct FileMetaData; + +class Env; +class Iterator; +class TableCache; +class VersionEdit; + +// Build a Table file from the contents of *iter. The generated file +// will be named according to meta->number. On success, the rest of +// *meta will be filled with metadata about the generated table, and +// large value refs and the added file information will be added to +// *edit. If no data is present in *iter, meta->file_size will be set +// to zero, and no Table file will be produced. +extern Status BuildTable(const std::string& dbname, + Env* env, + const Options& options, + TableCache* table_cache, + Iterator* iter, + FileMetaData* meta, + VersionEdit* edit); + +} + +#endif // STORAGE_LEVELDB_DB_BUILDER_H_ diff --git a/db/corruption_test.cc b/db/corruption_test.cc new file mode 100644 index 0000000..a59ab0e --- /dev/null +++ b/db/corruption_test.cc @@ -0,0 +1,366 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "include/db.h" + +#include +#include +#include +#include +#include "include/env.h" +#include "include/table.h" +#include "include/write_batch.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +static const int kValueSize = 1000; + +class CorruptionTest { + public: + test::ErrorEnv env_; + Random rnd_; + std::string dbname_; + Options options_; + DB* db_; + + CorruptionTest() : rnd_(test::RandomSeed()) { + options_.env = &env_; + dbname_ = test::TmpDir() + "/db_test"; + DestroyDB(dbname_, options_); + + db_ = NULL; + options_.create_if_missing = true; + Reopen(); + options_.create_if_missing = false; + } + + ~CorruptionTest() { + delete db_; + DestroyDB(dbname_, Options()); + } + + Status TryReopen(Options* options = NULL) { + delete db_; + db_ = NULL; + Options opt = (options ? *options : options_); + opt.env = &env_; + return DB::Open(opt, dbname_, &db_); + } + + void Reopen(Options* options = NULL) { + ASSERT_OK(TryReopen(options)); + } + + void RepairDB() { + delete db_; + db_ = NULL; + ASSERT_OK(::leveldb::RepairDB(dbname_, options_)); + } + + void Build(int n) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = 0; i < n; i++) { + //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void Check(int min_expected, int max_expected) { + int next_expected = 0; + int missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + uint64_t key; + Slice in(iter->key()); + if (!ConsumeDecimalNumber(&in, &key) || + !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(key, &value_space)) { + bad_values++; + } else { + correct++; + } + } + delete iter; + + fprintf(stderr, + "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", + min_expected, max_expected, correct, bad_keys, bad_values, missed); + ASSERT_LE(min_expected, correct); + ASSERT_GE(max_expected, correct); + } + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + uint64_t number; + LargeValueRef large_ref; + FileType type; + std::vector candidates; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type) && + type == filetype) { + candidates.push_back(dbname_ + "/" + filenames[i]); + } + } + ASSERT_TRUE(!candidates.empty()) << filetype; + std::string fname = candidates[rnd_.Uniform(candidates.size())]; + + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + const char* msg = strerror(errno); + ASSERT_TRUE(false) << fname << ": " << msg; + } + + if (offset < 0) { + // Relative to end of file; make it absolute + if (-offset > sbuf.st_size) { + offset = 0; + } else { + offset = sbuf.st_size + offset; + } + } + if (offset > sbuf.st_size) { + offset = sbuf.st_size; + } + if (offset + bytes_to_corrupt > 
sbuf.st_size) { + bytes_to_corrupt = sbuf.st_size - offset; + } + + // Do it + std::string contents; + Status s = ReadFileToString(Env::Default(), fname, &contents); + ASSERT_TRUE(s.ok()) << s.ToString(); + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(Env::Default(), contents, fname); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + uint64_t Property(const std::string& name) { + uint64_t result; + if (!db_->GetProperty(name, &result)) { + result = ~static_cast(0); + } + return result; + } + + // Return the ith key + Slice Key(int i, std::string* storage) { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } +}; + +TEST(CorruptionTest, Recovery) { + Build(10); + Check(10, 10); + Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record + Corrupt(kLogFile, 2*kValueSize, 1); // Somewhere in second log record? + Reopen(); + Check(8, 8); +} + +TEST(CorruptionTest, RecoverWriteError) { + env_.writable_file_error_ = true; + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); +} + +TEST(CorruptionTest, NewFileErrorDuringWrite) { + // Do enough writing to force minor compaction + env_.writable_file_error_ = true; + const int num = 3 + (Options().write_buffer_size / kValueSize); + std::string value_storage; + Status s; + for (int i = 0; s.ok() && i < num; i++) { + WriteBatch batch; + batch.Put("a", Value(100, &value_storage)); + s = db_->Write(WriteOptions(), &batch); + } + ASSERT_TRUE(!s.ok()); + ASSERT_GE(env_.num_writable_file_errors_, 1); + env_.writable_file_error_ = false; + Reopen(); +} + +TEST(CorruptionTest, TableFile) { + Build(100); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + dbi->TEST_CompactRange(1, "", "~"); + + Corrupt(kTableFile, 100, 1); + Check(99, 99); +} + +TEST(CorruptionTest, TableFileIndexData) { + Build(10000); // Enough to build multiple Tables + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + dbi->TEST_CompactRange(1, "", "~"); + + Corrupt(kTableFile, -1000, 500); + Reopen(); + Check(5000, 9999); +} + +TEST(CorruptionTest, MissingDescriptor) { + Build(1000); + RepairDB(); + Reopen(); + Check(1000, 1000); +} + +TEST(CorruptionTest, SequenceNumberRecovery) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v5", v); + // Write something. If sequence number was not recovered properly, + // it will be hidden by an earlier write. 
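// (Illustrative numbers, not from the test: suppose "v5" above was stamped
// with sequence 5. If recovery had reset the last sequence to 0, the Put
// below would receive sequence 1, and Get would keep returning the
// higher-sequenced "v5".)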
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); +} + +TEST(CorruptionTest, LargeValueRecovery) { + Options options; + options.large_value_threshold = 10000; + Reopen(&options); + + Random rnd(301); + std::string big; + ASSERT_OK(db_->Put(WriteOptions(), + "foo", test::RandomString(&rnd, 100000, &big))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ(big, v); + + RepairDB(); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ(big, v); + + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ(big, v); +} + +TEST(CorruptionTest, CorruptedDescriptor) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + + Corrupt(kDescriptorFile, 0, 1000); + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); + + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("hello", v); +} + +TEST(CorruptionTest, CompactionInputError) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Force compactions by writing lots of values + Build(10000); + Check(10000, 10000); + dbi->TEST_CompactRange(0, "", "~"); + ASSERT_EQ(0, Property("leveldb.num-files-at-level0")); +} + +TEST(CorruptionTest, CompactionInputErrorParanoid) { + Options options; + options.paranoid_checks = true; + Reopen(&options); + + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Write must eventually fail because of corrupted table + Status s; + std::string tmp1, tmp2; + for (int i = 0; i < 10000 && s.ok(); i++) { + s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); + } + ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; +} + +TEST(CorruptionTest, UnrelatedKeys) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + Corrupt(kTableFile, 100, 1); + + std::string tmp1, tmp2; + ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); + dbi->TEST_CompactMemTable(); + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/db_bench.cc b/db/db_bench.cc new file mode 100644 index 0000000..4ccdd5a --- /dev/null +++ b/db/db_bench.cc @@ -0,0 +1,376 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
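// Example invocation (illustrative; the flags are parsed in main() below,
// and the database always lives under /tmp/dbbench):
//
//   ./db_bench --num=100000 --value_size=100 --histogram=1 \
//       --benchmarks=writeseq,readrandom,compact,readseq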
+ +#include <sys/types.h> +#include <stdio.h> +#include <stdlib.h> +#include "db/db_impl.h" +#include "db/version_set.h" +#include "include/cache.h" +#include "include/db.h" +#include "include/env.h" +#include "include/write_batch.h" +#include "util/histogram.h" +#include "util/random.h" +#include "util/testutil.h" + +// Comma-separated list of operations to run in the specified order +// Actual benchmarks: +// writeseq -- write N values in sequential key order +// writerandom -- write N values in random key order +// writebig -- write N/1000 100K values in random order +// readseq -- read N values sequentially +// readrandom -- read N values in random order +// Meta operations: +// compact -- Compact the entire DB +// heapprofile -- Dump a heap profile (if supported by this port) +// sync -- switch to synchronous writes (not the default) +// nosync -- switch to asynchronous writes (the default) +// tenth -- divide N by 10 (i.e., following benchmarks are smaller) +// normal -- reset N back to its normal value (1000000) +static const char* FLAGS_benchmarks = + "writeseq," + "writeseq," + "writerandom," + "sync,tenth,tenth,writerandom,nosync,normal," + "readseq," + "readrandom," + "compact," + "readseq," + "readrandom," + "writebig"; + +// Number of key/values to place in database +static int FLAGS_num = 1000000; + +// Size of each value +static int FLAGS_value_size = 100; + +// Arrange to generate values that shrink to this fraction of +// their original size after compression +static double FLAGS_compression_ratio = 0.25; + +// Print histogram of operation timings +static bool FLAGS_histogram = false; + +// Number of bytes to buffer in memtable before compacting +static int FLAGS_write_buffer_size = 1 << 20; + +namespace leveldb { + +// Helper for quickly generating random data. +namespace { +class RandomGenerator { + private: + std::string data_; + int pos_; + + public: + RandomGenerator() { + // We use a limited amount of data over and over again and ensure + // that it is larger than the compression window (32KB), and also + // large enough to serve all typical value sizes we want to write. + Random rnd(301); + std::string piece; + while (data_.size() < 1048576) { + // Add a short fragment that is as compressible as specified + // by FLAGS_compression_ratio. 
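// (Arithmetic check: with the default FLAGS_compression_ratio of 0.25,
// each 100-byte fragment produced below is built to compress to roughly
// 25 bytes.)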
+ test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); + data_.append(piece); + } + pos_ = 0; + } + + Slice Generate(int len) { + if (pos_ + len > data_.size()) { + pos_ = 0; + assert(len < data_.size()); + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } +}; +} + +class Benchmark { + private: + Cache* cache_; + DB* db_; + int num_; + bool sync_; + int heap_counter_; + double start_; + double last_op_finish_; + int64_t bytes_; + std::string message_; + Histogram hist_; + RandomGenerator gen_; + Random rand_; + + // State kept for progress messages + int done_; + int next_report_; // When to report next + + void Start() { + start_ = Env::Default()->NowMicros() * 1e-6; + bytes_ = 0; + message_.clear(); + last_op_finish_ = start_; + hist_.Clear(); + done_ = 0; + next_report_ = 100; + } + + void FinishedSingleOp() { + if (FLAGS_histogram) { + double now = Env::Default()->NowMicros() * 1e-6; + double micros = (now - last_op_finish_) * 1e6; + hist_.Add(micros); + if (micros > 20000) { + fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); + fflush(stderr); + } + last_op_finish_ = now; + } + + done_++; + if (done_ >= next_report_) { + if (next_report_ < 1000) { + next_report_ += 100; + } else if (next_report_ < 10000) { + next_report_ += 1000; + } else if (next_report_ < 100000) { + next_report_ += 10000; + } else { + next_report_ += 100000; + } + fprintf(stderr, "... finished %d ops%30s\r", done_, ""); + fflush(stderr); + } + } + + void Stop(const Slice& name) { + double finish = Env::Default()->NowMicros() * 1e-6; + + // Pretend at least one op was done in case we are running a benchmark + // that does not call FinishedSingleOp(). + if (done_ < 1) done_ = 1; + + if (bytes_ > 0) { + char rate[100]; + snprintf(rate, sizeof(rate), "%5.1f MB/s", + (bytes_ / 1048576.0) / (finish - start_)); + if (!message_.empty()) { + message_.push_back(' '); + } + message_.append(rate); + } + + fprintf(stdout, "%-12s : %10.3f micros/op;%s%s\n", + name.ToString().c_str(), + (finish - start_) * 1e6 / done_, + (message_.empty() ? 
"" : " "), + message_.c_str()); + if (FLAGS_histogram) { + fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); + } + fflush(stdout); + } + + public: + enum Order { SEQUENTIAL, RANDOM }; + + Benchmark() : cache_(NewLRUCache(200<<20)), + db_(NULL), + num_(FLAGS_num), + sync_(false), + heap_counter_(0), + bytes_(0), + rand_(301) { + std::vector files; + Env::Default()->GetChildren("/tmp/dbbench", &files); + for (int i = 0; i < files.size(); i++) { + if (Slice(files[i]).starts_with("heap-")) { + Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]); + } + } + DestroyDB("/tmp/dbbench", Options()); + } + + ~Benchmark() { + delete db_; + delete cache_; + } + + void Run() { + Options options; + options.create_if_missing = true; + options.max_open_files = 10000; + options.block_cache = cache_; + options.write_buffer_size = FLAGS_write_buffer_size; + + Start(); + Status s = DB::Open(options, "/tmp/dbbench", &db_); + Stop("open"); + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + + const char* benchmarks = FLAGS_benchmarks; + while (benchmarks != NULL) { + const char* sep = strchr(benchmarks, ','); + Slice name; + if (sep == NULL) { + name = benchmarks; + benchmarks = NULL; + } else { + name = Slice(benchmarks, sep - benchmarks); + benchmarks = sep + 1; + } + + Start(); + if (name == Slice("writeseq")) { + Write(SEQUENTIAL, num_, FLAGS_value_size); + } else if (name == Slice("writerandom")) { + Write(RANDOM, num_, FLAGS_value_size); + } else if (name == Slice("writebig")) { + Write(RANDOM, num_ / 1000, 100 * 1000); + } else if (name == Slice("readseq")) { + Read(SEQUENTIAL); + } else if (name == Slice("readrandom")) { + Read(RANDOM); + } else if (name == Slice("compact")) { + Compact(); + } else if (name == Slice("heapprofile")) { + HeapProfile(); + } else if (name == Slice("sync")) { + sync_ = true; + } else if (name == Slice("nosync")) { + sync_ = false; + } else if (name == Slice("tenth")) { + num_ = num_ / 10; + } else if (name == Slice("normal")) { + num_ = FLAGS_num; + } else { + fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); + } + Stop(name); + } + } + + void Write(Order order, int num_entries, int value_size) { + WriteBatch batch; + Status s; + std::string val; + WriteOptions options; + options.sync = sync_; + for (int i = 0; i < num_entries; i++) { + const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num); + char key[100]; + snprintf(key, sizeof(key), "%012d", k); + batch.Clear(); + batch.Put(key, gen_.Generate(value_size)); + s = db_->Write(options, &batch); + bytes_ += value_size + strlen(key); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + FinishedSingleOp(); + } + } + + void Read(Order order) { + ReadOptions options; + if (order == SEQUENTIAL) { + Iterator* iter = db_->NewIterator(options); + int i = 0; + for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } else { + std::string value; + for (int i = 0; i < num_; i++) { + char key[100]; + const int k = (order == SEQUENTIAL) ? 
i : (rand_.Next() % FLAGS_num); + snprintf(key, sizeof(key), "%012d", k); + db_->Get(options, key, &value); + FinishedSingleOp(); + } + } + } + + void Compact() { + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + int max_level_with_files = 1; + for (int level = 1; level < config::kNumLevels; level++) { + uint64_t v; + char name[100]; + snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); + if (db_->GetProperty(name, &v) && v > 0) { + max_level_with_files = level; + } + } + for (int level = 0; level < max_level_with_files; level++) { + dbi->TEST_CompactRange(level, "", "~"); + } + } + + static void WriteToFile(void* arg, const char* buf, int n) { + reinterpret_cast(arg)->Append(Slice(buf, n)); + } + + void HeapProfile() { + char fname[100]; + snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_); + WritableFile* file; + Status s = Env::Default()->NewWritableFile(fname, &file); + if (!s.ok()) { + message_ = s.ToString(); + return; + } + bool ok = port::GetHeapProfile(WriteToFile, file); + delete file; + if (!ok) { + message_ = "not supported"; + Env::Default()->DeleteFile(fname); + } + } +}; + +} + +int main(int argc, char** argv) { + for (int i = 1; i < argc; i++) { + double d; + int n; + char junk; + if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { + FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); + } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { + FLAGS_compression_ratio = d; + } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_histogram = n; + } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { + FLAGS_num = n; + } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { + FLAGS_value_size = n; + } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { + FLAGS_write_buffer_size = n; + } else { + fprintf(stderr, "Invalid flag '%s'\n", argv[i]); + exit(1); + } + } + + leveldb::Benchmark benchmark; + benchmark.Run(); + return 0; +} diff --git a/db/db_impl.cc b/db/db_impl.cc new file mode 100644 index 0000000..5008af0 --- /dev/null +++ b/db/db_impl.cc @@ -0,0 +1,1195 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl.h" + +#include +#include +#include +#include +#include +#include +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "include/db.h" +#include "include/env.h" +#include "include/status.h" +#include "include/table.h" +#include "include/table_builder.h" +#include "port/port.h" +#include "table/block.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/mutexlock.h" + +namespace leveldb { + +struct DBImpl::CompactionState { + Compaction* const compaction; + + // Sequence numbers < smallest_snapshot are not significant since we + // will never have to service a snapshot below smallest_snapshot. + // Therefore if we have seen a sequence number S <= smallest_snapshot, + // we can drop all entries for the same key with sequence numbers < S. 
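// (Worked example with made-up numbers: if smallest_snapshot == 100 and a
// user key has entries at sequences 150, 90, and 80, then 150 stays
// visible to new readers, 90 is what every snapshot <= 100 sees, and 80
// can never be observed by anyone, so compaction is free to drop it.)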
+ SequenceNumber smallest_snapshot; + + // Files produced by compaction + struct Output { + uint64_t number; + uint64_t file_size; + InternalKey smallest, largest; + }; + std::vector outputs; + + // State kept for output being generated + WritableFile* outfile; + TableBuilder* builder; + + uint64_t total_bytes; + + Output* current_output() { return &outputs[outputs.size()-1]; } + + explicit CompactionState(Compaction* c) + : compaction(c), + outfile(NULL), + builder(NULL), + total_bytes(0) { + } +}; + +namespace { +class NullWritableFile : public WritableFile { + public: + virtual Status Append(const Slice& data) { return Status::OK(); } + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } +}; +} + +// Fix user-supplied options to be reasonable +template +static void ClipToRange(T* ptr, V minvalue, V maxvalue) { + if (*ptr > maxvalue) *ptr = maxvalue; + if (*ptr < minvalue) *ptr = minvalue; +} +Options SanitizeOptions(const std::string& dbname, + const InternalKeyComparator* icmp, + const Options& src) { + Options result = src; + result.comparator = icmp; + ClipToRange(&result.max_open_files, 20, 50000); + ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); + ClipToRange(&result.large_value_threshold, 16<<10, 1<<30); + ClipToRange(&result.block_size, 1<<10, 4<<20); + if (result.info_log == NULL) { + // Open a log file in the same directory as the db + src.env->CreateDir(dbname); // In case it does not exist + src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname)); + Status s = src.env->NewWritableFile(InfoLogFileName(dbname), + &result.info_log); + if (!s.ok()) { + // No place suitable for logging + result.info_log = new NullWritableFile; + } + } + return result; +} + +DBImpl::DBImpl(const Options& options, const std::string& dbname) + : env_(options.env), + internal_comparator_(options.comparator), + options_(SanitizeOptions(dbname, &internal_comparator_, options)), + owns_info_log_(options_.info_log != options.info_log), + dbname_(dbname), + db_lock_(NULL), + shutting_down_(NULL), + bg_cv_(&mutex_), + compacting_cv_(&mutex_), + last_sequence_(0), + mem_(new MemTable(internal_comparator_)), + logfile_(NULL), + log_(NULL), + log_number_(0), + bg_compaction_scheduled_(false), + compacting_(false) { + // Reserve ten files or so for other uses and give the rest to TableCache. 
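// (For example, assuming max_open_files keeps its default of 1000 from
// include/options.h, the table cache below is sized for 990 open tables.)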
+ const int table_cache_size = options.max_open_files - 10; + table_cache_ = new TableCache(dbname_, &options_, table_cache_size); + + versions_ = new VersionSet(dbname_, &options_, table_cache_, + &internal_comparator_); +} + +DBImpl::~DBImpl() { + // Wait for background work to finish + mutex_.Lock(); + shutting_down_.Release_Store(this); // Any non-NULL value is ok + if (bg_compaction_scheduled_) { + while (bg_compaction_scheduled_) { + bg_cv_.Wait(); + } + } + mutex_.Unlock(); + + if (db_lock_ != NULL) { + env_->UnlockFile(db_lock_); + } + + delete versions_; + delete mem_; + delete log_; + delete logfile_; + delete table_cache_; + + if (owns_info_log_) { + delete options_.info_log; + } +} + +Status DBImpl::NewDB() { + assert(log_number_ == 0); + assert(last_sequence_ == 0); + + VersionEdit new_db; + new_db.SetComparatorName(user_comparator()->Name()); + new_db.SetLogNumber(log_number_); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + WritableFile* file; + Status s = env_->NewWritableFile(manifest, &file); + if (!s.ok()) { + return s; + } + { + log::Writer log(file); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + if (s.ok()) { + s = file->Close(); + } + } + delete file; + if (s.ok()) { + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(env_, dbname_, 1); + } else { + env_->DeleteFile(manifest); + } + return s; +} + +Status DBImpl::Install(VersionEdit* edit, + uint64_t new_log_number, + MemTable* cleanup_mem) { + mutex_.AssertHeld(); + edit->SetLogNumber(new_log_number); + edit->SetLastSequence(last_sequence_); + return versions_->LogAndApply(edit, cleanup_mem); +} + +void DBImpl::MaybeIgnoreError(Status* s) const { + if (s->ok() || options_.paranoid_checks) { + // No change needed + } else { + Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str()); + *s = Status::OK(); + } +} + +void DBImpl::DeleteObsoleteFiles() { + // Make a set of all of the live files + std::set live = pending_outputs_; + versions_->AddLiveFiles(&live); + + versions_->CleanupLargeValueRefs(live, log_number_); + + std::vector filenames; + env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose + uint64_t number; + LargeValueRef large_ref; + FileType type; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type)) { + bool keep = true; + switch (type) { + case kLogFile: + keep = (number == log_number_); + break; + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (in case there is a race that allows other incarnations) + keep = (number >= versions_->ManifestFileNumber()); + break; + case kTableFile: + keep = (live.find(number) != live.end()); + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live" + keep = (live.find(number) != live.end()); + break; + case kLargeValueFile: + keep = versions_->LargeValueIsLive(large_ref); + break; + case kCurrentFile: + case kDBLockFile: + case kInfoLogFile: + keep = true; + break; + } + + if (!keep) { + if (type == kTableFile) { + table_cache_->Evict(number); + } + Log(env_, options_.info_log, "Delete type=%d #%lld\n", + int(type), + static_cast(number)); + env_->DeleteFile(dbname_ + "/" + filenames[i]); + } + } + } +} + +Status DBImpl::Recover(VersionEdit* edit) { + mutex_.AssertHeld(); + + // Ignore error from CreateDir since the 
creation of the DB is + // committed only when the descriptor is created, and this directory + // may already exist from a previous failed creation attempt. + env_->CreateDir(dbname_); + assert(db_lock_ == NULL); + Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!s.ok()) { + return s; + } + + if (!env_->FileExists(CurrentFileName(dbname_))) { + if (options_.create_if_missing) { + s = NewDB(); + if (!s.ok()) { + return s; + } + } else { + return Status::InvalidArgument( + dbname_, "does not exist (create_if_missing is false)"); + } + } else { + if (options_.error_if_exists) { + return Status::InvalidArgument( + dbname_, "exists (error_if_exists is true)"); + } + } + + s = versions_->Recover(&log_number_, &last_sequence_); + if (s.ok()) { + // Recover from the log file named in the descriptor + SequenceNumber max_sequence(0); + if (log_number_ != 0) { // log_number_ == 0 indicates initial empty state + s = RecoverLogFile(log_number_, edit, &max_sequence); + } + if (s.ok()) { + last_sequence_ = + last_sequence_ > max_sequence ? last_sequence_ : max_sequence; + } + } + + return s; +} + +Status DBImpl::RecoverLogFile(uint64_t log_number, + VersionEdit* edit, + SequenceNumber* max_sequence) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + WritableFile* info_log; + const char* fname; + Status* status; // NULL if options_.paranoid_checks==false + virtual void Corruption(size_t bytes, const Status& s) { + Log(env, info_log, "%s%s: dropping %d bytes; %s", + (this->status == NULL ? "(ignoring error) " : ""), + fname, static_cast<int>(bytes), s.ToString().c_str()); + if (this->status != NULL && this->status->ok()) *this->status = s; + } + }; + + mutex_.AssertHeld(); + + // Open the log file + std::string fname = LogFileName(dbname_, log_number); + SequentialFile* file; + Status status = env_->NewSequentialFile(fname, &file); + if (!status.ok()) { + MaybeIgnoreError(&status); + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log; + reporter.fname = fname.c_str(); + reporter.status = (options_.paranoid_checks ? &status : NULL); + // We intentionally make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). + log::Reader reader(file, &reporter, true/*checksum*/); + Log(env_, options_.info_log, "Recovering log #%llu", + (unsigned long long) log_number); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + MemTable* mem = NULL; + while (reader.ReadRecord(&record, &scratch) && + status.ok()) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + + if (mem == NULL) { + mem = new MemTable(internal_comparator_); + } + status = WriteBatchInternal::InsertInto(&batch, mem); + MaybeIgnoreError(&status); + if (!status.ok()) { + break; + } + const SequenceNumber last_seq = + WriteBatchInternal::Sequence(&batch) + + WriteBatchInternal::Count(&batch) - 1; + if (last_seq > *max_sequence) { + *max_sequence = last_seq; + } + + if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { + status = WriteLevel0Table(mem, edit); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. 
+ break; + } + delete mem; + mem = NULL; + } + } + + if (status.ok() && mem != NULL) { + status = WriteLevel0Table(mem, edit); + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + } + + delete mem; + delete file; + return status; +} + +Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { + mutex_.AssertHeld(); + FileMetaData meta; + meta.number = versions_->NewFileNumber(); + pending_outputs_.insert(meta.number); + Iterator* iter = mem->NewIterator(); + Log(env_, options_.info_log, "Level-0 table #%llu: started", + (unsigned long long) meta.number); + Status s = BuildTable(dbname_, env_, options_, table_cache_, + iter, &meta, edit); + Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s", + (unsigned long long) meta.number, + (unsigned long long) meta.file_size, + s.ToString().c_str()); + delete iter; + pending_outputs_.erase(meta.number); + return s; +} + +Status DBImpl::CompactMemTable() { + mutex_.AssertHeld(); + + WritableFile* lfile = NULL; + uint64_t new_log_number = versions_->NewFileNumber(); + + VersionEdit edit; + + // Save the contents of the memtable as a new Table + Status s = WriteLevel0Table(mem_, &edit); + if (s.ok()) { + s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); + } + + // Save a new descriptor with the new table and log number. + if (s.ok()) { + s = Install(&edit, new_log_number, mem_); + } + + if (s.ok()) { + // Commit to the new state + mem_ = new MemTable(internal_comparator_); + delete log_; + delete logfile_; + logfile_ = lfile; + log_ = new log::Writer(lfile); + log_number_ = new_log_number; + DeleteObsoleteFiles(); + MaybeScheduleCompaction(); + } else { + delete lfile; + env_->DeleteFile(LogFileName(dbname_, new_log_number)); + } + return s; +} + +void DBImpl::TEST_CompactRange( + int level, + const std::string& begin, + const std::string& end) { + MutexLock l(&mutex_); + while (compacting_) { + compacting_cv_.Wait(); + } + Compaction* c = versions_->CompactRange( + level, + InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek), + InternalKey(end, 0, static_cast(0))); + + if (c != NULL) { + CompactionState* compact = new CompactionState(c); + DoCompactionWork(compact); // Ignore error in test compaction + CleanupCompaction(compact); + } + + // Start any background compaction that may have been delayed by this thread + MaybeScheduleCompaction(); +} + +Status DBImpl::TEST_CompactMemTable() { + MutexLock l(&mutex_); + return CompactMemTable(); +} + +void DBImpl::MaybeScheduleCompaction() { + mutex_.AssertHeld(); + if (bg_compaction_scheduled_) { + // Already scheduled + } else if (compacting_) { + // Some other thread is running a compaction. Do not conflict with it. + } else if (shutting_down_.Acquire_Load()) { + // DB is being deleted; no more background compactions + } else if (!versions_->NeedsCompaction()) { + // No work to be done + } else { + bg_compaction_scheduled_ = true; + env_->Schedule(&DBImpl::BGWork, this); + } +} + +void DBImpl::BGWork(void* db) { + reinterpret_cast(db)->BackgroundCall(); +} + +void DBImpl::BackgroundCall() { + MutexLock l(&mutex_); + assert(bg_compaction_scheduled_); + if (!shutting_down_.Acquire_Load() && + !compacting_) { + BackgroundCompaction(); + } + bg_compaction_scheduled_ = false; + bg_cv_.SignalAll(); + + // Previous compaction may have produced too many files in a level, + // so reschedule another compaction if needed. 
+ MaybeScheduleCompaction(); +} + +void DBImpl::BackgroundCompaction() { + mutex_.AssertHeld(); + Compaction* c = versions_->PickCompaction(); + if (c == NULL) { + // Nothing to do + return; + } + + Status status; + if (c->num_input_files(0) == 1 && c->num_input_files(1) == 0) { + // Move file to next level + FileMetaData* f = c->input(0, 0); + c->edit()->DeleteFile(c->level(), f->number); + c->edit()->AddFile(c->level() + 1, f->number, f->file_size, + f->smallest, f->largest); + status = Install(c->edit(), log_number_, NULL); + Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n", + static_cast(f->number), + c->level() + 1, + static_cast(f->file_size), + status.ToString().c_str()); + } else { + CompactionState* compact = new CompactionState(c); + status = DoCompactionWork(compact); + CleanupCompaction(compact); + } + delete c; + + if (status.ok()) { + // Done + } else if (shutting_down_.Acquire_Load()) { + // Ignore compaction errors found during shutting down + } else { + Log(env_, options_.info_log, + "Compaction error: %s", status.ToString().c_str()); + if (options_.paranoid_checks && bg_error_.ok()) { + bg_error_ = status; + } + } +} + +void DBImpl::CleanupCompaction(CompactionState* compact) { + mutex_.AssertHeld(); + if (compact->builder != NULL) { + // May happen if we get a shutdown call in the middle of compaction + compact->builder->Abandon(); + delete compact->builder; + } else { + assert(compact->outfile == NULL); + } + delete compact->outfile; + for (int i = 0; i < compact->outputs.size(); i++) { + const CompactionState::Output& out = compact->outputs[i]; + pending_outputs_.erase(out.number); + } + delete compact; +} + +Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { + assert(compact != NULL); + assert(compact->builder == NULL); + uint64_t file_number; + { + mutex_.Lock(); + file_number = versions_->NewFileNumber(); + pending_outputs_.insert(file_number); + CompactionState::Output out; + out.number = file_number; + out.smallest.Clear(); + out.largest.Clear(); + compact->outputs.push_back(out); + mutex_.Unlock(); + } + + // Make the output file + std::string fname = TableFileName(dbname_, file_number); + Status s = env_->NewWritableFile(fname, &compact->outfile); + if (s.ok()) { + compact->builder = new TableBuilder(options_, compact->outfile); + } + return s; +} + +Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, + Iterator* input) { + assert(compact != NULL); + assert(compact->outfile != NULL); + assert(compact->builder != NULL); + + const uint64_t output_number = compact->current_output()->number; + assert(output_number != 0); + + // Check for iterator errors + Status s = input->status(); + const uint64_t current_entries = compact->builder->NumEntries(); + if (s.ok()) { + s = compact->builder->Finish(); + } else { + compact->builder->Abandon(); + } + const uint64_t current_bytes = compact->builder->FileSize(); + compact->current_output()->file_size = current_bytes; + compact->total_bytes += current_bytes; + delete compact->builder; + compact->builder = NULL; + + // Finish and check for file errors + if (s.ok()) { + s = compact->outfile->Sync(); + } + if (s.ok()) { + s = compact->outfile->Close(); + } + delete compact->outfile; + compact->outfile = NULL; + + if (s.ok() && current_entries > 0) { + // Verify that the table is usable + Iterator* iter = table_cache_->NewIterator(ReadOptions(),output_number); + s = iter->status(); + delete iter; + if (s.ok()) { + Log(env_, options_.info_log, + "Generated table #%llu: %lld 
keys, %lld bytes", + (unsigned long long) output_number, + (unsigned long long) current_entries, + (unsigned long long) current_bytes); + } + } + return s; +} + + +Status DBImpl::InstallCompactionResults(CompactionState* compact) { + mutex_.AssertHeld(); + Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->level() + 1, + static_cast(compact->total_bytes)); + + // Add compaction outputs + compact->compaction->AddInputDeletions(compact->compaction->edit()); + const int level = compact->compaction->level(); + for (int i = 0; i < compact->outputs.size(); i++) { + const CompactionState::Output& out = compact->outputs[i]; + compact->compaction->edit()->AddFile( + level + 1, + out.number, out.file_size, out.smallest, out.largest); + pending_outputs_.erase(out.number); + } + compact->outputs.clear(); + + Status s = Install(compact->compaction->edit(), log_number_, NULL); + if (s.ok()) { + compact->compaction->ReleaseInputs(); + DeleteObsoleteFiles(); + } else { + // Discard any files we may have created during this failed compaction + for (int i = 0; i < compact->outputs.size(); i++) { + env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number)); + } + } + return s; +} + +Status DBImpl::DoCompactionWork(CompactionState* compact) { + Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->level() + 1); + + assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); + assert(compact->builder == NULL); + assert(compact->outfile == NULL); + if (snapshots_.empty()) { + compact->smallest_snapshot = last_sequence_; + } else { + compact->smallest_snapshot = snapshots_.oldest()->number_; + } + + // Release mutex while we're actually doing the compaction work + compacting_ = true; + mutex_.Unlock(); + + Iterator* input = versions_->MakeInputIterator(compact->compaction); + input->SeekToFirst(); + Status status; + ParsedInternalKey ikey; + std::string current_user_key; + bool has_current_user_key = false; + SequenceNumber last_sequence_for_key = kMaxSequenceNumber; + for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { + // Handle key/value, add to state, etc. + Slice key = input->key(); + bool drop = false; + if (!ParseInternalKey(key, &ikey)) { + // Do not hide error keys + current_user_key.clear(); + has_current_user_key = false; + last_sequence_for_key = kMaxSequenceNumber; + } else { + if (!has_current_user_key || + user_comparator()->Compare(ikey.user_key, + Slice(current_user_key)) != 0) { + // First occurrence of this user key + current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); + has_current_user_key = true; + last_sequence_for_key = kMaxSequenceNumber; + } + + if (last_sequence_for_key <= compact->smallest_snapshot) { + // Hidden by an newer entry for same user key + drop = true; // (A) + } else if (ikey.type == kTypeDeletion && + ikey.sequence <= compact->smallest_snapshot && + compact->compaction->IsBaseLevelForKey(ikey.user_key)) { + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). 
+ // Therefore this deletion marker is obsolete and can be dropped. + drop = true; + } + + last_sequence_for_key = ikey.sequence; + } +#if 0 + Log(env_, options_.info_log, + " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " + "%d smallest_snapshot: %d", + ikey.user_key.ToString().c_str(), + (int)ikey.sequence, ikey.type, kTypeLargeValueRef, drop, + compact->compaction->IsBaseLevelForKey(ikey.user_key), + (int)last_sequence_for_key, (int)compact->smallest_snapshot); +#endif + + if (!drop) { + // Open output file if necessary + if (compact->builder == NULL) { + status = OpenCompactionOutputFile(compact); + if (!status.ok()) { + break; + } + } + if (compact->builder->NumEntries() == 0) { + compact->current_output()->smallest.DecodeFrom(key); + } + compact->current_output()->largest.DecodeFrom(key); + + if (ikey.type == kTypeLargeValueRef) { + if (input->value().size() != LargeValueRef::ByteSize()) { + if (options_.paranoid_checks) { + status = Status::Corruption("invalid large value ref"); + break; + } else { + Log(env_, options_.info_log, + "compaction found invalid large value ref"); + } + } else { + compact->compaction->edit()->AddLargeValueRef( + LargeValueRef::FromRef(input->value()), + compact->current_output()->number, + input->key()); + compact->builder->Add(key, input->value()); + } + } else { + compact->builder->Add(key, input->value()); + } + + // Close output file if it is big enough + if (compact->builder->FileSize() >= + compact->compaction->MaxOutputFileSize()) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + } + + input->Next(); + } + + if (status.ok() && shutting_down_.Acquire_Load()) { + status = Status::IOError("Deleting DB during compaction"); + } + if (status.ok() && compact->builder != NULL) { + status = FinishCompactionOutputFile(compact, input); + } + if (status.ok()) { + status = input->status(); + } + delete input; + input = NULL; + + mutex_.Lock(); + + if (status.ok()) { + status = InstallCompactionResults(compact); + } + compacting_ = false; + compacting_cv_.SignalAll(); + return status; +} + +Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, + SequenceNumber* latest_snapshot) { + mutex_.Lock(); + *latest_snapshot = last_sequence_; + + // Collect together all needed child iterators + std::vector list; + list.push_back(mem_->NewIterator()); + versions_->current()->AddIterators(options, &list); + Iterator* internal_iter = + NewMergingIterator(&internal_comparator_, &list[0], list.size()); + versions_->current()->Ref(); + internal_iter->RegisterCleanup(&DBImpl::Unref, this, versions_->current()); + + mutex_.Unlock(); + return internal_iter; +} + +Iterator* DBImpl::TEST_NewInternalIterator() { + SequenceNumber ignored; + return NewInternalIterator(ReadOptions(), &ignored); +} + +Status DBImpl::Get(const ReadOptions& options, + const Slice& key, + std::string* value) { + // TODO(opt): faster implementation + Iterator* iter = NewIterator(options); + iter->Seek(key); + bool found = false; + if (iter->Valid() && user_comparator()->Compare(key, iter->key()) == 0) { + Slice v = iter->value(); + value->assign(v.data(), v.size()); + found = true; + } + // Non-OK iterator status trumps everything else + Status result = iter->status(); + if (result.ok() && !found) { + result = Status::NotFound(Slice()); // Use an empty error message for speed + } + delete iter; + return result; +} + +Iterator* DBImpl::NewIterator(const ReadOptions& options) { + SequenceNumber latest_snapshot; + Iterator* internal_iter = 
NewInternalIterator(options, &latest_snapshot);
+  SequenceNumber sequence =
+      (options.snapshot ? options.snapshot->number_ : latest_snapshot);
+  return NewDBIterator(&dbname_, env_,
+                       user_comparator(), internal_iter, sequence);
+}
+
+void DBImpl::Unref(void* arg1, void* arg2) {
+  DBImpl* impl = reinterpret_cast<DBImpl*>(arg1);
+  Version* v = reinterpret_cast<Version*>(arg2);
+  MutexLock l(&impl->mutex_);
+  v->Unref();
+}
+
+const Snapshot* DBImpl::GetSnapshot() {
+  MutexLock l(&mutex_);
+  return snapshots_.New(last_sequence_);
+}
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  MutexLock l(&mutex_);
+  snapshots_.Delete(s);
+}
+
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+  return DB::Put(o, key, val);
+}
+
+Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+  return DB::Delete(options, key);
+}
+
+Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
+  Status status;
+
+  WriteBatch* final = NULL;
+  {
+    MutexLock l(&mutex_);
+    if (!bg_error_.ok()) {
+      status = bg_error_;
+    } else if (mem_->ApproximateMemoryUsage() > options_.write_buffer_size) {
+      status = CompactMemTable();
+    }
+    if (status.ok()) {
+      status = HandleLargeValues(last_sequence_ + 1, updates, &final);
+    }
+    if (status.ok()) {
+      WriteBatchInternal::SetSequence(final, last_sequence_ + 1);
+      last_sequence_ += WriteBatchInternal::Count(final);
+
+      // Add to log and apply to memtable
+      status = log_->AddRecord(WriteBatchInternal::Contents(final));
+      if (status.ok() && options.sync) {
+        status = logfile_->Sync();
+      }
+      if (status.ok()) {
+        status = WriteBatchInternal::InsertInto(final, mem_);
+      }
+    }
+
+    if (options.post_write_snapshot != NULL) {
+      *options.post_write_snapshot =
+          status.ok() ? snapshots_.New(last_sequence_) : NULL;
+    }
+  }
+  if (final != updates) {
+    delete final;
+  }
+
+  return status;
+}
+
+bool DBImpl::HasLargeValues(const WriteBatch& batch) const {
+  if (WriteBatchInternal::ByteSize(&batch) >= options_.large_value_threshold) {
+    for (WriteBatchInternal::Iterator it(batch); !it.Done(); it.Next()) {
+      if (it.op() == kTypeValue &&
+          it.value().size() >= options_.large_value_threshold) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Given "raw_value", determines the appropriate compression format to use
+// and stores the data that should be written to the large value file in
+// "*file_bytes", and sets "*ref" to the appropriate large value reference.
+// May use "*scratch" as backing store for "*file_bytes".
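+//
+// Worked example (illustrative, based on the "(raw_value.size() / 8) * 7"
+// test below): a 100000-byte value is stored in compressed form only if
+// Lightweight_Compress() shrinks it below 87500 bytes, i.e. only when
+// compression saves at least 12.5%; otherwise the raw bytes are written
+// and the reference is tagged kNoCompression.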
+void DBImpl::MaybeCompressLargeValue(
+    const Slice& raw_value,
+    Slice* file_bytes,
+    std::string* scratch,
+    LargeValueRef* ref) {
+  switch (options_.compression) {
+    case kLightweightCompression: {
+      port::Lightweight_Compress(raw_value.data(), raw_value.size(), scratch);
+      if (scratch->size() < (raw_value.size() / 8) * 7) {
+        *file_bytes = *scratch;
+        *ref = LargeValueRef::Make(raw_value, kLightweightCompression);
+        return;
+      }
+
+      // Less than 12.5% compression: just leave as uncompressed data
+      break;
+    }
+    case kNoCompression:
+      // Use default code outside of switch
+      break;
+  }
+  // Store as uncompressed data
+  *file_bytes = raw_value;
+  *ref = LargeValueRef::Make(raw_value, kNoCompression);
+}
+
+Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq,
+                                 WriteBatch* updates,
+                                 WriteBatch** final) {
+  if (!HasLargeValues(*updates)) {
+    // Fast path: no large values found
+    *final = updates;
+  } else {
+    // Copy *updates to a new WriteBatch, replacing the references to
+    // large values with indirect references.
+    *final = new WriteBatch;
+    SequenceNumber seq = assigned_seq;
+    for (WriteBatchInternal::Iterator it(*updates); !it.Done(); it.Next()) {
+      switch (it.op()) {
+        case kTypeValue:
+          if (it.value().size() < options_.large_value_threshold) {
+            (*final)->Put(it.key(), it.value());
+          } else {
+            std::string scratch;
+            Slice file_bytes;
+            LargeValueRef large_ref;
+            MaybeCompressLargeValue(
+                it.value(), &file_bytes, &scratch, &large_ref);
+            InternalKey ikey(it.key(), seq, kTypeLargeValueRef);
+            if (versions_->RegisterLargeValueRef(large_ref, log_number_, ikey)) {
+              // TODO(opt): avoid holding the lock here (but be careful about
+              // another thread doing a Write and changing log_number_ or
+              // having us get a different "assigned_seq" value).
+
+              uint64_t tmp_number = versions_->NewFileNumber();
+              pending_outputs_.insert(tmp_number);
+              std::string tmp = TempFileName(dbname_, tmp_number);
+              WritableFile* file;
+              Status s = env_->NewWritableFile(tmp, &file);
+              if (!s.ok()) {
+                return s;     // Caller will delete *final
+              }
+
+              file->Append(file_bytes);
+
+              s = file->Close();
+              delete file;
+
+              if (s.ok()) {
+                const std::string fname =
+                    LargeValueFileName(dbname_, large_ref);
+                s = env_->RenameFile(tmp, fname);
+              } else {
+                Log(env_, options_.info_log, "Write large value: %s",
+                    s.ToString().c_str());
+              }
+              pending_outputs_.erase(tmp_number);
+
+              if (!s.ok()) {
+                env_->DeleteFile(tmp);  // Cleanup; intentionally ignoring error
+                return s;     // Caller will delete *final
+              }
+            }
+
+            // Put an indirect reference in the write batch in place
+            // of large value
+            WriteBatchInternal::PutLargeValueRef(*final, it.key(), large_ref);
+          }
+          break;
+        case kTypeLargeValueRef:
+          return Status::Corruption("Corrupted write batch");
+          break;
+        case kTypeDeletion:
+          (*final)->Delete(it.key());
+          break;
+      }
+      seq = seq + 1;
+    }
+  }
+  return Status::OK();
+}
+
+bool DBImpl::GetProperty(const Slice& property, uint64_t* value) {
+  MutexLock l(&mutex_);
+  Slice in = property;
+  Slice prefix("leveldb.");
+  if (!in.starts_with(prefix)) return false;
+  in.remove_prefix(prefix.size());
+
+  if (in.starts_with("num-files-at-level")) {
+    in.remove_prefix(strlen("num-files-at-level"));
+    uint64_t level;
+    bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+    if (!ok || level < 0 || level >= config::kNumLevels) {
+      return false;
+    } else {
+      *value = versions_->NumLevelFiles(level);
+      return true;
+    }
+  }
+  return false;
+}
+
+void DBImpl::GetApproximateSizes(
+    const Range* range, int n,
+    uint64_t* sizes) {
+  // TODO(opt): better implementation
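+  //
+  // Illustrative call pattern (names invented for this sketch; see the
+  // Size() helper in db/db_test.cc for a real caller):
+  //   Range r("k1", "k2");
+  //   uint64_t size;
+  //   db->GetApproximateSizes(&r, 1, &size);
+  // Each result below is the difference between the approximate offsets of
+  // the two range endpoints within the current version, so it mainly
+  // reflects table-file bytes rather than memtable contents.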
Version* v; + { + MutexLock l(&mutex_); + versions_->current()->Ref(); + v = versions_->current(); + } + + for (int i = 0; i < n; i++) { + // Convert user_key into a corresponding internal key. + InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + uint64_t start = versions_->ApproximateOffsetOf(v, k1); + uint64_t limit = versions_->ApproximateOffsetOf(v, k2); + sizes[i] = (limit >= start ? limit - start : 0); + } + + { + MutexLock l(&mutex_); + v->Unref(); + } +} + +// Default implementations of convenience methods that subclasses of DB +// can call if they wish +Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { + WriteBatch batch; + batch.Put(key, value); + return Write(opt, &batch); +} + +Status DB::Delete(const WriteOptions& opt, const Slice& key) { + WriteBatch batch; + batch.Delete(key); + return Write(opt, &batch); +} + +DB::~DB() { } + +Status DB::Open(const Options& options, const std::string& dbname, + DB** dbptr) { + *dbptr = NULL; + + DBImpl* impl = new DBImpl(options, dbname); + impl->mutex_.Lock(); + VersionEdit edit; + Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists + if (s.ok()) { + impl->log_number_ = impl->versions_->NewFileNumber(); + WritableFile* lfile; + s = options.env->NewWritableFile(LogFileName(dbname, impl->log_number_), + &lfile); + if (s.ok()) { + impl->logfile_ = lfile; + impl->log_ = new log::Writer(lfile); + s = impl->Install(&edit, impl->log_number_, NULL); + } + if (s.ok()) { + impl->DeleteObsoleteFiles(); + } + } + impl->mutex_.Unlock(); + if (s.ok()) { + *dbptr = impl; + } else { + delete impl; + } + return s; +} + +Status DestroyDB(const std::string& dbname, const Options& options) { + Env* env = options.env; + std::vector filenames; + // Ignore error in case directory does not exist + env->GetChildren(dbname, &filenames); + if (filenames.empty()) { + return Status::OK(); + } + + FileLock* lock; + Status result = env->LockFile(LockFileName(dbname), &lock); + if (result.ok()) { + uint64_t number; + LargeValueRef large_ref; + FileType type; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type)) { + Status del = env->DeleteFile(dbname + "/" + filenames[i]); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + env->UnlockFile(lock); // Ignore error since state is already gone + env->DeleteFile(LockFileName(dbname)); + env->DeleteDir(dbname); // Ignore error in case dir contains other files + } + return result; +} + +} diff --git a/db/db_impl.h b/db/db_impl.h new file mode 100644 index 0000000..fc3d3f2 --- /dev/null +++ b/db/db_impl.h @@ -0,0 +1,192 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
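+//
+// DBImpl, declared below, is the concrete implementation of the DB
+// interface from include/db.h; its compaction entry points and
+// large-value helpers are defined in db/db_impl.cc.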
+
+#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
+#define STORAGE_LEVELDB_DB_DB_IMPL_H_
+
+#include <set>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "include/db.h"
+#include "include/env.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class DBImpl : public DB {
+ public:
+  DBImpl(const Options& options, const std::string& dbname);
+  virtual ~DBImpl();
+
+  // Implementations of the DB interface
+  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+  virtual Status Delete(const WriteOptions&, const Slice& key);
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value);
+  virtual Iterator* NewIterator(const ReadOptions&);
+  virtual const Snapshot* GetSnapshot();
+  virtual void ReleaseSnapshot(const Snapshot* snapshot);
+  virtual bool GetProperty(const Slice& property, uint64_t* value);
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
+
+  // Extra methods (for testing) that are not in the public DB interface
+
+  // Compact any files in the named level that overlap [begin,end]
+  void TEST_CompactRange(
+      int level,
+      const std::string& begin,
+      const std::string& end);
+
+  // Force current memtable contents to be compacted.
+  Status TEST_CompactMemTable();
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  Iterator* TEST_NewInternalIterator();
+
+ private:
+  friend class DB;
+
+  Iterator* NewInternalIterator(const ReadOptions&,
+                                SequenceNumber* latest_snapshot);
+
+  Status NewDB();
+
+  // Recover the descriptor from persistent storage.  May do a significant
+  // amount of work to recover recently logged updates.  Any changes to
+  // be made to the descriptor are added to *edit.
+  Status Recover(VersionEdit* edit);
+
+  // Apply the specified updates and save the resulting descriptor to
+  // persistent storage.  If cleanup_mem is non-NULL, arrange to
+  // delete it when all existing snapshots have gone away iff Install()
+  // returns OK.
+  Status Install(VersionEdit* edit,
+                 uint64_t new_log_number,
+                 MemTable* cleanup_mem);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+
+  // Called when an iterator over a particular version of the
+  // descriptor goes away.
+  static void Unref(void* arg1, void* arg2);
+
+  // Compact the in-memory write buffer to disk.  Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful.
+  Status CompactMemTable();
+
+  Status RecoverLogFile(uint64_t log_number,
+                        VersionEdit* edit,
+                        SequenceNumber* max_sequence);
+
+  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);
+
+  bool HasLargeValues(const WriteBatch& batch) const;
+
+  // Process data in "*updates" and return a status.  "assigned_seq"
+  // is the sequence number assigned to the first mod in "*updates".
+  // If no large values are encountered, "*final" is set to "updates".
+  // If large values were encountered, registers the references of the
+  // large values with the VersionSet, writes the large values to
+  // files (if appropriate), and allocates a new WriteBatch with the
+  // large values replaced with indirect references and stores a
+  // pointer to the new WriteBatch in *final.  If *final != updates on
+  // return, then the client should delete *final when no longer
+  // needed.  Returns OK on success, and an appropriate error
+  // otherwise.
+  Status HandleLargeValues(SequenceNumber assigned_seq,
+                           WriteBatch* updates,
+                           WriteBatch** final);
+
+  // Helper routine for HandleLargeValues
+  void MaybeCompressLargeValue(
+      const Slice& raw_value,
+      Slice* file_bytes,
+      std::string* scratch,
+      LargeValueRef* ref);
+
+  struct CompactionState;
+
+  void MaybeScheduleCompaction();
+  static void BGWork(void* db);
+  void BackgroundCall();
+  void BackgroundCompaction();
+  void CleanupCompaction(CompactionState* compact);
+  Status DoCompactionWork(CompactionState* compact);
+
+  Status OpenCompactionOutputFile(CompactionState* compact);
+  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+  Status InstallCompactionResults(CompactionState* compact);
+
+  // Constant after construction
+  Env* const env_;
+  const InternalKeyComparator internal_comparator_;
+  const Options options_;  // options_.comparator == &internal_comparator_
+  bool owns_info_log_;
+  const std::string dbname_;
+
+  // table_cache_ provides its own synchronization
+  TableCache* table_cache_;
+
+  // Lock over the persistent DB state.  Non-NULL iff successfully acquired.
+  FileLock* db_lock_;
+
+  // State below is protected by mutex_
+  port::Mutex mutex_;
+  port::AtomicPointer shutting_down_;
+  port::CondVar bg_cv_;          // Signalled when !bg_compaction_scheduled_
+  port::CondVar compacting_cv_;  // Signalled when !compacting_
+  SequenceNumber last_sequence_;
+  MemTable* mem_;
+  WritableFile* logfile_;
+  log::Writer* log_;
+  uint64_t log_number_;
+  SnapshotList snapshots_;
+
+  // Set of table files to protect from deletion because they are
+  // part of ongoing compactions.
+  std::set<uint64_t> pending_outputs_;
+
+  // Has a background compaction been scheduled or is running?
+  bool bg_compaction_scheduled_;
+
+  // Is there a compaction running?
+  bool compacting_;
+
+  VersionSet* versions_;
+
+  // Have we encountered a background error in paranoid mode?
+  Status bg_error_;
+
+  // No copying allowed
+  DBImpl(const DBImpl&);
+  void operator=(const DBImpl&);
+
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+};
+
+// Sanitize db options.  The caller should delete result.info_log if
+// it is not equal to src.info_log.
+extern Options SanitizeOptions(const std::string& db,
+                               const InternalKeyComparator* icmp,
+                               const Options& src);
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_DB_IMPL_H_
diff --git a/db/db_iter.cc b/db/db_iter.cc
new file mode 100644
index 0000000..c23de22
--- /dev/null
+++ b/db/db_iter.cc
@@ -0,0 +1,412 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
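+//
+// This file defines DBIter, which wraps the merged internal-key iterator
+// built by DBImpl::NewInternalIterator() and exposes the user-facing
+// iterator handed out by DB::NewIterator() (see the class comment below).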
+ +#include "db/db_iter.h" + +#include "db/filename.h" +#include "db/dbformat.h" +#include "include/env.h" +#include "include/iterator.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/mutexlock.h" + +namespace leveldb { + +#if 0 +static void DumpInternalIter(Iterator* iter) { + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey k; + if (!ParseInternalKey(iter->key(), &k)) { + fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); + } else { + fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); + } + } +} +#endif + +namespace { + +// Memtables and sstables that make the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter: public Iterator { + public: + DBIter(const std::string* dbname, Env* env, + const Comparator* cmp, Iterator* iter, SequenceNumber s) + : dbname_(dbname), + env_(env), + user_comparator_(cmp), + iter_(iter), + sequence_(s), + large_(NULL), + valid_(false) { + } + virtual ~DBIter() { + delete iter_; + delete large_; + } + virtual bool Valid() const { return valid_; } + virtual Slice key() const { + assert(valid_); + return key_; + } + virtual Slice value() const { + assert(valid_); + if (large_ == NULL) { + return value_; + } else { + MutexLock l(&large_->mutex); + if (!large_->produced) { + ReadIndirectValue(); + } + return large_->value; + } + } + + virtual void Next() { + assert(valid_); + // iter_ is already positioned past DBIter::key() + FindNextUserEntry(); + } + + virtual void Prev() { + assert(valid_); + bool ignored; + ScanUntilBeforeCurrentKey(&ignored); + FindPrevUserEntry(); + } + + virtual void Seek(const Slice& target) { + ParsedInternalKey ikey(target, sequence_, kValueTypeForSeek); + std::string tmp; + AppendInternalKey(&tmp, ikey); + iter_->Seek(tmp); + FindNextUserEntry(); + } + virtual void SeekToFirst() { + iter_->SeekToFirst(); + FindNextUserEntry(); + } + + virtual void SeekToLast(); + + virtual Status status() const { + if (status_.ok()) { + if (large_ != NULL && !large_->status.ok()) return large_->status; + return iter_->status(); + } else { + return status_; + } + } + + private: + void FindNextUserEntry(); + void FindPrevUserEntry(); + void SaveKey(const Slice& k) { key_.assign(k.data(), k.size()); } + void SaveValue(const Slice& v) { + if (value_.capacity() > v.size() + 1048576) { + std::string empty; + swap(empty, value_); + } + value_.assign(v.data(), v.size()); + } + bool ParseKey(ParsedInternalKey* key); + void SkipPast(const Slice& k); + void ScanUntilBeforeCurrentKey(bool* found_live); + + void ReadIndirectValue() const; + + struct Large { + port::Mutex mutex; + std::string value; + bool produced; + Status status; + }; + + const std::string* const dbname_; + Env* const env_; + + const Comparator* const user_comparator_; + + // iter_ is positioned just past current entry for DBIter if valid_ + Iterator* const iter_; + + SequenceNumber const sequence_; + Status status_; + std::string key_; // Always a user key + std::string value_; + Large* large_; // Non-NULL if value is an indirect reference + bool valid_; + + // No copying allowed + DBIter(const DBIter&); + void operator=(const DBIter&); +}; + +inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { + if (!ParseInternalKey(iter_->key(), ikey)) { + status_ = Status::Corruption("corrupted internal key in 
DBIter"); + return false; + } else { + return true; + } +} + +void DBIter::FindNextUserEntry() { + if (large_ != NULL) { + if (status_.ok() && !large_->status.ok()) { + status_ = large_->status; + } + delete large_; + large_ = NULL; + } + while (iter_->Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + // Skip past corrupted entry + iter_->Next(); + continue; + } + if (ikey.sequence > sequence_) { + // Ignore entries newer than the snapshot + iter_->Next(); + continue; + } + + switch (ikey.type) { + case kTypeDeletion: + SaveKey(ikey.user_key); // Make local copy for use by SkipPast() + iter_->Next(); + SkipPast(key_); + // Do not return deleted entries. Instead keep looping. + break; + + case kTypeValue: + SaveKey(ikey.user_key); + SaveValue(iter_->value()); + iter_->Next(); + SkipPast(key_); + // Yield the value we just found. + valid_ = true; + return; + + case kTypeLargeValueRef: + SaveKey(ikey.user_key); + // Save the large value ref as value_, and read it lazily on a call + // to value() + SaveValue(iter_->value()); + large_ = new Large; + large_->produced = false; + iter_->Next(); + SkipPast(key_); + // Yield the value we just found. + valid_ = true; + return; + } + } + valid_ = false; + key_.clear(); + value_.clear(); + assert(large_ == NULL); +} + +void DBIter::SkipPast(const Slice& k) { + while (iter_->Valid()) { + ParsedInternalKey ikey; + // Note that if we cannot parse an internal key, we keep looping + // so that if we have a run like the following: + // => value100 + // + // => value50 + // we will skip over the corrupted entry as well as value50. + if (ParseKey(&ikey) && user_comparator_->Compare(ikey.user_key, k) != 0) { + break; + } + iter_->Next(); + } +} + +void DBIter::SeekToLast() { + // Position iter_ at the last uncorrupted user key and then + // let FindPrevUserEntry() do the heavy lifting to find + // a user key that is live. + iter_->SeekToLast(); + ParsedInternalKey current; + while (iter_->Valid() && !ParseKey(¤t)) { + iter_->Prev(); + } + if (iter_->Valid()) { + SaveKey(current.user_key); + } + FindPrevUserEntry(); +} + +// Let X be the user key at which iter_ is currently positioned. +// Adjust DBIter to point at the last entry with a key <= X that +// has a live value. +void DBIter::FindPrevUserEntry() { + // Consider the following example: + // + // A@540 + // A@400 + // + // B@300 + // B@200 + // B@100 <- iter_ + // + // C@301 + // C@201 + // + // The comments marked "(first iteration)" below relate what happens + // for the preceding example in the first iteration of the while loop + // below. There may be more than one iteration either if there are + // no live values for B, or if there is a corruption. + while (iter_->Valid()) { + std::string saved = key_; + bool found_live; + ScanUntilBeforeCurrentKey(&found_live); + // (first iteration) iter_ at A@400 + if (found_live) { + // Step forward into range of entries with user key >= saved + if (!iter_->Valid()) { + iter_->SeekToFirst(); + } else { + iter_->Next(); + } + // (first iteration) iter_ at B@300 + + FindNextUserEntry(); // Sets key_ to the key of the next value it found + if (valid_ && user_comparator_->Compare(key_, saved) == 0) { + // (first iteration) iter_ at C@301 + return; + } + + // FindNextUserEntry() could not find any entries under the + // user key "saved". This is probably a corruption since + // ScanUntilBefore(saved) found a live value. So we skip + // backwards to an earlier key and ignore the corrupted + // entries for "saved". 
+ // + // (first iteration) iter_ at C@301 and saved == "B" + key_ = saved; + bool ignored; + ScanUntilBeforeCurrentKey(&ignored); + // (first iteration) iter_ at A@400 + } + } + valid_ = false; + key_.clear(); + value_.clear(); +} + +void DBIter::ScanUntilBeforeCurrentKey(bool* found_live) { + *found_live = false; + if (!iter_->Valid()) { + iter_->SeekToLast(); + } + + while (iter_->Valid()) { + ParsedInternalKey current; + if (!ParseKey(¤t)) { + iter_->Prev(); + continue; + } + + if (current.sequence > sequence_) { + // Ignore entries that are serialized after this read + iter_->Prev(); + continue; + } + + const int cmp = user_comparator_->Compare(current.user_key, key_); + if (cmp < 0) { + SaveKey(current.user_key); + return; + } else if (cmp == 0) { + switch (current.type) { + case kTypeDeletion: + *found_live = false; + break; + + case kTypeValue: + case kTypeLargeValueRef: + *found_live = true; + break; + } + } else { // cmp > 0 + *found_live = false; + } + + iter_->Prev(); + } +} + +void DBIter::ReadIndirectValue() const { + assert(!large_->produced); + large_->produced = true; + LargeValueRef large_ref; + if (value_.size() != LargeValueRef::ByteSize()) { + large_->status = Status::Corruption("malformed large value reference"); + return; + } + memcpy(large_ref.data, value_.data(), LargeValueRef::ByteSize()); + std::string fname = LargeValueFileName(*dbname_, large_ref); + RandomAccessFile* file; + Status s = env_->NewRandomAccessFile(fname, &file); + if (s.ok()) { + uint64_t file_size = file->Size(); + uint64_t value_size = large_ref.ValueSize(); + large_->value.resize(value_size); + Slice result; + s = file->Read(0, file_size, &result, + const_cast(large_->value.data())); + if (s.ok()) { + if (result.size() == file_size) { + switch (large_ref.compression_type()) { + case kNoCompression: { + if (result.data() != large_->value.data()) { + large_->value.assign(result.data(), result.size()); + } + break; + } + case kLightweightCompression: { + std::string uncompressed; + if (port::Lightweight_Uncompress(result.data(), result.size(), + &uncompressed) && + uncompressed.size() == large_ref.ValueSize()) { + swap(uncompressed, large_->value); + } else { + s = Status::Corruption( + "Unable to read entire compressed large value file"); + } + } + } + } else { + s = Status::Corruption("Unable to read entire large value file"); + } + } + delete file; // Ignore errors on closing + } + if (!s.ok()) { + large_->value.clear(); + large_->status = s; + } +} + +} // anonymous namespace + +Iterator* NewDBIterator( + const std::string* dbname, + Env* env, + const Comparator* user_key_comparator, + Iterator* internal_iter, + const SequenceNumber& sequence) { + return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence); +} + +} diff --git a/db/db_iter.h b/db/db_iter.h new file mode 100644 index 0000000..a0be50e --- /dev/null +++ b/db/db_iter.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_ +#define STORAGE_LEVELDB_DB_DB_ITER_H_ + +#include +#include "include/db.h" +#include "db/dbformat.h" + +namespace leveldb { + +// Return a new iterator that converts internal keys (yielded by +// "*internal_iter") that were live at the specified "sequence" number +// into appropriate user keys. 
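+//
+// Hypothetical usage sketch (caller-side names are invented here; see
+// DBImpl::NewIterator() in db/db_impl.cc for the real call site):
+//   Iterator* it = NewDBIterator(&dbname, env, user_cmp,
+//                                internal_iter, sequence);
+//   ... use it ...
+//   delete it;   // also deletes internal_iter, which DBIter owns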
+extern Iterator* NewDBIterator( + const std::string* dbname, + Env* env, + const Comparator* user_key_comparator, + Iterator* internal_iter, + const SequenceNumber& sequence); + +} + +#endif // STORAGE_LEVELDB_DB_DB_ITER_H_ diff --git a/db/db_test.cc b/db/db_test.cc new file mode 100644 index 0000000..895a5e1 --- /dev/null +++ b/db/db_test.cc @@ -0,0 +1,963 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "include/db.h" + +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "include/env.h" +#include "include/table.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +class DBTest { + public: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + + DBTest() : env_(Env::Default()) { + dbname_ = test::TmpDir() + "/db_test"; + DestroyDB(dbname_, Options()); + db_ = NULL; + Reopen(); + } + + ~DBTest() { + delete db_; + DestroyDB(dbname_, Options()); + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = NULL) { + ASSERT_OK(TryReopen(options)); + } + + void DestroyAndReopen(Options* options = NULL) { + delete db_; + db_ = NULL; + DestroyDB(dbname_, Options()); + ASSERT_OK(TryReopen(options)); + } + + Status TryReopen(Options* options) { + delete db_; + db_ = NULL; + Options opts; + if (options != NULL) { + opts = *options; + } else { + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const std::string& k, const std::string& v) { + WriteBatch batch; + batch.Put(k, v); + return db_->Write(WriteOptions(), &batch); + } + + Status Delete(const std::string& k) { + WriteBatch batch; + batch.Delete(k); + return db_->Write(WriteOptions(), &batch); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + std::string AllEntriesFor(const Slice& user_key) { + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); + iter->Seek(target.Encode()); + std::string result; + if (!iter->status().ok()) { + result = iter->status().ToString(); + } else { + result = "[ "; + bool first = true; + while (iter->Valid()) { + ParsedInternalKey ikey; + if (!ParseInternalKey(iter->key(), &ikey)) { + result += "CORRUPTED"; + } else { + if (last_options_.comparator->Compare( + ikey.user_key, user_key) != 0) { + break; + } + if (!first) { + result += ", "; + } + first = false; + switch (ikey.type) { + case kTypeValue: + result += iter->value().ToString(); + break; + case kTypeLargeValueRef: + result += "LARGEVALUE(" + EscapeString(iter->value()) + ")"; + break; + case kTypeDeletion: + result += "DEL"; + break; + } + } + iter->Next(); + } + if (!first) { + result += " "; + } + result += "]"; + } + delete iter; + return result; + } + + int NumTableFilesAtLevel(int level) { + uint64_t val; + ASSERT_TRUE( + 
db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level), + &val)); + return val; + } + + uint64_t Size(const Slice& start, const Slice& limit) { + Range r(start, limit); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + std::set LargeValueFiles() const { + // Return the set of large value files that exist in the database + std::vector filenames; + env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose + uint64_t number; + LargeValueRef large_ref; + FileType type; + std::set live; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type) && + type == kLargeValueFile) { + fprintf(stderr, " live: %s\n", + LargeValueRefToFilenameString(large_ref).c_str()); + live.insert(large_ref); + } + } + fprintf(stderr, "Found %d live large value files\n", (int)live.size()); + return live; + } +}; + +TEST(DBTest, Empty) { + ASSERT_TRUE(db_ != NULL); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST(DBTest, ReadWrite) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); +} + +TEST(DBTest, PutDeleteGet) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + ASSERT_OK(db_->Delete(WriteOptions(), "foo")); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST(DBTest, Recover) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("baz", "v5")); + + Reopen(); + ASSERT_EQ("v1", Get("foo")); + + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v5", Get("baz")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + + Reopen(); + ASSERT_EQ("v3", Get("foo")); + ASSERT_OK(Put("foo", "v4")); + ASSERT_EQ("v4", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ("v5", Get("baz")); +} + +TEST(DBTest, RecoveryWithEmptyLog) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("foo", "v2")); + Reopen(); + Reopen(); + ASSERT_OK(Put("foo", "v3")); + Reopen(); + ASSERT_EQ("v3", Get("foo")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key%06d", i); + return std::string(buf); +} + +TEST(DBTest, MinorCompactionsHappen) { + Options options; + options.write_buffer_size = 10000; + Reopen(&options); + + const int N = 100; + + int starting_num_tables = NumTableFilesAtLevel(0); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); + } + int ending_num_tables = NumTableFilesAtLevel(0); + ASSERT_GT(ending_num_tables, starting_num_tables); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } + + Reopen(); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } +} + +TEST(DBTest, RecoverWithLargeLog) { + { + Options options; + options.large_value_threshold = 1048576; + Reopen(&options); + ASSERT_OK(Put("big1", std::string(200000, '1'))); + ASSERT_OK(Put("big2", std::string(200000, '2'))); + ASSERT_OK(Put("small3", std::string(10, '3'))); + ASSERT_OK(Put("small4", std::string(10, '4'))); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + } + + // Make sure that if we re-open with a small write buffer size that + // we flush table files in the middle of a large log file. 
+ Options options; + options.write_buffer_size = 100000; + options.large_value_threshold = 1048576; + Reopen(&options); + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + ASSERT_EQ(std::string(200000, '1'), Get("big1")); + ASSERT_EQ(std::string(200000, '2'), Get("big2")); + ASSERT_EQ(std::string(10, '3'), Get("small3")); + ASSERT_EQ(std::string(10, '4'), Get("small4")); + ASSERT_GT(NumTableFilesAtLevel(0), 1); +} + +TEST(DBTest, CompactionsGenerateMultipleFiles) { + Options options; + options.write_buffer_size = 100000000; // Large write buffer + options.large_value_threshold = 1048576; + Reopen(&options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(Key(i), values[i])); + } + + // Reopening moves updates to level-0 + Reopen(&options); + dbfull()->TEST_CompactRange(0, "", Key(100000)); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 1); + for (int i = 0; i < 80; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +TEST(DBTest, ApproximateSizes) { + for (int test = 0; test < 2; test++) { + // test==0: default large_value_threshold + // test==1: 1 MB large_value_threshold + Options options; + options.large_value_threshold = (test == 0) ? 65536 : 1048576; + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + DestroyAndReopen(); + + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + Reopen(&options); + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + const int N = 80; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); + } + if (test == 1) { + // 0 because GetApproximateSizes() does not account for memtable space for + // non-large values + ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); + } else { + ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000)); + ASSERT_TRUE(Between(Size(Key(20), Key(30)), + 100000*10, 100000*10 + 10000)); + } + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); + ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), + 100000 * (i+1), 100000 * (i+1) + 10000)); + ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), + 100000 * 10, 100000 * 10 + 10000)); + } + ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); + ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); + + dbfull()->TEST_CompactRange(0, + Key(compact_start), + Key(compact_start + 9)); + } + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + } + } +} + +TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { + Options options; + options.large_value_threshold = 65536; + options.compression = kNoCompression; + Reopen(); + + Random rnd(301); + std::string big1 = RandomString(&rnd, 100000); + ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); + 
ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(2), big1)); + ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(4), big1)); + ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); + ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); + ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); + ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000)); + + ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); + + dbfull()->TEST_CompactRange(0, Key(0), Key(100)); + } +} + +TEST(DBTest, IteratorPinsRef) { + Put("foo", "hello"); + + // Get iterator that will yield the current contents of the DB. + Iterator* iter = db_->NewIterator(ReadOptions()); + + // Write to force compactions + Put("foo", "newvalue1"); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values + } + Put("foo", "newvalue2"); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("hello", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + delete iter; +} + +TEST(DBTest, Snapshot) { + Put("foo", "v1"); + const Snapshot* s1 = db_->GetSnapshot(); + Put("foo", "v2"); + const Snapshot* s2 = db_->GetSnapshot(); + Put("foo", "v3"); + const Snapshot* s3 = db_->GetSnapshot(); + + Put("foo", "v4"); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v3", Get("foo", s3)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s3); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s1); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s2); + ASSERT_EQ("v4", Get("foo")); +} + +TEST(DBTest, HiddenValuesAreRemoved) { + Random rnd(301); + std::string big = RandomString(&rnd, 50000); + Put("foo", big); + Put("pastfoo", "v"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put("foo", "tiny"); + Put("pastfoo2", "v2"); // Advance sequence number one more + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_GT(NumTableFilesAtLevel(0), 0); + + ASSERT_EQ(big, Get("foo", snapshot)); + ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); + dbfull()->TEST_CompactRange(0, "", "x"); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GE(NumTableFilesAtLevel(1), 1); + dbfull()->TEST_CompactRange(1, "", "x"); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + + ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); +} + +TEST(DBTest, DeletionMarkers1) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + dbfull()->TEST_CompactRange(0, "", "z"); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file + Delete("foo"); + Put("foo", "v2"); + 
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + dbfull()->TEST_CompactRange(0, "", "z"); + // DEL eliminated, but v1 remains because we aren't compacting that level + // (DEL can be eliminated because v2 hides v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + dbfull()->TEST_CompactRange(1, "", "z"); + // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. + // (as is v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); +} + +TEST(DBTest, DeletionMarkers2) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + dbfull()->TEST_CompactRange(0, "", "z"); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file + Delete("foo"); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(0, "", "z"); + // DEL kept: L2 file overlaps + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(1, "", "z"); + // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. + // (as is v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); +} + +TEST(DBTest, ComparatorCheck) { + class NewComparator : public Comparator { + public: + virtual const char* Name() const { return "leveldb.NewComparator"; } + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* s, const Slice& l) const { + BytewiseComparator()->FindShortestSeparator(s, l); + } + virtual void FindShortSuccessor(std::string* key) const { + BytewiseComparator()->FindShortSuccessor(key); + } + }; + NewComparator cmp; + Options new_options; + new_options.comparator = &cmp; + Status s = TryReopen(&new_options); + ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) + << s.ToString(); +} + +static bool LargeValuesOK(DBTest* db, + const std::set& expected) { + std::set actual = db->LargeValueFiles(); + if (actual.size() != expected.size()) { + fprintf(stderr, "Sets differ in size: %d vs %d\n", + (int)actual.size(), (int)expected.size()); + return false; + } + for (std::set::const_iterator it = expected.begin(); + it != expected.end(); + ++it) { + if (actual.count(*it) != 1) { + fprintf(stderr, " key '%s' not found in actual set\n", + LargeValueRefToFilenameString(*it).c_str()); + return false; + } + } + return true; +} + +TEST(DBTest, LargeValues1) { + Options options; + options.large_value_threshold = 10000; + Reopen(&options); + + Random rnd(301); + + std::string big1; + test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible + std::set expected; + + ASSERT_OK(Put("big1", big1)); + expected.insert(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Delete("big1")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + // No handling of deletion markers on memtable compactions, so big1 remains + ASSERT_TRUE(LargeValuesOK(this, expected)); + + dbfull()->TEST_CompactRange(0, "", "z"); + expected.erase(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); +} + +TEST(DBTest, LargeValues2) { + Options options; + options.large_value_threshold = 10000; + Reopen(&options); + + Random rnd(301); + + std::string big1, big2; + 
test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible + test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible + std::set expected; + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Put("big1", big1)); + expected.insert(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_EQ(big1, Get("big1")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Put("big2", big2)); + ASSERT_EQ(big2, Get("big2")); +#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM) + // TODO(sanjay) Reenable after compression support is added + expected.insert(LargeValueRef::Make(big2, kNoCompression)); +#else + expected.insert(LargeValueRef::Make(big2, kLightweightCompression)); +#endif + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Put("big2", big2)); + ASSERT_OK(Put("big2_b", big2)); + ASSERT_EQ(big1, Get("big1")); + ASSERT_EQ(big2, Get("big2")); + ASSERT_EQ(big2, Get("big2_b")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Delete("big1")); + ASSERT_EQ("NOT_FOUND", Get("big1")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_TRUE(LargeValuesOK(this, expected)); + dbfull()->TEST_CompactRange(0, "", "z"); + expected.erase(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); + dbfull()->TEST_CompactRange(1, "", "z"); + + ASSERT_OK(Delete("big2")); + ASSERT_EQ("NOT_FOUND", Get("big2")); + ASSERT_EQ(big2, Get("big2_b")); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_TRUE(LargeValuesOK(this, expected)); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + // Make sure the large value refs survive a reload and compactions after + // the reload. 
+  Reopen();
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  dbfull()->TEST_CompactRange(0, "", "z");
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+}
+
+TEST(DBTest, LargeValues3) {
+  // Make sure we don't compress values if compression is disabled.
+  Options options;
+  options.large_value_threshold = 10000;
+  options.compression = kNoCompression;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  std::string big1 = std::string(100000, 'x');  // Very compressible
+  std::set<LargeValueRef> expected;
+
+  ASSERT_OK(Put("big1", big1));
+  ASSERT_EQ(big1, Get("big1"));
+  expected.insert(LargeValueRef::Make(big1, kNoCompression));
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+}
+
+
+TEST(DBTest, DBOpen_Options) {
+  std::string dbname = test::TmpDir() + "/db_options_test";
+  DestroyDB(dbname, Options());
+
+  // Does not exist, and create_if_missing == false: error
+  DB* db = NULL;
+  Options opts;
+  opts.create_if_missing = false;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL);
+  ASSERT_TRUE(db == NULL);
+
+  // Does not exist, and create_if_missing == true: OK
+  opts.create_if_missing = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != NULL);
+
+  delete db;
+  db = NULL;
+
+  // Does exist, and error_if_exists == true: error
+  opts.create_if_missing = false;
+  opts.error_if_exists = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL);
+  ASSERT_TRUE(db == NULL);
+
+  // Does exist, and error_if_exists == false: OK
+  opts.create_if_missing = true;
+  opts.error_if_exists = false;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != NULL);
+
+  delete db;
+  db = NULL;
+}
+
+class ModelDB: public DB {
+ public:
+  explicit ModelDB(const Options& options): options_(options) { }
+  ~ModelDB() { }
+  virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
+    return DB::Put(o, k, v);
+  }
+  virtual Status Delete(const WriteOptions& o, const Slice& key) {
+    return DB::Delete(o, key);
+  }
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key, std::string* value) {
+    assert(false);      // Not implemented
+    return Status::NotFound(key);
+  }
+  virtual Iterator* NewIterator(const ReadOptions& options) {
+    if (options.snapshot == NULL) {
+      KVMap* saved = new KVMap;
+      *saved = map_;
+      return new ModelIter(saved, true);
+    } else {
+      const KVMap* snapshot_state =
+          reinterpret_cast<const KVMap*>(options.snapshot->number_);
+      return new ModelIter(snapshot_state, false);
+    }
+  }
+  virtual const Snapshot* GetSnapshot() {
+    KVMap* saved = new KVMap;
+    *saved = map_;
+    return snapshots_.New(
+        reinterpret_cast<SequenceNumber>(saved));
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) {
+    const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_);
+    delete saved;
+    snapshots_.Delete(snapshot);
+  }
+  virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
+    assert(options.post_write_snapshot == NULL);   // Not supported
+    for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) {
+      switch (it.op()) {
+        case kTypeValue:
+          map_[it.key().ToString()] = it.value().ToString();
+          break;
+        case kTypeLargeValueRef:
+          assert(false);   // Should not occur
+          break;
+        case kTypeDeletion:
+          map_.erase(it.key().ToString());
+          break;
+      }
+    }
+    return Status::OK();
+  }
+
+  virtual bool GetProperty(const Slice& property, uint64_t* value) {
+    return false;
+  }
+  virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
+    for (int i = 0; i < n; i++) {
+      sizes[i] = 0;
+    }
+  }
+ private:
+  typedef std::map<std::string, std::string> KVMap;
+  class ModelIter: public Iterator {
+   public:
+    ModelIter(const KVMap* map, bool owned)
+        : map_(map), owned_(owned), iter_(map_->end()) {
+    }
+    ~ModelIter() {
+      if (owned_) delete map_;
+    }
+    virtual bool Valid() const { return iter_ != map_->end(); }
+    virtual void SeekToFirst() { iter_ = map_->begin(); }
+    virtual void SeekToLast() {
+      if (map_->empty()) {
+        iter_ = map_->end();
+      } else {
+        iter_ = map_->find(map_->rbegin()->first);
+      }
+    }
+    virtual void Seek(const Slice& k) {
+      iter_ = map_->lower_bound(k.ToString());
+    }
+    virtual void Next() { ++iter_; }
+    virtual void Prev() { --iter_; }
+    virtual Slice key() const { return iter_->first; }
+    virtual Slice value() const { return iter_->second; }
+    virtual Status status() const { return Status::OK(); }
+   private:
+    const KVMap* const map_;
+    const bool owned_;  // Do we own map_
+    KVMap::const_iterator iter_;
+  };
+  const Options options_;
+  KVMap map_;
+  SnapshotList snapshots_;
+};
+
+static std::string RandomKey(Random* rnd) {
+  int len = (rnd->OneIn(3)
+             ? 1                // Short sometimes to encourage collisions
+             : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+  return test::RandomKey(rnd, len);
+}
+
+static bool CompareIterators(int step,
+                             DB* model,
+                             DB* db,
+                             const Snapshot* model_snap,
+                             const Snapshot* db_snap) {
+  ReadOptions options;
+  options.snapshot = model_snap;
+  Iterator* miter = model->NewIterator(options);
+  options.snapshot = db_snap;
+  Iterator* dbiter = db->NewIterator(options);
+  bool ok = true;
+  int count = 0;
+  for (miter->SeekToFirst(), dbiter->SeekToFirst();
+       ok && miter->Valid() && dbiter->Valid();
+       miter->Next(), dbiter->Next()) {
+    count++;
+    if (miter->key().compare(dbiter->key()) != 0) {
+      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(dbiter->key()).c_str());
+      ok = false;
+      break;
+    }
+
+    if (miter->value().compare(dbiter->value()) != 0) {
+      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(miter->value()).c_str(),
+              EscapeString(dbiter->value()).c_str());
+      ok = false;
+    }
+  }
+
+  if (ok) {
+    if (miter->Valid() != dbiter->Valid()) {
+      fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+              step, miter->Valid(), dbiter->Valid());
+      ok = false;
+    }
+  }
+  fprintf(stderr, "%d entries compared: ok=%d\n", count, ok);
+  delete miter;
+  delete dbiter;
+  return ok;
+}
+
+TEST(DBTest, Randomized) {
+  Random rnd(test::RandomSeed());
+  ModelDB model(last_options_);
+  const int N = 10000;
+  const Snapshot* model_snap = NULL;
+  const Snapshot* db_snap = NULL;
+  std::string k, v;
+  for (int step = 0; step < N; step++) {
+    if (step % 100 == 0) {
+      fprintf(stderr, "Step %d of %d\n", step, N);
+    }
+    int p = rnd.Uniform(100);
+    if (p < 45) {  // Put
+      k = RandomKey(&rnd);
+      v = RandomString(&rnd,
+                       rnd.OneIn(20)
+                       ?
100 + rnd.Uniform(100) + : rnd.Uniform(8)); + ASSERT_OK(model.Put(WriteOptions(), k, v)); + ASSERT_OK(db_->Put(WriteOptions(), k, v)); + + } else if (p < 90) { // Delete + k = RandomKey(&rnd); + ASSERT_OK(model.Delete(WriteOptions(), k)); + ASSERT_OK(db_->Delete(WriteOptions(), k)); + + + } else { // Multi-element batch + WriteBatch b; + const int num = rnd.Uniform(8); + for (int i = 0; i < num; i++) { + if (i == 0 || !rnd.OneIn(10)) { + k = RandomKey(&rnd); + } else { + // Periodically re-use the same key from the previous iter, so + // we have multiple entries in the write batch for the same key + } + if (rnd.OneIn(2)) { + v = RandomString(&rnd, rnd.Uniform(10)); + b.Put(k, v); + } else { + b.Delete(k); + } + } + ASSERT_OK(model.Write(WriteOptions(), &b)); + ASSERT_OK(db_->Write(WriteOptions(), &b)); + } + + if ((step % 100) == 0) { + ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + // Save a snapshot from each DB this time that we'll use next + // time we compare things, to make sure the current state is + // preserved with the snapshot + if (model_snap != NULL) model.ReleaseSnapshot(model_snap); + if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); + + Reopen(); + ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); + + model_snap = model.GetSnapshot(); + db_snap = db_->GetSnapshot(); + } + } + if (model_snap != NULL) model.ReleaseSnapshot(model_snap); + if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/dbformat.cc b/db/dbformat.cc new file mode 100644 index 0000000..f09a729 --- /dev/null +++ b/db/dbformat.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include "db/dbformat.h" +#include "port/port.h" +#include "util/coding.h" + +namespace leveldb { + +static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(t <= kValueTypeForSeek); + return (seq << 8) | t; +} + +void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { + result->append(key.user_key.data(), key.user_key.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +std::string ParsedInternalKey::DebugString() const { + char buf[50]; + snprintf(buf, sizeof(buf), "' @ %llu : %d", + (unsigned long long) sequence, + int(type)); + std::string result = "'"; + result += user_key.ToString(); + result += buf; + return result; +} + +const char* InternalKeyComparator::Name() const { + return "leveldb.InternalKeyComparator"; +} + +int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + if (r == 0) { + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +void InternalKeyComparator::FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Attempt to shorten the user portion of the key + Slice user_start = ExtractUserKey(*start); + Slice user_limit = ExtractUserKey(limit); + std::string tmp(user_start.data(), user_start.size()); + user_comparator_->FindShortestSeparator(&tmp, user_limit); + if (user_comparator_->Compare(*start, tmp) < 0) { + // User key has become larger. Tack on the earliest possible + // number to the shortened user key. + PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*start, tmp) < 0); + assert(this->Compare(tmp, limit) < 0); + start->swap(tmp); + } +} + +void InternalKeyComparator::FindShortSuccessor(std::string* key) const { + Slice user_key = ExtractUserKey(*key); + std::string tmp(user_key.data(), user_key.size()); + user_comparator_->FindShortSuccessor(&tmp); + if (user_comparator_->Compare(user_key, tmp) < 0) { + // User key has become larger. Tack on the earliest possible + // number to the shortened user key. 
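+    // (It is "earliest" because internal keys with equal user keys are
+    // ordered by decreasing sequence number and then decreasing type, so
+    // (kMaxSequenceNumber, kValueTypeForSeek) sorts before every other
+    // entry for the same user key.)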
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*key, tmp) < 0);
+    key->swap(tmp);
+  }
+}
+
+LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) {
+  LargeValueRef result;
+  port::SHA1_Hash(value.data(), value.size(), &result.data[0]);
+  EncodeFixed64(&result.data[20], value.size());
+  result.data[28] = static_cast<char>(ctype);
+  return result;
+}
+
+std::string LargeValueRefToFilenameString(const LargeValueRef& h) {
+  assert(sizeof(h.data) == LargeValueRef::ByteSize());
+  assert(sizeof(h.data) == 29);    // So we can hardcode the array size of buf
+  static const char tohex[] = "0123456789abcdef";
+  char buf[20*2];
+  for (int i = 0; i < 20; i++) {
+    buf[2*i] = tohex[(h.data[i] >> 4) & 0xf];
+    buf[2*i+1] = tohex[h.data[i] & 0xf];
+  }
+  std::string result = std::string(buf, sizeof(buf));
+  result += "-";
+  result += NumberToString(h.ValueSize());
+  result += "-";
+  result += NumberToString(static_cast<uint64_t>(h.compression_type()));
+  return result;
+}
+
+static uint32_t hexvalue(char c) {
+  if (c >= '0' && c <= '9') {
+    return c - '0';
+  } else if (c >= 'A' && c <= 'F') {
+    return 10 + c - 'A';
+  } else {
+    assert(c >= 'a' && c <= 'f');
+    return 10 + c - 'a';
+  }
+}
+
+bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) {
+  Slice in = s;
+  if (in.size() < 40) {
+    return false;
+  }
+  for (int i = 0; i < 20; i++) {
+    if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) {
+      return false;
+    }
+    unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]);
+    h->data[i] = c;
+  }
+  in.remove_prefix(40);
+  uint64_t value_size, ctype;
+
+  if (ConsumeChar(&in, '-') &&
+      ConsumeDecimalNumber(&in, &value_size) &&
+      ConsumeChar(&in, '-') &&
+      ConsumeDecimalNumber(&in, &ctype) &&
+      in.empty() &&
+      (ctype <= kLightweightCompression)) {
+    EncodeFixed64(&h->data[20], value_size);
+    h->data[28] = static_cast<char>(ctype);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}
diff --git a/db/dbformat.h b/db/dbformat.h
new file mode 100644
index 0000000..e784457
--- /dev/null
+++ b/db/dbformat.h
@@ -0,0 +1,198 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
+#define STORAGE_LEVELDB_DB_FORMAT_H_
+
+#include <stdio.h>
+#include "include/comparator.h"
+#include "include/db.h"
+#include "include/slice.h"
+#include "include/table_builder.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+enum ValueType {
+  kTypeDeletion = 0x0,
+  kTypeValue = 0x1,
+  kTypeLargeValueRef = 0x2,
+};
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+static const ValueType kValueTypeForSeek = kTypeLargeValueRef;
+
+typedef uint64_t SequenceNumber;
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
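+// For example, PackSequenceAndType(5, kTypeValue) in dbformat.cc yields
+// (5 << 8) | 0x1 == 0x501; decoding recovers the sequence as (num >> 8)
+// and the type as (num & 0xff), which is what ParseInternalKey below does.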
+static const SequenceNumber kMaxSequenceNumber =
+    ((0x1ull << 56) - 1);
+
+struct ParsedInternalKey {
+  Slice user_key;
+  SequenceNumber sequence;
+  ValueType type;
+
+  ParsedInternalKey() { }  // Intentionally left uninitialized (for speed)
+  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+      : user_key(u), sequence(seq), type(t) { }
+  std::string DebugString() const;
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  return key.user_key.size() + 8;
+}
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+                              const ParsedInternalKey& key);
+
+// Attempt to parse an internal key from "internal_key".  On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  return Slice(internal_key.data(), internal_key.size() - 8);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  const size_t n = internal_key.size();
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator : public Comparator {
+ private:
+  const Comparator* user_comparator_;
+ public:
+  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { }
+  virtual const char* Name() const;
+  virtual int Compare(const Slice& a, const Slice& b) const;
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const;
+  virtual void FindShortSuccessor(std::string* key) const;
+
+  const Comparator* user_comparator() const { return user_comparator_; }
+
+  int Compare(const InternalKey& a, const InternalKey& b) const;
+};
+
+// Modules in this directory should keep internal keys wrapped inside
+// the following class instead of plain strings so that we do not
+// incorrectly use string comparisons instead of an InternalKeyComparator.
+class InternalKey {
+ private:
+  std::string rep_;
+ public:
+  InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
+  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+  }
+
+  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  Slice user_key() const { return ExtractUserKey(rep_); }
+
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  void Clear() { rep_.clear(); }
+};
+
+inline int InternalKeyComparator::Compare(
+    const InternalKey& a, const InternalKey& b) const {
+  return Compare(a.Encode(), b.Encode());
+}
+
+// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte
+// uncompressed size, and a 1 byte CompressionType code.  An
+// encoded form of it is embedded in the filenames of large value
+// files stored in the database, and the raw binary form is stored as
+// the iter->value() result for values of type kTypeLargeValueRef in
+// the table and log files that make up the database.
+struct LargeValueRef {
+  char data[29];
+
+  // Initialize a large value ref for the given data
+  static LargeValueRef Make(const Slice& data,
+                            CompressionType compression_type);
+
+  // Initialize a large value ref from a serialized, 29-byte reference value
+  static LargeValueRef FromRef(const Slice& ref) {
+    LargeValueRef result;
+    assert(ref.size() == sizeof(result.data));
+    memcpy(result.data, ref.data(), sizeof(result.data));
+    return result;
+  }
+
+  // Return the number of bytes in a LargeValueRef (not the
+  // number of bytes in the value referenced).
+  static size_t ByteSize() { return sizeof(LargeValueRef().data); }
+
+  // Return the number of bytes in the value referenced by "*this".
+  uint64_t ValueSize() const { return DecodeFixed64(&data[20]); }
+
+  CompressionType compression_type() const {
+    return static_cast<CompressionType>(data[28]);
+  }
+
+  bool operator==(const LargeValueRef& b) const {
+    return memcmp(data, b.data, sizeof(data)) == 0;
+  }
+  bool operator<(const LargeValueRef& b) const {
+    return memcmp(data, b.data, sizeof(data)) < 0;
+  }
+};
+
+// Convert the large value ref to a human-readable string suitable
+// for embedding in a large value filename.
+extern std::string LargeValueRefToFilenameString(const LargeValueRef& h);
+
+// Parse the large value filename string in "in" and store it in
+// "*ref".  If successful, returns true.  Otherwise returns false.
+extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref);
+
+inline bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result) {
+  const size_t n = internal_key.size();
+  if (n < 8) return false;
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  result->sequence = num >> 8;
+  result->type = static_cast<ValueType>(c);
+  result->user_key = Slice(internal_key.data(), n - 8);
+  return (c <= static_cast<unsigned char>(kTypeLargeValueRef));
+}
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_FORMAT_H_
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
new file mode 100644
index 0000000..5dfa101
--- /dev/null
+++ b/db/dbformat_test.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static std::string IKey(const std::string& user_key,
+                        uint64_t seq,
+                        ValueType vt) {
+  std::string encoded;
+  AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+  return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
+  return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
+  return result;
+}
+
+static void TestKey(const std::string& key,
+                    uint64_t seq,
+                    ValueType vt) {
+  std::string encoded = IKey(key, seq, vt);
+
+  Slice in(encoded);
+  ParsedInternalKey decoded("", 0, kTypeValue);
+
+  ASSERT_TRUE(ParseInternalKey(in, &decoded));
+  ASSERT_EQ(key, decoded.user_key.ToString());
+  ASSERT_EQ(seq, decoded.sequence);
+  ASSERT_EQ(vt, decoded.type);
+
+  ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
+}
+
+class FormatTest { };
+
+TEST(FormatTest, InternalKey_EncodeDecode) {
+  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
+  const uint64_t seq[] = {
+    1, 2, 3,
+    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
+    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
+    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
+  };
+  for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+    for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+      TestKey(keys[k], seq[s], kTypeValue);
+      TestKey("hello", 1, kTypeDeletion);
+    }
+  }
+}
+
+TEST(FormatTest, InternalKeyShortSeparator) {
+  // When user keys are same
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 99, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 101, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeDeletion)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeLargeValueRef)));
+
+  // When user keys are misordered
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("bar", 99, kTypeValue)));
+
+  // When user keys are different, but correctly ordered
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("hello", 200, kTypeValue)));
+
+  // When start user key is prefix of limit user key
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foobar", 200, kTypeValue)));
+
+  // When limit user key is prefix of start user key
+  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
+            Shorten(IKey("foobar", 100, kTypeValue),
+                    IKey("foo", 200, kTypeValue)));
+}
+
+TEST(FormatTest, InternalKeyShortestSuccessor) {
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            ShortSuccessor(IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+TEST(FormatTest, SHA1) {
+  // Check that we are computing the same value as sha1.
+  // Note that the last two numbers are the length of the input and the
+  // compression type.
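+  // (So the expected strings below have the form
+  // "<40 hex chars of SHA-1>-<input length>-<compression type>":
+  // "hello" is 5 bytes, and kNoCompression / kLightweightCompression
+  // encode as 0 and 1 respectively.)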
+ ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0", // SHA1, uncompr + LargeValueRefToFilenameString( + LargeValueRef::Make("hello", kNoCompression))); + ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, lwcompr + LargeValueRefToFilenameString( + LargeValueRef::Make("hello", kLightweightCompression))); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/filename.cc b/db/filename.cc new file mode 100644 index 0000000..55e6d28 --- /dev/null +++ b/db/filename.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include "db/filename.h" +#include "db/dbformat.h" +#include "include/env.h" +#include "util/logging.h" + +namespace leveldb { + +static std::string MakeFileName(const std::string& name, uint64_t number, + const char* suffix) { + char buf[100]; + snprintf(buf, sizeof(buf), "/%06llu.%s", + static_cast(number), + suffix); + return name + buf; +} + +std::string LogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "log"); +} + +std::string TableFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "sst"); +} + +std::string LargeValueFileName(const std::string& name, + const LargeValueRef& large_ref) { + std::string result = name + "/"; + result += LargeValueRefToFilenameString(large_ref); + result += ".val"; + return result; +} + +std::string DescriptorFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + char buf[100]; + snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", + static_cast(number)); + return dbname + buf; +} + +std::string CurrentFileName(const std::string& dbname) { + return dbname + "/CURRENT"; +} + +std::string LockFileName(const std::string& dbname) { + return dbname + "/LOCK"; +} + +std::string TempFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + return MakeFileName(dbname, number, "dbtmp"); +} + +std::string InfoLogFileName(const std::string& dbname) { + return dbname + "/LOG"; +} + +// Return the name of the old info log file for "dbname". 
+std::string OldInfoLogFileName(const std::string& dbname) { + return dbname + "/LOG.old"; +} + + +// Owned filenames have the form: +// dbname/CURRENT +// dbname/LOCK +// dbname/LOG +// dbname/LOG.old +// dbname/MANIFEST-[0-9]+ +// dbname/[0-9a-f]{20}-[0-9]+-[0-9]+.val +// dbname/[0-9]+.(log|sst) +bool ParseFileName(const std::string& fname, + uint64_t* number, + LargeValueRef* large_ref, + FileType* type) { + Slice rest(fname); + if (rest == "CURRENT") { + *number = 0; + *type = kCurrentFile; + } else if (rest == "LOCK") { + *number = 0; + *type = kDBLockFile; + } else if (rest == "LOG" || rest == "LOG.old") { + *number = 0; + *type = kInfoLogFile; + } else if (rest.size() >= 4 && + Slice(rest.data() + rest.size() - 4, 4) == ".val") { + LargeValueRef h; + if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4), + &h)) { + return false; + } + *large_ref = h; + *type = kLargeValueFile; + } else if (rest.starts_with("MANIFEST-")) { + rest.remove_prefix(strlen("MANIFEST-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kDescriptorFile; + *number = num; + } else { + // Avoid strtoull() to keep filename format independent of the + // current locale + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + Slice suffix = rest; + if (suffix == Slice(".log")) { + *type = kLogFile; + } else if (suffix == Slice(".sst")) { + *type = kTableFile; + } else if (suffix == Slice(".dbtmp")) { + *type = kTempFile; + } else { + return false; + } + *number = num; + } + return true; +} + +Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number) { + // Remove leading "dbname/" and add newline to manifest file name + std::string manifest = DescriptorFileName(dbname, descriptor_number); + Slice contents = manifest; + assert(contents.starts_with(dbname + "/")); + contents.remove_prefix(dbname.size() + 1); + std::string tmp = TempFileName(dbname, descriptor_number); + Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp); + if (s.ok()) { + s = env->RenameFile(tmp, CurrentFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +} diff --git a/db/filename.h b/db/filename.h new file mode 100644 index 0000000..3fd2ea4 --- /dev/null +++ b/db/filename.h @@ -0,0 +1,92 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// File names used by DB code + +#ifndef STORAGE_LEVELDB_DB_FILENAME_H_ +#define STORAGE_LEVELDB_DB_FILENAME_H_ + +#include +#include +#include "include/slice.h" +#include "include/status.h" +#include "port/port.h" + +namespace leveldb { + +class Env; +struct LargeValueRef; + +enum FileType { + kLogFile, + kDBLockFile, + kTableFile, + kLargeValueFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one +}; + +// Return the name of the log file with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string LogFileName(const std::string& dbname, uint64_t number); + +// Return the name of the sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". 
+extern std::string TableFileName(const std::string& dbname, uint64_t number); + +// Return the name of the large value file with the specified large +// value reference in the db named by "dbname". The result will be +// prefixed with "dbname". +extern std::string LargeValueFileName(const std::string& dbname, + const LargeValueRef& large_ref); + +// Return the name of the descriptor file for the db named by +// "dbname" and the specified incarnation number. The result will be +// prefixed with "dbname". +extern std::string DescriptorFileName(const std::string& dbname, + uint64_t number); + +// Return the name of the current file. This file contains the name +// of the current manifest file. The result will be prefixed with +// "dbname". +extern std::string CurrentFileName(const std::string& dbname); + +// Return the name of the lock file for the db named by +// "dbname". The result will be prefixed with "dbname". +extern std::string LockFileName(const std::string& dbname); + +// Return the name of a temporary file owned by the db named "dbname". +// The result will be prefixed with "dbname". +extern std::string TempFileName(const std::string& dbname, uint64_t number); + +// Return the name of the info log file for "dbname". +extern std::string InfoLogFileName(const std::string& dbname); + +// Return the name of the old info log file for "dbname". +extern std::string OldInfoLogFileName(const std::string& dbname); + +// If filename is a leveldb file, store the type of the file in *type. +// If *type is kLargeValueFile, then the large value reference data +// from the filename is stored in "*large_ref. For all other types of +// files, the number encoded in the filename is stored in *number. If +// the filename was successfully parsed, returns true. Else return +// false. +extern bool ParseFileName(const std::string& filename, + uint64_t* number, + LargeValueRef* large_ref, + FileType* type); + +// Make the CURRENT file point to the descriptor file with the +// specified number. +extern Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number); + + +} + +#endif // STORAGE_LEVELDB_DB_FILENAME_H_ diff --git a/db/filename_test.cc b/db/filename_test.cc new file mode 100644 index 0000000..08a54eb --- /dev/null +++ b/db/filename_test.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/filename.h" + +#include "db/dbformat.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +class FileNameTest { }; + +TEST(FileNameTest, Parse) { + Slice db; + FileType type; + uint64_t number; + LargeValueRef large_ref; + + // Successful parses + static struct { + const char* fname; + uint64_t number; + const char* large_ref; + FileType type; + } cases[] = { + { "100.log", 100, "", kLogFile }, + { "0.log", 0, "", kLogFile }, + { "0.sst", 0, "", kTableFile }, + { "CURRENT", 0, "", kCurrentFile }, + { "LOCK", 0, "", kDBLockFile }, + { "MANIFEST-2", 2, "", kDescriptorFile }, + { "MANIFEST-7", 7, "", kDescriptorFile }, + { "LOG", 0, "", kInfoLogFile }, + { "LOG.old", 0, "", kInfoLogFile }, + { "18446744073709551615.log", 18446744073709551615ull, "", + kLogFile }, + { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0, + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile }, + { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0, + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0", + kLargeValueFile }, + }; + for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { + std::string f = cases[i].fname; + ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f; + ASSERT_EQ(cases[i].type, type) << f; + if (type == kLargeValueFile) { + ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref)) + << f; + } else { + ASSERT_EQ(cases[i].number, number) << f; + } + } + + // Errors + static const char* errors[] = { + "", + "foo", + "foo-dx-100.log", + ".log", + "", + "manifest", + "CURREN", + "CURRENTX", + "MANIFES", + "MANIFEST", + "MANIFEST-", + "XMANIFEST-3", + "MANIFEST-3x", + "LOC", + "LOCKx", + "LO", + "LOGx", + "18446744073709551616.log", + "184467440737095516150.log", + "100", + "100.", + "100.lop", + "100.val", + ".val", + "123456789012345678901234567890123456789-12340.val", + "1234567890123456789012345678901234567-123-0.val", + "12345678901234567890123456789012345678902-100-1-.val", + // Overflow on value size + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val", + // '03.val' is a bad compression type + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" }; + for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { + std::string f = errors[i]; + ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f; + }; +} + +TEST(FileNameTest, Construction) { + uint64_t number; + FileType type; + LargeValueRef large_ref; + std::string fname; + + fname = CurrentFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(0, number); + ASSERT_EQ(kCurrentFile, type); + + fname = LockFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(0, number); + ASSERT_EQ(kDBLockFile, type); + + fname = LogFileName("foo", 192); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(192, number); + ASSERT_EQ(kLogFile, type); + + fname = TableFileName("bar", 200); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(200, number); + ASSERT_EQ(kTableFile, type); + + fname = DescriptorFileName("bar", 100); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + 
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(100, number); + ASSERT_EQ(kDescriptorFile, type); + + fname = TempFileName("tmp", 999); + ASSERT_EQ("tmp/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(999, number); + ASSERT_EQ(kTempFile, type); + + for (int i = 0; i <= kLightweightCompression; i++) { + CompressionType ctype = static_cast(i); + std::string value = "abcdef"; + LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype); + fname = LargeValueFileName("tmp", real_large_ref); + ASSERT_EQ("tmp/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_TRUE(real_large_ref == large_ref); + ASSERT_EQ(kLargeValueFile, type); + ASSERT_EQ(large_ref.compression_type(), ctype); + } +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/log_format.h b/db/log_format.h new file mode 100644 index 0000000..137cd4a --- /dev/null +++ b/db/log_format.h @@ -0,0 +1,35 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Log format information shared by reader and writer. +// See ../doc/log_format.txt for more detail. + +#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ +#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ + +namespace leveldb { +namespace log { + +enum RecordType { + // Zero is reserved for preallocated files + kZeroType = 0, + + kFullType = 1, + + // For fragments + kFirstType = 2, + kMiddleType = 3, + kLastType = 4, +}; +static const int kMaxRecordType = kLastType; + +static const int kBlockSize = 32768; + +// Header is checksum (4 bytes), type (1 byte), length (2 bytes). +static const int kHeaderSize = 4 + 1 + 2; + +} +} + +#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_ diff --git a/db/log_reader.cc b/db/log_reader.cc new file mode 100644 index 0000000..243bd2c --- /dev/null +++ b/db/log_reader.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/log_reader.h" + +#include +#include "include/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace leveldb { +namespace log { + +Reader::Reporter::~Reporter() { +} + +Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum) + : file_(file), + reporter_(reporter), + checksum_(checksum), + backing_store_(new char[kBlockSize]), + buffer_(), + eof_(false) { +} + +Reader::~Reader() { + delete[] backing_store_; +} + +bool Reader::ReadRecord(Slice* record, std::string* scratch) { + scratch->clear(); + record->clear(); + bool in_fragmented_record = false; + + Slice fragment; + while (true) { + switch (ReadPhysicalRecord(&fragment)) { + case kFullType: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "partial record without end"); + } + scratch->clear(); + *record = fragment; + return true; + + case kFirstType: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "partial record without end"); + } + scratch->assign(fragment.data(), fragment.size()); + in_fragmented_record = true; + break; + + case kMiddleType: + if (!in_fragmented_record) { + ReportDrop(fragment.size(), "missing start of fragmented record"); + } else { + scratch->append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + if (!in_fragmented_record) { + ReportDrop(fragment.size(), "missing start of fragmented record"); + } else { + scratch->append(fragment.data(), fragment.size()); + *record = Slice(*scratch); + return true; + } + break; + + case kEof: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "partial record without end"); + scratch->clear(); + } + return false; + + case kBadRecord: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + + default: + ReportDrop( + (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), + "unknown record type"); + in_fragmented_record = false; + scratch->clear(); + break; + } + } + return false; +} + +void Reader::ReportDrop(size_t bytes, const char* reason) { + if (reporter_ != NULL) { + reporter_->Corruption(bytes, Status::Corruption(reason)); + } +} + +unsigned int Reader::ReadPhysicalRecord(Slice* result) { + while (true) { + if (buffer_.size() <= kHeaderSize) { + if (!eof_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + if (!status.ok()) { + if (reporter_ != NULL) { + reporter_->Corruption(kBlockSize, status); + } + buffer_.clear(); + eof_ = true; + return kEof; + } else if (buffer_.size() < kBlockSize) { + eof_ = true; + } + continue; + } else if (buffer_.size() == 0) { + // End of file + return kEof; + } else if (buffer_.size() < kHeaderSize) { + ReportDrop(buffer_.size(), "truncated record at end of file"); + buffer_.clear(); + return kEof; + } else { + // We have a trailing zero-length record. Fall through and check it. 
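+        // (Here buffer_.size() == kHeaderSize exactly, so the remaining
+        // bytes can only hold a header with a zero-length payload; the
+        // length and crc checks below reject anything else.)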
+      }
+    }
+
+    // Parse the header
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    if (kHeaderSize + length > buffer_.size()) {
+      ReportDrop(buffer_.size(), "bad record length");
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Check crc
+    if (checksum_) {
+      if (type == kZeroType && length == 0) {
+        // Skip zero length record
+        buffer_.remove_prefix(kHeaderSize + length);
+        return kBadRecord;
+      }
+
+      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
+      if (actual_crc != expected_crc) {
+        ReportDrop(length, "checksum mismatch");
+        buffer_.remove_prefix(kHeaderSize + length);
+        return kBadRecord;
+      }
+    }
+
+    buffer_.remove_prefix(kHeaderSize + length);
+    *result = Slice(header + kHeaderSize, length);
+    return type;
+  }
+}
+
+}
+}
diff --git a/db/log_reader.h b/db/log_reader.h
new file mode 100644
index 0000000..515d2af
--- /dev/null
+++ b/db/log_reader.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
+#define STORAGE_LEVELDB_DB_LOG_READER_H_
+
+#include "db/log_format.h"
+#include "include/slice.h"
+#include "include/status.h"
+
+namespace leveldb {
+
+class SequentialFile;
+
+namespace log {
+
+class Reader {
+ public:
+  // Interface for reporting errors.
+  class Reporter {
+   public:
+    virtual ~Reporter();
+
+    // Some corruption was detected.  "size" is the approximate number
+    // of bytes dropped due to the corruption.
+    virtual void Corruption(size_t bytes, const Status& status) = 0;
+  };
+
+  // Create a reader that will return log records from "*file".
+  // "*file" must remain live while this Reader is in use.
+  //
+  // If "reporter" is non-NULL, it is notified whenever some data is
+  // dropped due to a detected corruption.  "*reporter" must remain
+  // live while this Reader is in use.
+  //
+  // If "checksum" is true, verify checksums if available.
+  Reader(SequentialFile* file, Reporter* reporter, bool checksum);
+
+  ~Reader();
+
+  // Read the next record into *record.  Returns true if read
+  // successfully, false if we hit end of the input.  May use
+  // "*scratch" as temporary storage.  The contents filled in *record
+  // will only be valid until the next mutating operation on this
+  // reader or the next mutation to *scratch.
+  bool ReadRecord(Slice* record, std::string* scratch);
+
+ private:
+  SequentialFile* const file_;
+  Reporter* const reporter_;
+  bool const checksum_;
+  char* const backing_store_;
+  Slice buffer_;
+  bool eof_;   // Last Read() indicated EOF by returning < kBlockSize
+
+  // Extend record types with the following special values
+  enum {
+    kEof = kMaxRecordType + 1,
+    kBadRecord = kMaxRecordType + 2
+  };
+
+  // Return type, or one of the preceding special values
+  unsigned int ReadPhysicalRecord(Slice* result);
+  void ReportDrop(size_t bytes, const char* reason);
+
+  // No copying allowed
+  Reader(const Reader&);
+  void operator=(const Reader&);
+};
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_DB_LOG_READER_H_
diff --git a/db/log_test.cc b/db/log_test.cc
new file mode 100644
index 0000000..8c1915d
--- /dev/null
+++ b/db/log_test.cc
@@ -0,0 +1,361 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "include/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+  std::string result;
+  while (result.size() < n) {
+    result.append(partial_string);
+  }
+  result.resize(n);
+  return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%d.", n);
+  return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+  return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+class LogTest {
+ private:
+  class StringDest : public WritableFile {
+   public:
+    std::string contents_;
+
+    virtual Status Close() { return Status::OK(); }
+    virtual Status Flush() { return Status::OK(); }
+    virtual Status Sync() { return Status::OK(); }
+    virtual Status Append(const Slice& slice) {
+      contents_.append(slice.data(), slice.size());
+      return Status::OK();
+    }
+  };
+
+  class StringSource : public SequentialFile {
+   public:
+    Slice contents_;
+    bool force_error_;
+    bool returned_partial_;
+    StringSource() : force_error_(false), returned_partial_(false) { }
+
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+      ASSERT_EQ(kBlockSize, n);
+
+      if (force_error_) {
+        force_error_ = false;
+        returned_partial_ = true;
+        return Status::Corruption("read error");
+      }
+
+      if (contents_.size() < n) {
+        n = contents_.size();
+        returned_partial_ = true;
+      }
+      *result = Slice(contents_.data(), n);
+      contents_.remove_prefix(n);
+      return Status::OK();
+    }
+  };
+
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) { }
+    virtual void Corruption(size_t bytes, const Status& status) {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  StringDest dest_;
+  StringSource source_;
+  ReportCollector report_;
+  bool reading_;
+  Writer writer_;
+  Reader reader_;
+
+ public:
+  LogTest() : reading_(false),
+              writer_(&dest_),
+              reader_(&source_, &report_, true/*checksum*/) {
+  }
+
+  void Write(const std::string& msg) {
+    ASSERT_TRUE(!reading_) << "Write() after starting to read";
+    writer_.AddRecord(Slice(msg));
+  }
+
+  size_t WrittenBytes() const {
+    return dest_.contents_.size();
+  }
+
+  std::string Read() {
+    if (!reading_) {
+      reading_ = true;
+      source_.contents_ = Slice(dest_.contents_);
+    }
+    std::string scratch;
+    Slice record;
+    if (reader_.ReadRecord(&record, &scratch)) {
+      return record.ToString();
+    } else {
+      return "EOF";
+    }
+  }
+
+  void IncrementByte(int offset, int delta) {
+    dest_.contents_[offset] += delta;
+  }
+
+  void SetByte(int offset, char new_byte) {
+    dest_.contents_[offset] = new_byte;
+  }
+
+  void ShrinkSize(int bytes) {
+    dest_.contents_.resize(dest_.contents_.size() - bytes);
+  }
+
+  void FixChecksum(int header_offset, int len) {
+    // Compute crc of type/len/data
+    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len);
+    crc = crc32c::Mask(crc);
+    EncodeFixed32(&dest_.contents_[header_offset], crc);
+  }
+
+  void ForceError() {
+    source_.force_error_ = true;
+  }
+
+  size_t DroppedBytes() const {
+    return report_.dropped_bytes_;
+  }
+
+  // Returns OK iff recorded error message contains "msg"
+  std::string MatchError(const std::string& msg) const {
+    if (report_.message_.find(msg) == std::string::npos) {
+      return report_.message_;
+    } else {
+      return "OK";
+    }
+  }
+};
+
+TEST(LogTest, Empty) {
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ReadWrite) {
+  Write("foo");
+  Write("bar");
+  Write("");
+  Write("xxxx");
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("xxxx", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
+}
+
+TEST(LogTest, ManyBlocks) {
+  for (int i = 0; i < 100000; i++) {
+    Write(NumberString(i));
+  }
+  for (int i = 0; i < 100000; i++) {
+    ASSERT_EQ(NumberString(i), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, Fragmentation) {
+  Write("small");
+  Write(BigString("medium", 50000));
+  Write(BigString("large", 100000));
+  ASSERT_EQ("small", Read());
+  ASSERT_EQ(BigString("medium", 50000), Read());
+  ASSERT_EQ(BigString("large", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ShortTrailer) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, AlignedEof) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, RandomRead) {
+  const int N = 500;
+  Random write_rnd(301);
+  for (int i = 0; i < N; i++) {
+    Write(RandomSkewedString(i, &write_rnd));
+  }
+  Random read_rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST(LogTest, ReadError) {
+  Write("foo");
+  ForceError();
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(kBlockSize, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST(LogTest, BadRecordType) {
+  Write("foo");
+  // Type is stored in header[6]
+  IncrementByte(6, 100);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST(LogTest, TruncatedTrailingRecord) {
+  Write("foo");
+  ShrinkSize(4);   // Drop all payload as well as a header byte
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("truncated record at end of file"));
+}
+
+TEST(LogTest, BadLength) {
+  Write("foo");
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+TEST(LogTest, ChecksumMismatch) {
+  Write("foo");
+  IncrementByte(0, 10);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("checksum mismatch"));
+}
+
+TEST(LogTest, UnexpectedMiddleType) {
+  Write("foo");
+  SetByte(6, kMiddleType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedLastType) {
+  Write("foo");
+  SetByte(6, kLastType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedFullType) {
+  Write("foo");
+  Write("bar");
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, UnexpectedFirstType) {
+  Write("foo");
+  Write(BigString("bar", 100000));
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ(BigString("bar", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, ErrorJoinsRecords) {
+  // Consider two fragmented records:
+  //    first(R1) last(R1) first(R2) last(R2)
+  // where the middle two fragments disappear.  We do not want
+  // first(R1),last(R2) to get joined and returned as a valid record.
+
+  // Write records that span two blocks
+  Write(BigString("foo", kBlockSize));
+  Write(BigString("bar", kBlockSize));
+  Write("correct");
+
+  // Wipe the middle block
+  for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+    SetByte(offset, 'x');
+  }
+
+  ASSERT_EQ("correct", Read());
+  ASSERT_EQ("EOF", Read());
+  const int dropped = DroppedBytes();
+  ASSERT_LE(dropped, 2*kBlockSize + 100);
+  ASSERT_GE(dropped, 2*kBlockSize);
+}
+
+}
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/db/log_writer.cc b/db/log_writer.cc
new file mode 100644
index 0000000..465eca2
--- /dev/null
+++ b/db/log_writer.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "include/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+Writer::Writer(WritableFile* dest)
+    : dest_(dest),
+      block_offset_(0) {
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
+}
+
+Writer::~Writer() {
+}
+
+Status Writer::AddRecord(const Slice& slice) {
+  const char* ptr = slice.data();
+  size_t left = slice.size();
+
+  // Fragment the record if necessary and emit it.  Note that if slice
+  // is empty, we still want to iterate once to emit a single
+  // zero-length record
+  Status s;
+  do {
+    const int leftover = kBlockSize - block_offset_;
+    assert(leftover >= 0);
+    if (leftover <= kHeaderSize) {
+      // Switch to a new block
+      if (leftover > 0) {
+        // Fill the trailer
+        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00", leftover));
+      }
+      block_offset_ = 0;
+    }
+
+    // Invariant: we never leave <= kHeaderSize bytes in a block.
+    const int avail = kBlockSize - block_offset_ - kHeaderSize;
+    assert(avail > 0);
+
+    const size_t fragment_length = (left < avail) ? left : avail;
+
+    RecordType type;
+    const bool begin = (ptr == slice.data());
+    const bool end = (left == fragment_length);
+    if (begin && end) {
+      type = kFullType;
+    } else if (begin) {
+      type = kFirstType;
+    } else if (end) {
+      type = kLastType;
+    } else {
+      type = kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length);
+    ptr += fragment_length;
+    left -= fragment_length;
+  } while (s.ok() && left > 0);
+  return s;
+}
+
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+  assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+
+  // Format the header
+  char buf[kHeaderSize];
+  buf[4] = static_cast<char>(n & 0xff);
+  buf[5] = static_cast<char>(n >> 8);
+  buf[6] = static_cast<char>(t);
+
+  // Compute the crc of the record type and the payload.
+  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
+  crc = crc32c::Mask(crc);                 // Adjust for storage
+  EncodeFixed32(buf, crc);
+
+  // Write the header and the payload
+  Status s = dest_->Append(Slice(buf, kHeaderSize));
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n));
+    if (s.ok()) {
+      s = dest_->Flush();
+    }
+  }
+  block_offset_ += kHeaderSize + n;
+  return s;
+}
+
+}
+}
diff --git a/db/log_writer.h b/db/log_writer.h
new file mode 100644
index 0000000..13c64ba
--- /dev/null
+++ b/db/log_writer.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
+#define STORAGE_LEVELDB_DB_LOG_WRITER_H_
+
+#include <stdint.h>
+#include "db/log_format.h"
+#include "include/slice.h"
+#include "include/status.h"
+
+namespace leveldb {
+
+class WritableFile;
+
+namespace log {
+
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(WritableFile* dest);
+  ~Writer();
+
+  Status AddRecord(const Slice& slice);
+
+ private:
+  WritableFile* dest_;
+  int block_offset_;       // Current offset in block
+
+  // crc32c values for all supported record types.  These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+  // No copying allowed
+  Writer(const Writer&);
+  void operator=(const Writer&);
+};
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_DB_LOG_WRITER_H_
diff --git a/db/memtable.cc b/db/memtable.cc
new file mode 100644
index 0000000..349cfcc
--- /dev/null
+++ b/db/memtable.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+#include "db/dbformat.h"
+#include "include/comparator.h"
+#include "include/env.h"
+#include "include/iterator.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+static Slice GetLengthPrefixedSlice(const char* data) {
+  uint32_t len;
+  const char* p = data;
+  p = GetVarint32Ptr(p, p + 5, &len);  // +5: we assume "p" is not corrupted
+  return Slice(p, len);
+}
+
+MemTable::MemTable(const InternalKeyComparator& cmp)
+    : comparator_(cmp),
+      table_(comparator_, &arena_) {
+}
+
+MemTable::~MemTable() {
+}
+
+size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
+
+int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
+    const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice a = GetLengthPrefixedSlice(aptr);
+  Slice b = GetLengthPrefixedSlice(bptr);
+  return comparator.Compare(a, b);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+static const char* EncodeKey(std::string* scratch, const Slice& target) {
+  scratch->clear();
+  PutVarint32(scratch, target.size());
+  scratch->append(target.data(), target.size());
+  return scratch->data();
+}
+
+class MemTableIterator: public Iterator {
+ public:
+  explicit MemTableIterator(MemTable::Table* table) {
+    iter_ = new MemTable::Table::Iterator(table);
+  }
+  virtual ~MemTableIterator() { delete iter_; }
+
+  virtual bool Valid() const { return iter_->Valid(); }
+  virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
+  virtual void SeekToFirst() { iter_->SeekToFirst(); }
+  virtual void SeekToLast() { iter_->SeekToLast(); }
+  virtual void Next() { iter_->Next(); }
+  virtual void Prev() { iter_->Prev(); }
+  virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); }
+  virtual Slice value() const {
+    Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+  }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  MemTable::Table::Iterator* iter_;
+  std::string tmp_;       // For passing to EncodeKey
+
+  // No copying allowed
+  MemTableIterator(const MemTableIterator&);
+  void operator=(const MemTableIterator&);
+};
+
+Iterator* MemTable::NewIterator() {
+  return new MemTableIterator(&table_);
+}
+
+void MemTable::Add(SequenceNumber s, ValueType type,
+                   const Slice& key,
+                   const Slice& value) {
+  // Format of an entry is concatenation of:
+  //  key_size     : varint32 of internal_key.size()
+  //  key bytes    : char[internal_key.size()]
+  //  value_size   : varint32 of value.size()
+  //  value bytes  : char[value.size()]
+  size_t key_size = key.size();
+  size_t val_size = value.size();
+  size_t internal_key_size = key_size + 8;
+  const size_t encoded_len =
+      VarintLength(internal_key_size) + internal_key_size +
+      VarintLength(val_size) + val_size;
+  char* buf = arena_.Allocate(encoded_len);
+  char* p = EncodeVarint32(buf, internal_key_size);
+  memcpy(p, key.data(), key_size);
+  p += key_size;
+  EncodeFixed64(p, (s << 8) | type);
+  p += 8;
+  p = EncodeVarint32(p, val_size);
+  memcpy(p, value.data(), val_size);
+  assert((p + val_size) - buf == encoded_len);
+  table_.Insert(buf);
+}
+
+}
diff --git a/db/memtable.h b/db/memtable.h
new file mode 100644
index 0000000..fa95e15
--- /dev/null
+++ b/db/memtable.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
+#define STORAGE_LEVELDB_DB_MEMTABLE_H_
+
+#include <string>
+#include "include/db.h"
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "util/arena.h"
+
+namespace leveldb {
+
+class InternalKeyComparator;
+class Mutex;
+class MemTableIterator;
+
+class MemTable {
+ public:
+  explicit MemTable(const InternalKeyComparator& comparator);
+  ~MemTable();
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  size_t ApproximateMemoryUsage();
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.  The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/format.{h,cc} module.
+  Iterator* NewIterator();
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  void Add(SequenceNumber seq, ValueType type,
+           const Slice& key,
+           const Slice& value);
+
+ private:
+  struct KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+    int operator()(const char* a, const char* b) const;
+  };
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+
+  typedef SkipList<const char*, KeyComparator> Table;
+
+  KeyComparator comparator_;
+  Arena arena_;
+  Table table_;
+
+  // No copying allowed
+  MemTable(const MemTable&);
+  void operator=(const MemTable&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_MEMTABLE_H_
diff --git a/db/repair.cc b/db/repair.cc
new file mode 100644
index 0000000..0727914
--- /dev/null
+++ b/db/repair.cc
@@ -0,0 +1,396 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+//     (a) smallest/largest for the table
+//     (b) large value refs from the table
+//     (c) largest sequence number in the table
+// (3) We generate descriptor contents:
+//      - log number is set to zero
+//      - next-file-number is set to 1 + largest file number we found
+//      - last-sequence-number is set to largest sequence# found across
+//        all tables (see 2c)
+//      - compaction pointers are cleared
+//      - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use to pick appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps earlier table, place in level-0,
+//       else place in level-M.
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#,
+//   large-value-refs, ...) in the table's meta section to speed up
+//   ScanTable.
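+//
+// Worked example (illustrative): if the directory holds 000001.sst,
+// 000002.sst and 000003.log, FindFiles() sets next-file-number to 4,
+// the log is converted to table 000004 (consuming file number 4), every
+// table is scanned and added at level 0, and the rewritten descriptor
+// records the resulting next-file-number and the largest sequence
+// number observed in any table.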
+ +#include "db/builder.h" +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "db/write_batch_internal.h" +#include "include/comparator.h" +#include "include/db.h" +#include "include/env.h" + +namespace leveldb { + +namespace { + +class Repairer { + public: + Repairer(const std::string& dbname, const Options& options) + : dbname_(dbname), + env_(options.env), + icmp_(options.comparator), + options_(SanitizeOptions(dbname, &icmp_, options)), + owns_info_log_(options_.info_log != options.info_log), + next_file_number_(1) { + // TableCache can be small since we expect each table to be opened once. + table_cache_ = new TableCache(dbname_, &options_, 10); + } + + ~Repairer() { + delete table_cache_; + if (owns_info_log_) { + delete options_.info_log; + } + } + + Status Run() { + Status status = FindFiles(); + if (status.ok()) { + ConvertLogFilesToTables(); + ExtractMetaData(); + status = WriteDescriptor(); + } + if (status.ok()) { + unsigned long long bytes = 0; + for (int i = 0; i < tables_.size(); i++) { + bytes += tables_[i].meta.file_size; + } + Log(env_, options_.info_log, + "**** Repaired leveldb %s; " + "recovered %d files; %llu bytes. " + "Some data may have been lost. " + "****", + dbname_.c_str(), + static_cast(tables_.size()), + bytes); + } + return status; + } + + private: + struct TableInfo { + FileMetaData meta; + SequenceNumber max_sequence; + }; + + std::string const dbname_; + Env* const env_; + InternalKeyComparator const icmp_; + Options const options_; + bool owns_info_log_; + TableCache* table_cache_; + VersionEdit edit_; + + std::vector manifests_; + std::vector table_numbers_; + std::vector logs_; + std::vector tables_; + uint64_t next_file_number_; + + Status FindFiles() { + std::vector filenames; + Status status = env_->GetChildren(dbname_, &filenames); + if (!status.ok()) { + return status; + } + if (filenames.empty()) { + return Status::IOError(dbname_, "repair found no files"); + } + + uint64_t number; + LargeValueRef large_ref; + FileType type; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type)) { + if (type == kLargeValueFile) { + // Will be picked up when we process a Table that points to it + } else if (type == kDescriptorFile) { + manifests_.push_back(filenames[i]); + } else { + if (number + 1 > next_file_number_) { + next_file_number_ = number + 1; + } + if (type == kLogFile) { + logs_.push_back(number); + } else if (type == kTableFile) { + table_numbers_.push_back(number); + } else { + // Ignore other files + } + } + } + } + return status; + } + + void ConvertLogFilesToTables() { + for (int i = 0; i < logs_.size(); i++) { + std::string logname = LogFileName(dbname_, logs_[i]); + Status status = ConvertLogToTable(logs_[i]); + if (!status.ok()) { + Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s", + (unsigned long long) logs_[i], + status.ToString().c_str()); + } + ArchiveFile(logname); + } + } + + Status ConvertLogToTable(uint64_t log) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + WritableFile* info_log; + uint64_t lognum; + virtual void Corruption(size_t bytes, const Status& s) { + // We print error messages for corruption, but continue repairing. 
+ Log(env, info_log, "Log #%llu: dropping %d bytes; %s", + (unsigned long long) lognum, + static_cast(bytes), + s.ToString().c_str()); + } + }; + + // Open the log file + std::string logname = LogFileName(dbname_, log); + SequentialFile* lfile; + Status status = env_->NewSequentialFile(logname, &lfile); + if (!status.ok()) { + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log; + reporter.lognum = log; + // We intentially make log::Reader do checksumming so that + // corruptions cause entire commits to be skipped instead of + // propagating bad information (like overly large sequence + // numbers). + log::Reader reader(lfile, &reporter, false/*do not checksum*/); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + MemTable mem(icmp_); + int counter = 0; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + status = WriteBatchInternal::InsertInto(&batch, &mem); + if (status.ok()) { + counter += WriteBatchInternal::Count(&batch); + } else { + Log(env_, options_.info_log, "Log #%llu: ignoring %s", + (unsigned long long) log, + status.ToString().c_str()); + status = Status::OK(); // Keep going with rest of file + } + } + delete lfile; + + // We ignore any version edits generated by the conversion to a Table + // since ExtractMetaData() will also generate edits. + VersionEdit skipped; + FileMetaData meta; + meta.number = next_file_number_++; + Iterator* iter = mem.NewIterator(); + status = BuildTable(dbname_, env_, options_, table_cache_, iter, + &meta, &skipped); + delete iter; + if (status.ok()) { + if (meta.file_size > 0) { + table_numbers_.push_back(meta.number); + } + } + Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", + (unsigned long long) log, + counter, + (unsigned long long) meta.number, + status.ToString().c_str()); + return status; + } + + void ExtractMetaData() { + std::vector kept; + for (int i = 0; i < table_numbers_.size(); i++) { + TableInfo t; + t.meta.number = table_numbers_[i]; + Status status = ScanTable(&t); + if (!status.ok()) { + std::string fname = TableFileName(dbname_, table_numbers_[i]); + Log(env_, options_.info_log, "Table #%llu: ignoring %s", + (unsigned long long) table_numbers_[i], + status.ToString().c_str()); + ArchiveFile(fname); + } else { + tables_.push_back(t); + } + } + } + + Status ScanTable(TableInfo* t) { + std::string fname = TableFileName(dbname_, t->meta.number); + int counter = 0; + Status status = env_->GetFileSize(fname, &t->meta.file_size); + if (status.ok()) { + Iterator* iter = table_cache_->NewIterator( + ReadOptions(), t->meta.number); + bool empty = true; + ParsedInternalKey parsed; + t->max_sequence = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (!ParseInternalKey(key, &parsed)) { + Log(env_, options_.info_log, "Table #%llu: unparsable key %s", + (unsigned long long) t->meta.number, + EscapeString(key).c_str()); + continue; + } + + counter++; + if (empty) { + empty = false; + t->meta.smallest.DecodeFrom(key); + } + t->meta.largest.DecodeFrom(key); + if (parsed.sequence > t->max_sequence) { + t->max_sequence = parsed.sequence; + } + + if (ExtractValueType(key) == kTypeLargeValueRef) { + if (iter->value().size() != LargeValueRef::ByteSize()) { + 
Log(env_, options_.info_log, "Table #%llu: bad large value ref", + (unsigned long long) t->meta.number); + } else { + edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()), + t->meta.number, + key); + } + } + } + if (!iter->status().ok()) { + status = iter->status(); + } + delete iter; + } + Log(env_, options_.info_log, "Table #%llu: %d entries %s", + (unsigned long long) t->meta.number, + counter, + status.ToString().c_str()); + return status; + } + + Status WriteDescriptor() { + std::string tmp = TempFileName(dbname_, 1); + WritableFile* file; + Status status = env_->NewWritableFile(tmp, &file); + if (!status.ok()) { + return status; + } + + SequenceNumber max_sequence = 0; + for (int i = 0; i < tables_.size(); i++) { + if (max_sequence < tables_[i].max_sequence) { + max_sequence = tables_[i].max_sequence; + } + } + + edit_.SetComparatorName(icmp_.user_comparator()->Name()); + edit_.SetLogNumber(0); + edit_.SetNextFile(next_file_number_); + edit_.SetLastSequence(max_sequence); + + for (int i = 0; i < tables_.size(); i++) { + // TODO(opt): separate out into multiple levels + const TableInfo& t = tables_[i]; + edit_.AddFile(0, t.meta.number, t.meta.file_size, + t.meta.smallest, t.meta.largest); + } + + //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); + { + log::Writer log(file); + std::string record; + edit_.EncodeTo(&record); + status = log.AddRecord(record); + } + if (status.ok()) { + status = file->Close(); + } + delete file; + file = NULL; + + if (!status.ok()) { + env_->DeleteFile(tmp); + } else { + // Discard older manifests + for (int i = 0; i < manifests_.size(); i++) { + ArchiveFile(dbname_ + "/" + manifests_[i]); + } + + // Install new manifest + status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); + if (status.ok()) { + status = SetCurrentFile(env_, dbname_, 1); + } else { + env_->DeleteFile(tmp); + } + } + return status; + } + + void ArchiveFile(const std::string& fname) { + // Move into another directory. E.g., for + // dir/foo + // rename to + // dir/lost/foo + const char* slash = strrchr(fname.c_str(), '/'); + std::string new_dir; + if (slash != NULL) { + new_dir.assign(fname.data(), slash - fname.data()); + } + new_dir.append("/lost"); + env_->CreateDir(new_dir); // Ignore error + std::string new_file = new_dir; + new_file.append("/"); + new_file.append((slash == NULL) ? fname.c_str() : slash + 1); + Status s = env_->RenameFile(fname, new_file); + Log(env_, options_.info_log, "Archiving %s: %s\n", + fname.c_str(), s.ToString().c_str()); + } +}; +} + +Status RepairDB(const std::string& dbname, const Options& options) { + Repairer repairer(dbname, options); + return repairer.Run(); +} + +} diff --git a/db/skiplist.h b/db/skiplist.h new file mode 100644 index 0000000..be39354 --- /dev/null +++ b/db/skiplist.h @@ -0,0 +1,378 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread safety +// ------------- +// +// Writes require external synchronization, most likely a mutex. +// Reads require a guarantee that the SkipList will not be destroyed +// while the read is in progress. Apart from that, reads progress +// without any internal locking or synchronization. +// +// Invariants: +// +// (1) Allocated nodes are never deleted until the SkipList is +// destroyed. This is trivially guaranteed by the code since we +// never delete any skip list nodes. 
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+
+#include <assert.h>
+#include <stdlib.h>
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+class Arena;
+
+template<typename Key, class Comparator>
+class SkipList {
+ private:
+  struct Node;
+
+ public:
+  // Create a new SkipList object that will use "cmp" for comparing keys,
+  // and will allocate memory using "*arena".  Objects allocated in the arena
+  // must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Arena* arena);
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  void Insert(const Key& key);
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const Key& key) const;
+
+  // Iteration over the contents of a skip list
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(const SkipList* list);
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const Key& key() const;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next();
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev();
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Key& target);
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst();
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast();
+
+   private:
+    const SkipList* list_;
+    Node* node_;
+    // Intentionally copyable
+  };
+
+ private:
+  enum { kMaxHeight = 12 };
+
+  // Immutable after construction
+  Comparator const compare_;
+  Arena* const arena_;    // Arena used for allocations of nodes
+
+  Node* const head_;
+
+  // Modified only by Insert().  Read racily by readers, but stale
+  // values are ok.
+  port::AtomicPointer max_height_;   // Height of the entire list
+
+  inline int GetMaxHeight() const {
+    return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load());
+  }
+
+  // Read/written only by Insert().
+  Random rnd_;
+
+  Node* NewNode(const Key& key, int height);
+  int RandomHeight();
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  // Return true if key is greater than the data stored in "n"
+  bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+  // Return the earliest node that comes at or after key.
+  // Return NULL if there is no such node.
+  //
+  // If prev is non-NULL, fills prev[level] with pointer to previous
+  // node at "level" for every level in [0..max_height_-1].
+  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+  // Return the latest node with a key < key.
+  // Return head_ if there is no such node.
+  Node* FindLessThan(const Key& key) const;
+
+  // Return the last node in the list.
+  // Return head_ if list is empty.
+  Node* FindLast() const;
+
+  // No copying allowed
+  SkipList(const SkipList&);
+  void operator=(const SkipList&);
+};
+
+// Implementation details follow
+template<typename Key, class Comparator>
+struct SkipList<Key,Comparator>::Node {
+  explicit Node(const Key& k) : key(k) { }
+
+  Key const key;
+
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+  }
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_[n].Release_Store(x);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+  }
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    next_[n].NoBarrier_Store(x);
+  }
+
+ private:
+  // Array of length equal to the node height.  next_[0] is lowest level link.
+  port::AtomicPointer next_[1];
+};
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
+  char* mem = arena_->AllocateAligned(
+      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  return new (mem) Node(key);
+}
+
+template<typename Key, class Comparator>
+inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
+  list_ = list;
+  node_ = NULL;
+}
+
+template<typename Key, class Comparator>
+inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
+  return node_ != NULL;
+}
+
+template<typename Key, class Comparator>
+inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->key;
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->key);
+  if (node_ == list_->head_) {
+    node_ = NULL;
+  }
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
+  node_ = list_->FindGreaterOrEqual(target, NULL);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
+  node_ = list_->head_->Next(0);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = NULL;
+  }
+}
+
+template<typename Key, class Comparator>
+int SkipList<Key,Comparator>::RandomHeight() {
+  // Increase height with probability 1 in kBranching
+  static const unsigned int kBranching = 4;
+  int height = 1;
+  while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight);
+  return height;
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // NULL n is considered infinite
+  return (n != NULL) && (compare_(n->key, key) < 0);
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (KeyIsAfterNode(key, next)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      if (prev != NULL) prev[level] = x;
+      if (level == 0) {
+        return next;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == NULL || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == NULL) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
+    : compare_(cmp),
+      arena_(arena),
+      head_(NewNode(0 /* any key will do */, kMaxHeight)),
+      max_height_(reinterpret_cast<void*>(1)),
+      rnd_(0xdeadbeef) {
+  for (int i = 0; i < kMaxHeight; i++) {
+    head_->SetNext(i, NULL);
+  }
+}
+
+template<typename Key, class Comparator>
+void SkipList<Key,Comparator>::Insert(const Key& key) {
+  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+  // here since Insert() is externally synchronized.
+  Node* prev[kMaxHeight];
+  Node* x = FindGreaterOrEqual(key, prev);
+
+  // Our data structure does not allow duplicate insertion
+  assert(x == NULL || !Equal(key, x->key));
+
+  int height = RandomHeight();
+  if (height > GetMaxHeight()) {
+    for (int i = GetMaxHeight(); i < height; i++) {
+      prev[i] = head_;
+    }
+    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+    // It is ok to mutate max_height_ without any synchronization
+    // with concurrent readers.  A concurrent reader that observes
+    // the new value of max_height_ will see either the old value of
+    // new level pointers from head_ (NULL), or a new value set in
+    // the loop below.  In the former case the reader will
+    // immediately drop to the next level since NULL sorts after all
+    // keys.  In the latter case the reader will use the new node.
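    // (Editor's aside, not part of the original patch: with kBranching == 4,
    // RandomHeight() above returns height h with probability
    // (3/4) * (1/4)^(h-1), so the expected node height is 4/3 and only about
    // one node in 4^(h-1) reaches level h; kMaxHeight == 12 therefore
    // comfortably covers a few million entries before the top level
    // saturates.)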
+    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+  }
+
+  x = NewNode(key, height);
+  for (int i = 0; i < height; i++) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
+    prev[i]->SetNext(i, x);
+  }
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key, NULL);
+  if (x != NULL && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}
diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc
new file mode 100644
index 0000000..0cfc893
--- /dev/null
+++ b/db/skiplist_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "include/env.h"
+#include "util/arena.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+typedef uint64_t Key;
+
+struct Comparator {
+  int operator()(const Key& a, const Key& b) const {
+    if (a < b) {
+      return -1;
+    } else if (a > b) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+};
+
+class SkipTest { };
+
+TEST(SkipTest, Empty) {
+  Arena arena;
+  Comparator cmp;
+  SkipList<Key, Comparator> list(cmp, &arena);
+  ASSERT_TRUE(!list.Contains(10));
+
+  SkipList<Key, Comparator>::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  iter.Seek(100);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToLast();
+  ASSERT_TRUE(!iter.Valid());
+}
+
+TEST(SkipTest, InsertAndLookup) {
+  const int N = 2000;
+  const int R = 5000;
+  Random rnd(1000);
+  std::set<Key> keys;
+  Arena arena;
+  Comparator cmp;
+  SkipList<Key, Comparator> list(cmp, &arena);
+  for (int i = 0; i < N; i++) {
+    Key key = rnd.Next() % R;
+    if (keys.insert(key).second) {
+      list.Insert(key);
+    }
+  }
+
+  for (int i = 0; i < R; i++) {
+    if (list.Contains(i)) {
+      ASSERT_EQ(keys.count(i), 1);
+    } else {
+      ASSERT_EQ(keys.count(i), 0);
+    }
+  }
+
+  // Simple iterator tests
+  {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    ASSERT_TRUE(!iter.Valid());
+
+    iter.Seek(0);
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToFirst();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToLast();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), iter.key());
+  }
+
+  // Forward iteration test
+  for (int i = 0; i < R; i++) {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    iter.Seek(i);
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.lower_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.end()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*model_iter, iter.key());
+        ++model_iter;
+        iter.Next();
+      }
+    }
+  }
+
+  // Backward iteration test
+  {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    iter.SeekToLast();
+
+    // Compare against model iterator
+    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
+         model_iter != keys.rend();
+         ++model_iter) {
+      ASSERT_TRUE(iter.Valid());
+      ASSERT_EQ(*model_iter, iter.key());
+      iter.Prev();
+    }
+    ASSERT_TRUE(!iter.Valid());
+  }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructed.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ private:
+  static const uint32_t K = 4;
+
+  static uint64_t key(Key key) { return (key >> 40); }
+  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+  static uint64_t hash(Key key) { return key & 0xff; }
+
+  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+    uint64_t data[2] = { k, g };
+    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+  }
+
+  static Key MakeKey(uint64_t k, uint64_t g) {
+    assert(sizeof(Key) == sizeof(uint64_t));
+    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
+    assert(g <= 0xffffffffu);
+    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+  }
+
+  static bool IsValidKey(Key k) {
+    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+  }
+
+  static Key RandomTarget(Random* rnd) {
+    switch (rnd->Next() % 10) {
+      case 0:
+        // Seek to beginning
+        return MakeKey(0, 0);
+      case 1:
+        // Seek to end
+        return MakeKey(K, 0);
+      default:
+        // Seek to middle
+        return MakeKey(rnd->Next() % K, 0);
+    }
+  }
+
+  // Per-key generation
+  struct State {
+    port::AtomicPointer generation[K];
+    void Set(int k, intptr_t v) {
+      generation[k].Release_Store(reinterpret_cast<void*>(v));
+    }
+    intptr_t Get(int k) {
+      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+    }
+
+    State() {
+      for (int k = 0; k < K; k++) {
+        Set(k, 0);
+      }
+    }
+  };
+
+  // Current state of the test
+  State current_;
+
+  Arena arena_;
+
+  // SkipList is not protected by mu_.  We just use a single writer
+  // thread to modify it.
+  SkipList<Key, Comparator> list_;
+
+ public:
+  ConcurrentTest() : list_(Comparator(), &arena_) { }
+
+  // REQUIRES: External synchronization
+  void WriteStep(Random* rnd) {
+    const uint32_t k = rnd->Next() % K;
+    const intptr_t g = current_.Get(k) + 1;
+    const Key key = MakeKey(k, g);
+    list_.Insert(key);
+    current_.Set(k, g);
+  }
+
+  void ReadStep(Random* rnd) {
+    // Remember the initial committed state of the skiplist.
+    State initial_state;
+    for (int k = 0; k < K; k++) {
+      initial_state.Set(k, current_.Get(k));
+    }
+
+    Key pos = RandomTarget(rnd);
+    SkipList<Key, Comparator>::Iterator iter(&list_);
+    iter.Seek(pos);
+    while (true) {
+      Key current;
+      if (!iter.Valid()) {
+        current = MakeKey(K, 0);
+      } else {
+        current = iter.key();
+        ASSERT_TRUE(IsValidKey(current)) << std::hex << current;
+      }
+      ASSERT_LE(pos, current) << "should not go backwards";
+
+      // Verify that everything in [pos,current) was not present in
+      // initial_state.
+      while (pos < current) {
+        ASSERT_LT(key(pos), K) << std::hex << pos;
+
+        // Note that generation 0 is never inserted, so it is ok if
+        // <*,0,*> is missing.
+        ASSERT_TRUE((gen(pos) == 0) ||
+                    (gen(pos) > initial_state.Get(key(pos)))
+                    ) << "key: " << key(pos)
+                      << "; gen: " << gen(pos)
+                      << "; initgen: "
+                      << initial_state.Get(key(pos));
+
+        // Advance to next key in the valid key space
+        if (key(pos) < key(current)) {
+          pos = MakeKey(key(pos) + 1, 0);
+        } else {
+          pos = MakeKey(key(pos), gen(pos) + 1);
+        }
+      }
+
+      if (!iter.Valid()) {
+        break;
+      }
+
+      if (rnd->Next() % 2) {
+        iter.Next();
+        pos = MakeKey(key(pos), gen(pos) + 1);
+      } else {
+        Key new_target = RandomTarget(rnd);
+        if (new_target > pos) {
+          pos = new_target;
+          iter.Seek(new_target);
+        }
+      }
+    }
+  }
+};
+const uint32_t ConcurrentTest::K;
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST(SkipTest, ConcurrentWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 10000; i++) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+class TestState {
+ public:
+  ConcurrentTest t_;
+  int seed_;
+  port::AtomicPointer quit_flag_;
+
+  enum ReaderState {
+    STARTING,
+    RUNNING,
+    DONE
+  };
+
+  explicit TestState(int s)
+      : seed_(s),
+        quit_flag_(NULL),
+        state_(STARTING),
+        state_cv_(&mu_) {}
+
+  void Wait(ReaderState s) {
+    mu_.Lock();
+    while (state_ != s) {
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+  void Change(ReaderState s) {
+    mu_.Lock();
+    state_ = s;
+    state_cv_.Signal();
+    mu_.Unlock();
+  }
+
+ private:
+  port::Mutex mu_;
+  ReaderState state_;
+  port::CondVar state_cv_;
+};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int i = 0; i < kSize; i++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-NULL arg will do
+    state.Wait(TestState::DONE);
+  }
+}
+
+TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
+TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/db/snapshot.h b/db/snapshot.h
new file mode 100644
index 0000000..6a07f80
--- /dev/null
+++ b/db/snapshot.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
+#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
+
+#include "include/db.h"
+
+namespace leveldb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each Snapshot corresponds to a particular sequence number.
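// (Editor's aside, not part of the original patch -- the intended lifecycle,
// sketched under the assumption that the DB tracks its last used sequence
// number in a member named last_sequence_:
//   const Snapshot* s = snapshots_.New(last_sequence_);
//   ... serve reads as of s->number_ ...
//   snapshots_.Delete(s);
// New() links at the tail of the circular list, so oldest() is the dummy
// head's next_ and newest() is its prev_.)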
+class Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+
+ private:
+  friend class SnapshotList;
+
+  // Snapshot is kept in a doubly-linked circular list
+  Snapshot* prev_;
+  Snapshot* next_;
+
+  SnapshotList* list_;                 // just for sanity checks
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  Snapshot* oldest() const { assert(!empty()); return list_.next_; }
+  Snapshot* newest() const { assert(!empty()); return list_.prev_; }
+
+  const Snapshot* New(SequenceNumber seq) {
+    Snapshot* s = new Snapshot;
+    s->number_ = seq;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    return s;
+  }
+
+  void Delete(const Snapshot* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    delete s;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  Snapshot list_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_SNAPSHOT_H_
diff --git a/db/table_cache.cc b/db/table_cache.cc
new file mode 100644
index 0000000..604298d
--- /dev/null
+++ b/db/table_cache.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+#include "include/env.h"
+#include "include/table.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+struct TableAndFile {
+  RandomAccessFile* file;
+  Table* table;
+};
+
+static void DeleteEntry(const Slice& key, void* value) {
+  TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
+  delete tf->table;
+  delete tf->file;
+  delete tf;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+TableCache::TableCache(const std::string& dbname,
+                       const Options* options,
+                       int entries)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      cache_(NewLRUCache(entries)) {
+}
+
+TableCache::~TableCache() {
+  delete cache_;
+}
+
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  uint64_t file_number,
+                                  Table** tableptr) {
+  if (tableptr != NULL) {
+    *tableptr = NULL;
+  }
+
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  Slice key(buf, sizeof(buf));
+  Cache::Handle* handle = cache_->Lookup(key);
+  if (handle == NULL) {
+    std::string fname = TableFileName(dbname_, file_number);
+    RandomAccessFile* file = NULL;
+    Table* table = NULL;
+    Status s = env_->NewRandomAccessFile(fname, &file);
+    if (s.ok()) {
+      s = Table::Open(*options_, file, &table);
+    }
+
+    if (!s.ok()) {
+      assert(table == NULL);
+      delete file;
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
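      // (Editor's aside, not part of the original patch: each entry is
      // inserted below with a charge of 1, so the "entries" argument to the
      // TableCache constructor bounds the number of simultaneously open
      // table files rather than a byte budget.)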
+      return NewErrorIterator(s);
+    }
+
+    TableAndFile* tf = new TableAndFile;
+    tf->file = file;
+    tf->table = table;
+    handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+  }
+
+  Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
+  Iterator* result = table->NewIterator(options);
+  result->RegisterCleanup(&UnrefEntry, cache_, handle);
+  if (tableptr != NULL) {
+    *tableptr = table;
+  }
+  return result;
+}
+
+void TableCache::Evict(uint64_t file_number) {
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  cache_->Erase(Slice(buf, sizeof(buf)));
+}
+
+}
diff --git a/db/table_cache.h b/db/table_cache.h
new file mode 100644
index 0000000..6c357df
--- /dev/null
+++ b/db/table_cache.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+
+#include <string>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "include/cache.h"
+#include "include/table.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+
+class TableCache {
+ public:
+  TableCache(const std::string& dbname, const Options* options, int entries);
+  ~TableCache();
+
+  // Get an iterator for the specified file number and return it.  If
+  // "tableptr" is non-NULL, also sets "*tableptr" to point to the
+  // Table object underlying the returned iterator, or NULL if no
+  // Table object underlies the returned iterator.  The returned
+  // "*tableptr" object is owned by the cache and should not be
+  // deleted, and is valid for as long as the returned iterator is
+  // live.
+  Iterator* NewIterator(const ReadOptions& options,
+                        uint64_t file_number,
+                        Table** tableptr = NULL);
+
+  // Evict any entry for the specified file number
+  void Evict(uint64_t file_number);
+
+ private:
+  Env* const env_;
+  const std::string dbname_;
+  const Options* options_;
+  Cache* cache_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
diff --git a/db/version_edit.cc b/db/version_edit.cc
new file mode 100644
index 0000000..809dd82
--- /dev/null
+++ b/db/version_edit.cc
@@ -0,0 +1,282 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Tag numbers for serialized VersionEdit.  These numbers are written to
+// disk and should not be changed.
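// (Editor's aside, not part of the original patch: EncodeTo() below emits a
// sequence of (varint32 tag, payload) pairs; e.g. SetLogNumber(9)
// contributes the two bytes 0x02 0x09 -- the kLogNumber tag followed by
// varint64(9).)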
+enum Tag {
+  kComparator     = 1,
+  kLogNumber      = 2,
+  kNextFileNumber = 3,
+  kLastSequence   = 4,
+  kCompactPointer = 5,
+  kDeletedFile    = 6,
+  kNewFile        = 7,
+  kLargeValueRef  = 8,
+};
+
+void VersionEdit::Clear() {
+  comparator_.clear();
+  log_number_ = 0;
+  last_sequence_ = 0;
+  next_file_number_ = 0;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_next_file_number_ = false;
+  has_last_sequence_ = false;
+  deleted_files_.clear();
+  new_files_.clear();
+  large_refs_added_.clear();
+}
+
+void VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32(dst, kLogNumber);
+    PutVarint64(dst, log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32(dst, kNextFileNumber);
+    PutVarint64(dst, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32(dst, kLastSequence);
+    PutVarint64(dst, last_sequence_);
+  }
+
+  for (int i = 0; i < compact_pointers_.size(); i++) {
+    PutVarint32(dst, kCompactPointer);
+    PutVarint32(dst, compact_pointers_[i].first);  // level
+    PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
+  }
+
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    PutVarint32(dst, kDeletedFile);
+    PutVarint32(dst, iter->first);   // level
+    PutVarint64(dst, iter->second);  // file number
+  }
+
+  for (int i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    PutVarint32(dst, kNewFile);
+    PutVarint32(dst, new_files_[i].first);  // level
+    PutVarint64(dst, f.number);
+    PutVarint64(dst, f.file_size);
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+  }
+
+  for (int i = 0; i < large_refs_added_.size(); i++) {
+    const VersionEdit::Large& l = large_refs_added_[i];
+    PutVarint32(dst, kLargeValueRef);
+    PutLengthPrefixedSlice(dst,
+                           Slice(l.large_ref.data, LargeValueRef::ByteSize()));
+    PutVarint64(dst, l.fnum);
+    PutLengthPrefixedSlice(dst, l.internal_key.Encode());
+  }
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice str;
+  if (GetLengthPrefixedSlice(input, &str)) {
+    dst->DecodeFrom(str);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+static bool GetLevel(Slice* input, int* level) {
+  uint32_t v;
+  if (GetVarint32(input, &v) &&
+      v < config::kNumLevels) {
+    *level = v;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+  Slice input = src;
+  const char* msg = NULL;
+  uint32_t tag;
+
+  // Temporary storage for parsing
+  int level;
+  uint64_t number;
+  FileMetaData f;
+  Slice str;
+  Large large;
+  InternalKey key;
+
+  while (msg == NULL && GetVarint32(&input, &tag)) {
+    switch (tag) {
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kCompactPointer:
+        if (GetLevel(&input, &level) &&
+            GetInternalKey(&input, &key)) {
+          compact_pointers_.push_back(std::make_pair(level, key));
+        } else {
+          msg = "compaction pointer";
+        }
+        break;
+
+      case kDeletedFile:
+        if (GetLevel(&input, &level) &&
+            GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          msg = "deleted file";
+        }
+        break;
+
+      case kNewFile:
+        if (GetLevel(&input, &level) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          msg = "new-file entry";
+        }
+        break;
+
+      case kLargeValueRef:
+        if (GetLengthPrefixedSlice(&input, &str) &&
+            (str.size() == LargeValueRef::ByteSize()) &&
+            GetVarint64(&input, &large.fnum) &&
+            GetInternalKey(&input, &large.internal_key)) {
+          large.large_ref = LargeValueRef::FromRef(str);
+          large_refs_added_.push_back(large);
+        } else {
+          msg = "large ref";
+        }
+        break;
+
+      default:
+        msg = "unknown tag";
+        break;
+    }
+  }
+
+  if (msg == NULL && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != NULL) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
+
+std::string VersionEdit::DebugString() const {
+  std::string r;
+  r.append("VersionEdit {");
+  if (has_comparator_) {
+    r.append("\n  Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n  LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n  NextFile: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n  LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+  for (int i = 0; i < compact_pointers_.size(); i++) {
+    r.append("\n  CompactPointer: ");
+    AppendNumberTo(&r, compact_pointers_[i].first);
+    r.append(" '");
+    AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode());
+    r.append("'");
+  }
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    r.append("\n  DeleteFile: ");
+    AppendNumberTo(&r, iter->first);
+    r.append(" ");
+    AppendNumberTo(&r, iter->second);
+  }
+  for (int i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    r.append("\n  AddFile: ");
+    AppendNumberTo(&r, new_files_[i].first);
+    r.append(" ");
+    AppendNumberTo(&r, f.number);
+    r.append(" ");
+    AppendNumberTo(&r, f.file_size);
+    r.append(" '");
+    AppendEscapedStringTo(&r, f.smallest.Encode());
+    r.append("' .. '");
+    AppendEscapedStringTo(&r, f.largest.Encode());
+    r.append("'");
+  }
+  for (int i = 0; i < large_refs_added_.size(); i++) {
+    const VersionEdit::Large& l = large_refs_added_[i];
+    r.append("\n  LargeRef: ");
+    AppendNumberTo(&r, l.fnum);
+    r.append(" ");
+    r.append(LargeValueRefToFilenameString(l.large_ref));
+    r.append(" '");
+    AppendEscapedStringTo(&r, l.internal_key.Encode());
+    r.append("'");
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+}
diff --git a/db/version_edit.h b/db/version_edit.h
new file mode 100644
index 0000000..1b71283
--- /dev/null
+++ b/db/version_edit.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+
+#include <set>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+class VersionSet;
+
+struct FileMetaData {
+  int refs;
+  uint64_t number;
+  uint64_t file_size;       // File size in bytes
+  InternalKey smallest;     // Smallest internal key served by table
+  InternalKey largest;      // Largest internal key served by table
+
+  FileMetaData() : refs(0), file_size(0) { }
+};
+
+class VersionEdit {
+ public:
+  VersionEdit() { Clear(); }
+  ~VersionEdit() { }
+
+  void Clear();
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  void SetCompactPointer(int level, const InternalKey& key) {
+    compact_pointers_.push_back(std::make_pair(level, key));
+  }
+
+  // Add the specified file at the specified number.
+  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  void AddFile(int level, uint64_t file,
+               uint64_t file_size,
+               const InternalKey& smallest,
+               const InternalKey& largest) {
+    FileMetaData f;
+    f.number = file;
+    f.file_size = file_size;
+    f.smallest = smallest;
+    f.largest = largest;
+    new_files_.push_back(std::make_pair(level, f));
+  }
+
+  // Delete the specified "file" from the specified "level".
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.insert(std::make_pair(level, file));
+  }
+
+  // Record that a large value with the specified large_ref was
+  // written to the output file numbered "fnum"
+  void AddLargeValueRef(const LargeValueRef& large_ref,
+                        uint64_t fnum,
+                        const Slice& internal_key) {
+    large_refs_added_.resize(large_refs_added_.size() + 1);
+    Large* large = &(large_refs_added_.back());
+    large->large_ref = large_ref;
+    large->fnum = fnum;
+    large->internal_key.DecodeFrom(internal_key);
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString() const;
+
+ private:
+  friend class VersionSet;
+
+  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+
+  std::string comparator_;
+  uint64_t log_number_;
+  uint64_t next_file_number_;
+  SequenceNumber last_sequence_;
+  bool has_comparator_;
+  bool has_log_number_;
+  bool has_next_file_number_;
+  bool has_last_sequence_;
+
+  std::vector< std::pair<int, InternalKey> > compact_pointers_;
+  DeletedFileSet deleted_files_;
+  std::vector< std::pair<int, FileMetaData> > new_files_;
+  struct Large {
+    LargeValueRef large_ref;
+    uint64_t fnum;
+    InternalKey internal_key;
+  };
+  std::vector<Large> large_refs_added_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_VERSION_EDIT_H_
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
new file mode 100644
index 0000000..50913cd
--- /dev/null
+++ b/db/version_edit_test.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/version_edit.h" +#include "util/testharness.h" + +namespace leveldb { + +static void TestEncodeDecode(const VersionEdit& edit) { + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + parsed.EncodeTo(&encoded2); + ASSERT_EQ(encoded, encoded2); +} + +class VersionEditTest { }; + +TEST(VersionEditTest, EncodeDecode) { + static const uint64_t kBig = 1ull << 50; + + VersionEdit edit; + for (int i = 0; i < 4; i++) { + TestEncodeDecode(edit); + edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, + InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef), + InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); + edit.DeleteFile(4, kBig + 700 + i); + edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression), + kBig + 800 + i, "foobar"); + edit.AddLargeValueRef(LargeValueRef::Make("big2", kLightweightCompression), + kBig + 801 + i, "baz"); + edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); + } + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/version_set.cc b/db/version_set.cc new file mode 100644 index 0000000..2435fa2 --- /dev/null +++ b/db/version_set.cc @@ -0,0 +1,1003 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_set.h" + +#include +#include +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "include/env.h" +#include "include/table_builder.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +static double MaxBytesForLevel(int level) { + if (level == 0) { + return 4 * 1048576.0; + } else { + double result = 10 * 1048576.0; + while (level > 1) { + result *= 10; + level--; + } + return result; + } +} + +static uint64_t MaxFileSizeForLevel(int level) { + return 2 << 20; // We could vary per level to reduce number of files? +} + +namespace { +std::string IntSetToString(const std::set& s) { + std::string result = "{"; + for (std::set::const_iterator it = s.begin(); + it != s.end(); + ++it) { + result += (result.size() > 1) ? "," : ""; + result += NumberToString(*it); + } + result += "}"; + return result; +} +} + +Version::~Version() { + assert(refs_ == 0); + for (int level = 0; level < config::kNumLevels; level++) { + for (int i = 0; i < files_[level].size(); i++) { + FileMetaData* f = files_[level][i]; + assert(f->refs >= 0); + f->refs--; + if (f->refs <= 0) { + delete f; + } + } + } + delete cleanup_mem_; +} + +// An internal iterator. For a given version/level pair, yields +// information about the files in the level. For a given entry, key() +// is the largest key that occurs in the file, and value() is an +// 8-byte value containing the file number of the file, encoding using +// EncodeFixed64. 
+
+// An internal iterator.  For a given version/level pair, yields
+// information about the files in the level.  For a given entry, key()
+// is the largest key that occurs in the file, and value() is an
+// 8-byte value containing the file number of the file, encoded using
+// EncodeFixed64.
+class Version::LevelFileNumIterator : public Iterator {
+ public:
+  LevelFileNumIterator(const Version* version,
+                       const std::vector<FileMetaData*>* flist)
+      : icmp_(version->vset_->icmp_.user_comparator()),
+        flist_(flist),
+        index_(flist->size()) {        // Marks as invalid
+  }
+  virtual bool Valid() const {
+    return index_ < flist_->size();
+  }
+  virtual void Seek(const Slice& target) {
+    uint32_t left = 0;
+    uint32_t right = flist_->size() - 1;
+    while (left < right) {
+      uint32_t mid = (left + right) / 2;
+      int cmp = icmp_.Compare((*flist_)[mid]->largest.Encode(), target);
+      if (cmp < 0) {
+        // Key at "mid.largest" is < than "target".  Therefore all
+        // files at or before "mid" are uninteresting.
+        left = mid + 1;
+      } else {
+        // Key at "mid.largest" is >= "target".  Therefore all files
+        // after "mid" are uninteresting.
+        right = mid;
+      }
+    }
+    index_ = left;
+  }
+  virtual void SeekToFirst() { index_ = 0; }
+  virtual void SeekToLast() {
+    index_ = flist_->empty() ? 0 : flist_->size() - 1;
+  }
+  virtual void Next() {
+    assert(Valid());
+    index_++;
+  }
+  virtual void Prev() {
+    assert(Valid());
+    if (index_ == 0) {
+      index_ = flist_->size();  // Marks as invalid
+    } else {
+      index_--;
+    }
+  }
+  Slice key() const {
+    assert(Valid());
+    return (*flist_)[index_]->largest.Encode();
+  }
+  Slice value() const {
+    assert(Valid());
+    EncodeFixed64(value_buf_, (*flist_)[index_]->number);
+    return Slice(value_buf_, sizeof(value_buf_));
+  }
+  virtual Status status() const { return Status::OK(); }
+ private:
+  const InternalKeyComparator icmp_;
+  const std::vector<FileMetaData*>* const flist_;
+  int index_;
+
+  mutable char value_buf_[8];  // Used for encoding the file number for value()
+};
+
+static Iterator* GetFileIterator(void* arg,
+                                 const ReadOptions& options,
+                                 const Slice& file_value) {
+  TableCache* cache = reinterpret_cast<TableCache*>(arg);
+  if (file_value.size() != 8) {
+    return NewErrorIterator(
+        Status::Corruption("FileReader invoked with unexpected value"));
+  } else {
+    return cache->NewIterator(options, DecodeFixed64(file_value.data()));
+  }
+}
+
+Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
+                                            int level) const {
+  return NewTwoLevelIterator(
+      new LevelFileNumIterator(this, &files_[level]),
+      &GetFileIterator, vset_->table_cache_, options);
+}
+
+void Version::AddIterators(const ReadOptions& options,
+                           std::vector<Iterator*>* iters) {
+  // Merge all level zero files together since they may overlap
+  for (int i = 0; i < files_[0].size(); i++) {
+    iters->push_back(
+        vset_->table_cache_->NewIterator(options, files_[0][i]->number));
+  }
+
+  // For levels > 0, we can use a concatenating iterator that sequentially
+  // walks through the non-overlapping files in the level, opening them
+  // lazily.
+  for (int level = 1; level < config::kNumLevels; level++) {
+    if (!files_[level].empty()) {
+      iters->push_back(NewConcatenatingIterator(options, level));
+    }
+  }
+}
+
+void Version::Ref() {
+  ++refs_;
+}
+
+void Version::Unref() {
+  assert(refs_ >= 1);
+  --refs_;
+  if (refs_ == 0) {
+    vset_->MaybeDeleteOldVersions();
+    // TODO: try to delete obsolete files
+  }
+}
+
+std::string Version::DebugString() const {
+  std::string r;
+  for (int level = 0; level < config::kNumLevels; level++) {
+    // E.g., level 1: 17:123['a' .. 'd'] 20:43['e' .. 'g']
+    r.append("level ");
+    AppendNumberTo(&r, level);
+    r.push_back(':');
+    const std::vector<FileMetaData*>& files = files_[level];
+    for (int i = 0; i < files.size(); i++) {
+      r.push_back(' ');
+      AppendNumberTo(&r, files[i]->number);
+      r.push_back(':');
+      AppendNumberTo(&r, files[i]->file_size);
+      r.append("['");
+      AppendEscapedStringTo(&r, files[i]->smallest.Encode());
+      r.append("' .. '");
+      AppendEscapedStringTo(&r, files[i]->largest.Encode());
+      r.append("']");
+    }
+    r.push_back('\n');
+  }
+  return r;
+}
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionSet::Builder {
+ private:
+  typedef std::map<uint64_t, FileMetaData*> FileMap;
+  VersionSet* vset_;
+  FileMap files_[config::kNumLevels];
+
+ public:
+  // Initialize a builder with the files from *base and other info from *vset
+  Builder(VersionSet* vset, Version* base)
+      : vset_(vset) {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const std::vector<FileMetaData*>& files = base->files_[level];
+      for (int i = 0; i < files.size(); i++) {
+        FileMetaData* f = files[i];
+        f->refs++;
+        files_[level].insert(std::make_pair(f->number, f));
+      }
+    }
+  }
+
+  ~Builder() {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const FileMap& fmap = files_[level];
+      for (FileMap::const_iterator iter = fmap.begin();
+           iter != fmap.end();
+           ++iter) {
+        FileMetaData* f = iter->second;
+        f->refs--;
+        if (f->refs <= 0) {
+          delete f;
+        }
+      }
+    }
+  }
+
+  // Apply all of the edits in *edit to the current state.
+  void Apply(VersionEdit* edit) {
+    // Update compaction pointers
+    for (int i = 0; i < edit->compact_pointers_.size(); i++) {
+      const int level = edit->compact_pointers_[i].first;
+      vset_->compact_pointer_[level] =
+          edit->compact_pointers_[i].second.Encode().ToString();
+    }
+
+    // Delete files
+    const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
+    for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
+         iter != del.end();
+         ++iter) {
+      const int level = iter->first;
+      const uint64_t number = iter->second;
+      FileMap::iterator fiter = files_[level].find(number);
+      assert(fiter != files_[level].end());  // Sanity check for debug mode
+      if (fiter != files_[level].end()) {
+        FileMetaData* f = fiter->second;
+        f->refs--;
+        if (f->refs <= 0) {
+          delete f;
+        }
+        files_[level].erase(fiter);
+      }
+    }
+
+    // Add new files
+    for (int i = 0; i < edit->new_files_.size(); i++) {
+      const int level = edit->new_files_[i].first;
+      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
+      f->refs = 1;
+      assert(files_[level].count(f->number) == 0);
+      files_[level].insert(std::make_pair(f->number, f));
+    }
+
+    // Add large value refs
+    for (int i = 0; i < edit->large_refs_added_.size(); i++) {
+      const VersionEdit::Large& l = edit->large_refs_added_[i];
+      vset_->RegisterLargeValueRef(l.large_ref, l.fnum, l.internal_key);
+    }
+  }
+
+  // Save the current state in *v.
+  void SaveTo(Version* v) {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const FileMap& fmap = files_[level];
+      for (FileMap::const_iterator iter = fmap.begin();
+           iter != fmap.end();
+           ++iter) {
+        FileMetaData* f = iter->second;
+        f->refs++;
+        v->files_[level].push_back(f);
+      }
+    }
+  }
+};
+
+VersionSet::VersionSet(const std::string& dbname,
+                       const Options* options,
+                       TableCache* table_cache,
+                       const InternalKeyComparator* cmp)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      table_cache_(table_cache),
+      icmp_(*cmp),
+      next_file_number_(2),
+      manifest_file_number_(0),  // Filled by Recover()
+      descriptor_file_(NULL),
+      descriptor_log_(NULL),
+      current_(new Version(this)),
+      oldest_(current_) {
+}
+
+VersionSet::~VersionSet() {
+  for (Version* v = oldest_; v != NULL; ) {
+    Version* next = v->next_;
+    assert(v->refs_ == 0);
+    delete v;
+    v = next;
+  }
+  delete descriptor_log_;
+  delete descriptor_file_;
+}
+
+Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
+  edit->SetNextFile(next_file_number_);
+
+  Version* v = new Version(this);
+  {
+    Builder builder(this, current_);
+    builder.Apply(edit);
+    builder.SaveTo(v);
+  }
+
+  std::string new_manifest_file;
+  Status s = Finalize(v);
+
+  // Initialize new descriptor log file if necessary by creating
+  // a temporary file that contains a snapshot of the current version.
+  if (s.ok()) {
+    if (descriptor_log_ == NULL) {
+      assert(descriptor_file_ == NULL);
+      new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
+      edit->SetNextFile(next_file_number_);
+      s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
+      if (s.ok()) {
+        descriptor_log_ = new log::Writer(descriptor_file_);
+        s = WriteSnapshot(descriptor_log_);
+      }
+    }
+  }
+
+  // Write new record to log file
+  if (s.ok()) {
+    std::string record;
+    edit->EncodeTo(&record);
+    s = descriptor_log_->AddRecord(record);
+    if (s.ok()) {
+      s = descriptor_file_->Sync();
+    }
+  }
+
+  // If we just created a new descriptor file, install it by writing a
+  // new CURRENT file that points to it.
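  // (Editor's aside, not part of the original patch: SetCurrentFile, defined
  // in db/filename.cc, writes the manifest name to a temporary file and then
  // renames it over CURRENT, so readers never observe a partially written
  // CURRENT file.)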
+ if (s.ok() && !new_manifest_file.empty()) { + s = SetCurrentFile(env_, dbname_, manifest_file_number_); + } + + // Install the new version + if (s.ok()) { + assert(current_->next_ == NULL); + assert(current_->cleanup_mem_ == NULL); + current_->cleanup_mem_ = cleanup_mem; + v->next_ = NULL; + current_->next_ = v; + current_ = v; + } else { + delete v; + if (!new_manifest_file.empty()) { + delete descriptor_log_; + delete descriptor_file_; + descriptor_log_ = NULL; + descriptor_file_ = NULL; + env_->DeleteFile(new_manifest_file); + } + } + //Log(env_, options_->info_log, "State\n%s", current_->DebugString().c_str()); + + return s; +} + +Status VersionSet::Recover(uint64_t* log_number, + SequenceNumber* last_sequence) { + struct LogReporter : public log::Reader::Reporter { + Status* status; + virtual void Corruption(size_t bytes, const Status& s) { + if (this->status->ok()) *this->status = s; + } + }; + + // Read "CURRENT" file, which contains a pointer to the current manifest file + std::string current; + Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); + if (!s.ok()) { + return s; + } + if (current.empty() || current[current.size()-1] != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + current.resize(current.size() - 1); + + std::string dscname = dbname_ + "/" + current; + SequentialFile* file; + s = env_->NewSequentialFile(dscname, &file); + if (!s.ok()) { + return s; + } + + bool have_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + Builder builder(this, current_); + + { + LogReporter reporter; + reporter.status = &s; + log::Reader reader(file, &reporter, true/*checksum*/); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (s.ok()) { + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument( + edit.comparator_ + "does not match existing comparator ", + icmp_.user_comparator()->Name()); + } + } + + if (s.ok()) { + builder.Apply(&edit); + } + + if (edit.has_log_number_) { + *log_number = edit.log_number_; + have_log_number = true; + } + + if (edit.has_next_file_number_) { + next_file = edit.next_file_number_; + have_next_file = true; + } + + if (edit.has_last_sequence_) { + *last_sequence = edit.last_sequence_; + have_last_sequence = true; + } + } + } + delete file; + file = NULL; + + if (s.ok()) { + if (!have_next_file) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + } else if (!have_log_number) { + s = Status::Corruption("no meta-lognumber entry in descriptor"); + } else if (!have_last_sequence) { + s = Status::Corruption("no last-sequence-number entry in descriptor"); + } + } + + if (s.ok()) { + Version* v = new Version(this); + builder.SaveTo(v); + s = Finalize(v); + if (!s.ok()) { + delete v; + } else { + // Install recovered version + v->next_ = NULL; + current_->next_ = v; + current_ = v; + manifest_file_number_ = next_file; + next_file_number_ = next_file + 1; + } + } + + return s; +} + +Status VersionSet::Finalize(Version* v) { + // Precomputed best level for next compaction + int best_level = -1; + double best_score = -1; + + Status s; + for (int level = 0; s.ok() && level < config::kNumLevels; level++) { + s = SortLevel(v, level); + + // Compute the ratio of current size to size limit. 
+ uint64_t level_bytes = 0; + for (int i = 0; i < v->files_[level].size(); i++) { + level_bytes += v->files_[level][i]->file_size; + } + double score = static_cast(level_bytes) / MaxBytesForLevel(level); + + if (level == 0) { + // Level-0 file sizes are going to be often much smaller than + // MaxBytesForLevel(0) since we do not account for compression + // when producing a level-0 file; and too many level-0 files + // increase merging costs. So use a file-count limit for + // level-0 in addition to the byte-count limit. + double count_score = v->files_[level].size() / 4.0; + if (count_score > score) { + score = count_score; + } + } + + if (score > best_score) { + best_level = level; + best_score = score; + } + } + + v->compaction_level_ = best_level; + v->compaction_score_ = best_score; + return s; +} + +Status VersionSet::WriteSnapshot(log::Writer* log) { + // TODO: Break up into multiple records to reduce memory usage on recovery? + + // Save metadata + VersionEdit edit; + edit.SetComparatorName(icmp_.user_comparator()->Name()); + + // Save compaction pointers + for (int level = 0; level < config::kNumLevels; level++) { + if (!compact_pointer_[level].empty()) { + InternalKey key; + key.DecodeFrom(compact_pointer_[level]); + edit.SetCompactPointer(level, key); + } + } + + // Save files + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = current_->files_[level]; + for (int i = 0; i < files.size(); i++) { + const FileMetaData* f = files[i]; + edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest); + } + } + + // Save large value refs + for (LargeValueMap::const_iterator it = large_value_refs_.begin(); + it != large_value_refs_.end(); + ++it) { + const LargeValueRef& ref = it->first; + const LargeReferencesSet& pointers = it->second; + for (LargeReferencesSet::const_iterator j = pointers.begin(); + j != pointers.end(); + ++j) { + edit.AddLargeValueRef(ref, j->first, j->second); + } + } + + std::string record; + edit.EncodeTo(&record); + return log->AddRecord(record); +} + +// Helper to sort by tables_[file_number].smallest +struct VersionSet::BySmallestKey { + const InternalKeyComparator* internal_comparator; + + bool operator()(FileMetaData* f1, FileMetaData* f2) const { + return internal_comparator->Compare(f1->smallest, f2->smallest) < 0; + } +}; + +Status VersionSet::SortLevel(Version* v, uint64_t level) { + Status result; + BySmallestKey cmp; + cmp.internal_comparator = &icmp_; + std::sort(v->files_[level].begin(), v->files_[level].end(), cmp); + + if (result.ok() && level > 0) { + // There should be no overlap + for (int i = 1; i < v->files_[level].size(); i++) { + const InternalKey& prev_end = v->files_[level][i-1]->largest; + const InternalKey& this_begin = v->files_[level][i]->smallest; + if (icmp_.Compare(prev_end, this_begin) >= 0) { + result = Status::Corruption( + "overlapping ranges in same level", + (EscapeString(prev_end.Encode()) + " vs. 
" + + EscapeString(this_begin.Encode()))); + break; + } + } + } + return result; +} + +int VersionSet::NumLevelFiles(int level) const { + assert(level >= 0); + assert(level < config::kNumLevels); + return current_->files_[level].size(); +} + +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { + uint64_t result = 0; + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = v->files_[level]; + for (int i = 0; i < files.size(); i++) { + if (icmp_.Compare(files[i]->largest, ikey) <= 0) { + // Entire file is before "ikey", so just add the file size + result += files[i]->file_size; + } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { + // Entire file is after "ikey", so ignore + if (level > 0) { + // Files other than level 0 are sorted by meta->smallest, so + // no further files in this level will contain data for + // "ikey". + break; + } + } else { + // "ikey" falls in the range for this table. Add the + // approximate offset of "ikey" within the table. + Table* tableptr; + Iterator* iter = table_cache_->NewIterator( + ReadOptions(), files[i]->number, &tableptr); + if (tableptr != NULL) { + result += tableptr->ApproximateOffsetOf(ikey.Encode()); + } + delete iter; + } + } + } + + // Add in large value files which are references from internal keys + // stored in the table files + // + // TODO(opt): this is O(# large values in db). If this becomes too slow, + // we could store an auxiliary data structure indexed by internal key + for (LargeValueMap::const_iterator it = large_value_refs_.begin(); + it != large_value_refs_.end(); + ++it) { + const LargeValueRef& lref = it->first; + for (LargeReferencesSet::const_iterator it2 = it->second.begin(); + it2 != it->second.end(); + ++it2) { + if (icmp_.Compare(it2->second, ikey.Encode()) <= 0) { + // Internal key for large value is before our key of interest + result += lref.ValueSize(); + } + } + } + + + return result; +} + +bool VersionSet::RegisterLargeValueRef(const LargeValueRef& large_ref, + uint64_t fnum, + const InternalKey& internal_key) { + LargeReferencesSet* refs = &large_value_refs_[large_ref]; + bool is_first = refs->empty(); + refs->insert(make_pair(fnum, internal_key.Encode().ToString())); + return is_first; +} + +void VersionSet::CleanupLargeValueRefs(const std::set& live_tables, + uint64_t log_file_num) { + for (LargeValueMap::iterator it = large_value_refs_.begin(); + it != large_value_refs_.end(); + ) { + LargeReferencesSet* refs = &it->second; + for (LargeReferencesSet::iterator ref_it = refs->begin(); + ref_it != refs->end(); + ) { + if (ref_it->first != log_file_num && // Not in log file + live_tables.count(ref_it->first) == 0) { // Not in a live table + // No longer live: erase + LargeReferencesSet::iterator to_erase = ref_it; + ++ref_it; + refs->erase(to_erase); + } else { + // Still live: leave this reference alone + ++ref_it; + } + } + if (refs->empty()) { + // No longer any live references to this large value: remove from + // large_value_refs + Log(env_, options_->info_log, "large value is dead: '%s'", + LargeValueRefToFilenameString(it->first).c_str()); + LargeValueMap::iterator to_erase = it; + ++it; + large_value_refs_.erase(to_erase); + } else { + ++it; + } + } +} + +bool VersionSet::LargeValueIsLive(const LargeValueRef& large_ref) { + LargeValueMap::iterator it = large_value_refs_.find(large_ref); + if (it == large_value_refs_.end()) { + return false; + } else { + assert(!it->second.empty()); + return true; + } +} + +void 
VersionSet::MaybeDeleteOldVersions() { + // Note: it is important to delete versions in order since a newer + // version with zero refs may be holding a pointer to a memtable + // that is used by somebody who has a ref on an older version. + while (oldest_ != current_ && oldest_->refs_ == 0) { + Version* next = oldest_->next_; + delete oldest_; + oldest_ = next; + } +} + +void VersionSet::AddLiveFiles(std::set* live) { + for (Version* v = oldest_; v != NULL; v = v->next_) { + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = v->files_[level]; + for (int i = 0; i < files.size(); i++) { + live->insert(files[i]->number); + } + } + } +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +void VersionSet::GetOverlappingInputs( + int level, + const InternalKey& begin, + const InternalKey& end, + std::vector* inputs) { + inputs->clear(); + Slice user_begin = begin.user_key(); + Slice user_end = end.user_key(); + const Comparator* user_cmp = icmp_.user_comparator(); + for (int i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 || + user_cmp->Compare(f->smallest.user_key(), user_end) > 0) { + // Either completely before or after range; skip it + } else { + inputs->push_back(f); + } + } +} + +// Stores the minimal range that covers all entries in inputs in +// *smallest, *largest. +// REQUIRES: inputs is not empty +void VersionSet::GetRange(const std::vector& inputs, + InternalKey* smallest, + InternalKey* largest) { + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + for (int i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_.Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_.Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } +} + +Iterator* VersionSet::MakeInputIterator(Compaction* c) { + ReadOptions options; + options.verify_checksums = options_->paranoid_checks; + options.fill_cache = false; + + // Level-0 files have to be merged together. For other levels, + // we will make a concatenating iterator per level. + // TODO(opt): use concatenating iterator for level-0 if there is no overlap + const int space = (c->level() == 0 ? 
c->inputs_[0].size() + 1 : 2); + Iterator** list = new Iterator*[space]; + int num = 0; + for (int which = 0; which < 2; which++) { + if (!c->inputs_[which].empty()) { + if (c->level() + which == 0) { + const std::vector& files = c->inputs_[which]; + for (int i = 0; i < files.size(); i++) { + list[num++] = table_cache_->NewIterator(options, files[i]->number); + } + } else { + // Create concatenating iterator for the files from this level + list[num++] = NewTwoLevelIterator( + new Version::LevelFileNumIterator( + c->input_version_, &c->inputs_[which]), + &GetFileIterator, table_cache_, options); + } + } + } + assert(num <= space); + Iterator* result = NewMergingIterator(&icmp_, list, num); + delete[] list; + return result; +} + +Compaction* VersionSet::PickCompaction() { + if (!NeedsCompaction()) { + return NULL; + } + const int level = current_->compaction_level_; + assert(level >= 0); + + Compaction* c = new Compaction(level); + c->input_version_ = current_; + c->input_version_->Ref(); + + // Pick the first file that comes after compact_pointer_[level] + for (int i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (compact_pointer_[level].empty() || + icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { + c->inputs_[0].push_back(f); + break; + } + } + if (c->inputs_[0].empty()) { + // Wrap-around to the beginning of the key space + c->inputs_[0].push_back(current_->files_[level][0]); + } + + // Find the range we are compacting + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + + // Files in level 0 may overlap each other, so pick up all overlapping ones + if (level == 0) { + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); + assert(!c->inputs_[0].empty()); + GetRange(c->inputs_[0], &smallest, &largest); + } + + GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); + + // See if we can grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. + if (!c->inputs_[1].empty()) { + // Get entire range covered by compaction + std::vector all = c->inputs_[0]; + all.insert(all.end(), c->inputs_[1].begin(), c->inputs_[1].end()); + InternalKey all_start, all_limit; + GetRange(all, &all_start, &all_limit); + + std::vector expanded0; + GetOverlappingInputs(level, all_start, all_limit, &expanded0); + if (expanded0.size() > c->inputs_[0].size()) { + InternalKey new_start, new_limit; + GetRange(expanded0, &new_start, &new_limit); + std::vector expanded1; + GetOverlappingInputs(level+1, new_start, new_limit, &expanded1); + if (expanded1.size() == c->inputs_[1].size()) { + Log(env_, options_->info_log, + "Expanding@%d %d+%d to %d+%d\n", + level, + int(c->inputs_[0].size()), + int(c->inputs_[1].size()), + int(expanded0.size()), + int(expanded1.size())); + smallest = new_start; + largest = new_limit; + c->inputs_[0] = expanded0; + c->inputs_[1] = expanded1; + } + } + } + + if (false) { + Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", + level, + EscapeString(smallest.Encode()).c_str(), + EscapeString(largest.Encode()).c_str()); + } + + // Update the place where we will do the next compaction for this level. 
+ // We update this immediately instead of waiting for the VersionEdit + // to be applied so that if the compaction fails, we will try a different + // key range next time. + compact_pointer_[level] = largest.Encode().ToString(); + c->edit_.SetCompactPointer(level, largest); + + return c; +} + +Compaction* VersionSet::CompactRange( + int level, + const InternalKey& begin, + const InternalKey& end) { + std::vector inputs; + GetOverlappingInputs(level, begin, end, &inputs); + if (inputs.empty()) { + return NULL; + } + + Compaction* c = new Compaction(level); + c->input_version_ = current_; + c->input_version_->Ref(); + c->inputs_[0] = inputs; + + // Find the range we are compacting + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + + GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); + if (false) { + Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", + level, + EscapeString(smallest.Encode()).c_str(), + EscapeString(largest.Encode()).c_str()); + } + return c; +} + +Compaction::Compaction(int level) + : level_(level), + max_output_file_size_(MaxFileSizeForLevel(level)), + input_version_(NULL) { + for (int i = 0; i < config::kNumLevels; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + if (input_version_ != NULL) { + input_version_->Unref(); + } +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (int i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +void Compaction::ReleaseInputs() { + if (input_version_ != NULL) { + input_version_->Unref(); + input_version_ = NULL; + } +} + +} diff --git a/db/version_set.h b/db/version_set.h new file mode 100644 index 0000000..b8eee3d --- /dev/null +++ b/db/version_set.h @@ -0,0 +1,290 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The representation of a DBImpl consists of a set of Versions. The +// newest version is called "current". Older versions may be kept +// around to provide a consistent view to live iterators. +// +// Each Version keeps track of a set of Table files per level. The +// entire set of versions is maintained in a VersionSet. +// +// Version,VersionSet are thread-compatible, but require external +// synchronization on all accesses. + +#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ +#define STORAGE_LEVELDB_DB_VERSION_SET_H_ + +#include +#include +#include +#include "db/dbformat.h" +#include "db/version_edit.h" +#include "port/port.h" + +namespace leveldb { + +// Grouping of constants. 
We may want to make some of these +// parameters set via options. +namespace config { +static const int kNumLevels = 7; +} + +namespace log { class Writer; } + +class Compaction; +class Iterator; +class MemTable; +class TableBuilder; +class TableCache; +class Version; +class VersionSet; +class WritableFile; + +class Version { + public: + // Append to *iters a sequence of iterators that will + // yield the contents of this Version when merged together. + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + void AddIterators(const ReadOptions&, std::vector* iters); + + // Reference count management (so Versions do not disappear out from + // under live iterators) + void Ref(); + void Unref(); + + // Return a human readable string that describes this version's contents. + std::string DebugString() const; + + private: + friend class Compaction; + friend class VersionSet; + + class LevelFileNumIterator; + Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; + + VersionSet* vset_; // VersionSet to which this Version belongs + Version* next_; // Next version in linked list + int refs_; // Number of live refs to this version + MemTable* cleanup_mem_; // NULL, or table to delete when version dropped + + // List of files per level + std::vector files_[config::kNumLevels]; + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by Finalize(). + double compaction_score_; + int compaction_level_; + + explicit Version(VersionSet* vset) + : vset_(vset), next_(NULL), refs_(0), + cleanup_mem_(NULL), + compaction_score_(-1), + compaction_level_(-1) { + } + + ~Version(); + + // No copying allowed + Version(const Version&); + void operator=(const Version&); +}; + +class VersionSet { + public: + VersionSet(const std::string& dbname, + const Options* options, + TableCache* table_cache, + const InternalKeyComparator*); + ~VersionSet(); + + // Apply *edit to the current version to form a new descriptor that + // is both saved to persistent state and installed as the new + // current version. Iff Apply() returns OK, arrange to delete + // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed + // by older versions. + Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem); + + // Recover the last saved descriptor from persistent storage. + Status Recover(uint64_t* log_number, SequenceNumber* last_sequence); + + // Save current contents to *log + Status WriteSnapshot(log::Writer* log); + + // Return the current version. + Version* current() const { return current_; } + + // Return the current manifest file number + uint64_t ManifestFileNumber() const { return manifest_file_number_; } + + // Allocate and return a new file number + uint64_t NewFileNumber() { return next_file_number_++; } + + // Return the number of Table files at the specified level. + int NumLevelFiles(int level) const; + + // Pick level and inputs for a new compaction. + // Returns NULL if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + Compaction* PickCompaction(); + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns NULL if there is nothing in that + // level that overlaps the specified range. Caller should delete + // the result. 
+ Compaction* CompactRange( + int level, + const InternalKey& begin, + const InternalKey& end); + + // Create an iterator that reads over the compaction inputs for "*c". + // The caller should delete the iterator when no longer needed. + Iterator* MakeInputIterator(Compaction* c); + + // Returns true iff some level needs a compaction. + bool NeedsCompaction() const { return current_->compaction_score_ >= 1; } + + // Add all files listed in any live version to *live. + // May also mutate some internal state. + void AddLiveFiles(std::set* live); + + // Return the approximate offset in the database of the data for + // "key" as of version "v". + uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); + + // Register a reference to a large value with the specified + // large_ref from the specified file number. Returns "true" if this + // is the first recorded reference to the "large_ref" value in the + // database, and false otherwise. + bool RegisterLargeValueRef(const LargeValueRef& large_ref, + uint64_t filenum, + const InternalKey& internal_key); + + // Cleanup the large value reference state by eliminating any + // references from files that are not includes in either "live_tables" + // or "log_file". + void CleanupLargeValueRefs(const std::set& live_tables, + uint64_t log_file_num); + + // Returns true if a large value with the given reference is live. + bool LargeValueIsLive(const LargeValueRef& large_ref); + + private: + class Builder; + + friend class Compaction; + friend class Version; + + Status Finalize(Version* v); + + // Delete any old versions that are no longer needed. + void MaybeDeleteOldVersions(); + + struct BySmallestKey; + Status SortLevel(Version* v, uint64_t level); + + void GetOverlappingInputs( + int level, + const InternalKey& begin, + const InternalKey& end, + std::vector* inputs); + + void GetRange(const std::vector& inputs, + InternalKey* smallest, + InternalKey* largest); + + Env* const env_; + const std::string dbname_; + const Options* const options_; + TableCache* const table_cache_; + const InternalKeyComparator icmp_; + uint64_t next_file_number_; + uint64_t manifest_file_number_; + + // Opened lazily + WritableFile* descriptor_file_; + log::Writer* descriptor_log_; + + // Versions are kept in a singly linked list that is never empty + Version* current_; // Pointer to the last (newest) list entry + Version* oldest_; // Pointer to the first (oldest) list entry + + // Map from large value reference to the set of + // values containing references to the value. We keep the + // internal key as a std::string rather than as an InternalKey because + // we want to be able to easily use a set. + typedef std::set > LargeReferencesSet; + typedef std::map LargeValueMap; + LargeValueMap large_value_refs_; + + // Per-level key at which the next compaction at that level should start. + // Either an empty string, or a valid InternalKey. + std::string compact_pointer_[config::kNumLevels]; + + // No copying allowed + VersionSet(const VersionSet&); + void operator=(const VersionSet&); +}; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // and "level+1" will be merged to produce a set of "level+1" files. + int level() const { return level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. 
+ VersionEdit* edit() { return &edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + private: + friend class Version; + friend class VersionSet; + + explicit Compaction(int level); + + int level_; + uint64_t max_output_file_size_; + Version* input_version_; + VersionEdit edit_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State for implementing IsBaseLevelForKey + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). + int level_ptrs_[config::kNumLevels]; +}; + +} + +#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ diff --git a/db/write_batch.cc b/db/write_batch.cc new file mode 100644 index 0000000..b6c4979 --- /dev/null +++ b/db/write_batch.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring | +// kTypeLargeValueRef varstring varstring | +// kTypeDeletion varstring +// varstring := +// len: varint32 +// data: uint8[len] + +#include "include/write_batch.h" + +#include "include/db.h" +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "util/coding.h" + +namespace leveldb { + +WriteBatch::WriteBatch() { + Clear(); +} + +WriteBatch::~WriteBatch() { } + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(12); +} + +int WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, int n) { + EncodeFixed32(&b->rep_[8], n); +} + +SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { + return SequenceNumber(DecodeFixed64(b->rep_.data())); +} + +void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { + EncodeFixed64(&b->rep_[0], seq); +} + +void WriteBatch::Put(const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeValue)); + PutLengthPrefixedSlice(&rep_, key); + PutLengthPrefixedSlice(&rep_, value); +} + +void WriteBatchInternal::PutLargeValueRef(WriteBatch* b, + const Slice& key, + const LargeValueRef& large_ref) { + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + b->rep_.push_back(static_cast(kTypeLargeValueRef)); + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, + Slice(large_ref.data, sizeof(large_ref.data))); +} + +void WriteBatch::Delete(const Slice& key) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeDeletion)); + PutLengthPrefixedSlice(&rep_, key); +} + +Status WriteBatchInternal::InsertInto(const WriteBatch* b, + MemTable* memtable) { + const int count = WriteBatchInternal::Count(b); + int found = 0; + Iterator it(*b); + for (; !it.Done(); it.Next()) { + switch (it.op()) { + case kTypeDeletion: + memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice()); + break; + case kTypeValue: + memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); + break; + case kTypeLargeValueRef: + memtable->Add(it.sequence_number(), kTypeLargeValueRef, + it.key(), it.value()); + break; + } + found++; + } + if (!it.status().ok()) { + return it.status(); + } else if (found != count) { + return Status::Corruption("wrong count in WriteBatch"); + } + return Status::OK(); +} + +void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { + assert(contents.size() >= 12); + b->rep_.assign(contents.data(), contents.size()); +} + +WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch) + : input_(WriteBatchInternal::Contents(&batch)), + done_(false) { + if (input_.size() < 12) { + done_ = true; + } else { + seq_ = WriteBatchInternal::Sequence(&batch), + input_.remove_prefix(12); + GetNextEntry(); + } +} + +void WriteBatchInternal::Iterator::Next() { + assert(!done_); + seq_++; + GetNextEntry(); +} + +void WriteBatchInternal::Iterator::GetNextEntry() { + if (input_.empty()) { + done_ = true; + return; + } + char tag = input_[0]; + input_.remove_prefix(1); + switch (tag) { + case kTypeValue: + case kTypeLargeValueRef: + if (GetLengthPrefixedSlice(&input_, &key_) && + GetLengthPrefixedSlice(&input_, &value_)) { + op_ = static_cast(tag); + } else { + 
status_ = Status::Corruption("bad WriteBatch Put"); + done_ = true; + input_.clear(); + } + break; + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input_, &key_)) { + op_ = kTypeDeletion; + } else { + status_ = Status::Corruption("bad WriteBatch Delete"); + done_ = true; + input_.clear(); + } + break; + default: + status_ = Status::Corruption("unknown WriteBatch tag"); + done_ = true; + input_.clear(); + break; + } +} + +} diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h new file mode 100644 index 0000000..df750c7 --- /dev/null +++ b/db/write_batch_internal.h @@ -0,0 +1,73 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ +#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ + +#include "include/write_batch.h" + +namespace leveldb { + +// WriteBatchInternal provides static methods for manipulating a +// WriteBatch that we don't want in the public WriteBatch interface. +class WriteBatchInternal { + public: + static void PutLargeValueRef(WriteBatch* batch, + const Slice& key, + const LargeValueRef& large_ref); + + // Return the number of entries in the batch. + static int Count(const WriteBatch* batch); + + // Set the count for the number of entries in the batch. + static void SetCount(WriteBatch* batch, int n); + + // Return the seqeunce number for the start of this batch. + static SequenceNumber Sequence(const WriteBatch* batch); + + // Store the specified number as the seqeunce number for the start of + // this batch. + static void SetSequence(WriteBatch* batch, SequenceNumber seq); + + static Slice Contents(const WriteBatch* batch) { + return Slice(batch->rep_); + } + + static size_t ByteSize(const WriteBatch* batch) { + return batch->rep_.size(); + } + + static void SetContents(WriteBatch* batch, const Slice& contents); + + static Status InsertInto(const WriteBatch* batch, MemTable* memtable); + + // Iterate over the contents of a write batch. + class Iterator { + public: + explicit Iterator(const WriteBatch& batch); + bool Done() const { return done_; } + void Next(); + ValueType op() const { return op_; } + const Slice& key() const { return key_; } + const Slice& value() const { return value_; } + SequenceNumber sequence_number() const { return seq_; } + Status status() const { return status_; } + + private: + void GetNextEntry(); + + Slice input_; + bool done_; + ValueType op_; + Slice key_; + Slice value_; + SequenceNumber seq_; + Status status_; + }; +}; + +} + + +#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc new file mode 100644 index 0000000..4963579 --- /dev/null +++ b/db/write_batch_test.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "include/db.h" + +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "include/env.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +static std::string PrintContents(WriteBatch* b) { + InternalKeyComparator cmp(BytewiseComparator()); + MemTable mem(cmp); + std::string state; + Status s = WriteBatchInternal::InsertInto(b, &mem); + Iterator* iter = mem.NewIterator(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + switch (ikey.type) { + case kTypeValue: + state.append("Put("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + break; + case kTypeLargeValueRef: + state.append("PutRef("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + break; + case kTypeDeletion: + state.append("Delete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + break; + } + state.append("@"); + state.append(NumberToString(ikey.sequence)); + } + delete iter; + if (!s.ok()) { + state.append("ParseError()"); + } + return state; +} + +class WriteBatchTest { }; + +TEST(WriteBatchTest, Empty) { + WriteBatch batch; + ASSERT_EQ("", PrintContents(&batch)); + ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); +} + +TEST(WriteBatchTest, Multiple) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + batch.Put(Slice("baz"), Slice("boo")); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("Put(baz, boo)@102" + "Delete(box)@101" + "Put(foo, bar)@100", + PrintContents(&batch)); +} + +TEST(WriteBatchTest, PutIndirect) { + WriteBatch batch; + batch.Put(Slice("baz"), Slice("boo")); + LargeValueRef h; + for (int i = 0; i < LargeValueRef::ByteSize(); i++) { + h.data[i] = (i < 20) ? 
'a' : 'b'; + } + WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), h); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(2, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("Put(baz, boo)@100" + "PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101", + PrintContents(&batch)); +} + +TEST(WriteBatchTest, Corruption) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + WriteBatchInternal::SetSequence(&batch, 200); + Slice contents = WriteBatchInternal::Contents(&batch); + WriteBatchInternal::SetContents(&batch, + Slice(contents.data(),contents.size()-1)); + ASSERT_EQ("Put(foo, bar)@200" + "ParseError()", + PrintContents(&batch)); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/doc/doc.css b/doc/doc.css new file mode 100644 index 0000000..700c564 --- /dev/null +++ b/doc/doc.css @@ -0,0 +1,89 @@ +body { + margin-left: 0.5in; + margin-right: 0.5in; + background: white; + color: black; +} + +h1 { + margin-left: -0.2in; + font-size: 14pt; +} +h2 { + margin-left: -0in; + font-size: 12pt; +} +h3 { + margin-left: -0in; +} +h4 { + margin-left: -0in; +} +hr { + margin-left: -0in; +} + +/* Definition lists: definition term bold */ +dt { + font-weight: bold; +} + +address { + text-align: center; +} +code,samp,var { + color: blue; +} +kbd { + color: #600000; +} +div.note p { + float: right; + width: 3in; + margin-right: 0%; + padding: 1px; + border: 2px solid #6060a0; + background-color: #fffff0; +} + +ul { + margin-top: -0em; + margin-bottom: -0em; +} + +ol { + margin-top: -0em; + margin-bottom: -0em; +} + +UL.nobullets { + list-style-type: none; + list-style-image: none; + margin-left: -1em; +} + +p { + margin: 1em 0 1em 0; + padding: 0 0 0 0; +} + +pre { + line-height: 1.3em; + padding: 0.4em 0 0.8em 0; + margin: 0 0 0 0; + border: 0 0 0 0; + color: blue; +} + +.datatable { + margin-left: auto; + margin-right: auto; + margin-top: 2em; + margin-bottom: 2em; + border: 1px solid; +} + +.datatable td,th { + padding: 0 0.5em 0 0.5em; + text-align: right; +} diff --git a/doc/impl.html b/doc/impl.html new file mode 100644 index 0000000..2f2c809 --- /dev/null +++ b/doc/impl.html @@ -0,0 +1,222 @@ + + + + +Leveldb file layout and compactions + + + + +

Files

+
+The implementation of leveldb is similar in spirit to the
+representation of a single Bigtable tablet (section 5.3).
+However, the organization of the files that make up the representation
+is somewhat different and is explained below.
+

+Each database is represented by a set of files stored in a directory.
+There are several different types of files as documented below:
+

+

Log files

+

+A log file (*.log) stores a sequence of recent updates. Each update +is appended to the current log file. When the log file reaches a +pre-determined size (approximately 1MB by default), it is converted +to a sorted table (see below) and a new log file is created for future +updates. +

+A copy of the current log file is kept in an in-memory structure (the +memtable). This copy is consulted on every read so that read +operations reflect all logged updates. +

+

Sorted tables

+

+A sorted table (*.sst) stores a sequence of entries sorted by key. +Each entry is either a value for the key, or a deletion marker for the +key. (Deletion markers are kept around to hide obsolete values +present in older sorted tables). +

+The set of sorted tables is organized into a sequence of levels.  The
+sorted table generated from a log file is placed in a special young
+level (also called level-0).  When the number of young files exceeds a
+certain threshold (currently four), all of the young files are merged
+together with all of the overlapping level-1 files to produce a
+sequence of new level-1 files (we create a new level-1 file for every
+2MB of data).
+

+Files in the young level may contain overlapping keys.  However, files
+in other levels have distinct non-overlapping key ranges.  Consider
+level number L where L >= 1.  When the combined size of files in
+level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
+...), one file in level-L and all of the overlapping files in
+level-(L+1) are merged to form a set of new files for level-(L+1).
+These merges have the effect of gradually migrating new updates from
+the young level to the largest level using only bulk reads and writes
+(i.e., minimizing expensive seeks).
+
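+The same rule can be read directly as code.  Below is a minimal
+sketch of the size limit; the constants are taken from the
+description above rather than from the implementation (which uses a
+helper named MaxBytesForLevel in db/version_set.cc when computing
+compaction scores):
+
+  static double MaxBytesForLevelSketch(int level) {
+    // 10MB for level-1, multiplied by 10 for each level after that.
+    // Level-0 is governed by a file-count limit instead (see above).
+    double result = 10.0 * 1048576.0;
+    while (level > 1) {
+      result *= 10.0;
+      level--;
+    }
+    return result;
+  }
+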

Large value files

+

+Each large value (greater than 64KB by default) is placed in a large +value file (*.val) of its own. An entry is maintained in the log +and/or sorted tables that maps from the corresponding key to the +name of this large value file. The name of the large value file +is derived from a SHA1 hash of the value and its length so that +identical values share the same file. +
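+As an illustration only (the real encoding lives in the
+implementation; Sha1HexString is a hypothetical helper wrapping the
+portable SHA1 code in port/sha1_portable.h), a name derived from the
+hash and the length might be assembled like this:
+
+  std::string LargeValueFileNameSketch(const std::string& dbname,
+                                       const leveldb::Slice& value) {
+    char len[32];
+    snprintf(len, sizeof(len), "-%llu", (unsigned long long) value.size());
+    // Identical values produce identical names, so they share a file.
+    return dbname + "/" + Sha1HexString(value) + len + ".val";
+  }
+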

+

Manifest

+

+A MANIFEST file lists the set of sorted tables that make up each +level, the corresponding key ranges, and other important metadata. +A new MANIFEST file (with a new number embedded in the file name) +is created whenever the database is reopened. The MANIFEST file is +formatted as a log, and changes made to the serving state (as files +are added or removed) are appended to this log. +

+

Current

+

+CURRENT is a simple text file that contains the name of the latest +MANIFEST file. +

+

Info logs

+

+Informational messages are printed to files named LOG and LOG.old. +

+

Others

+

+Other files used for miscellaneous purposes may also be present +(LOCK, *.dbtmp). + +

Level 0

+When the log file grows above a certain size (1MB by default): +
    +
  • Write the contents of the current memtable to an sstable +
  • Replace the current memtable by a brand new empty memtable +
  • Switch to a new log file +
  • Delete the old log file and the old memtable +
+Experimental measurements show that generating an sstable from a 1MB +log file takes ~12ms, which seems like an acceptable latency hiccup to +add infrequently to a log write. + +

+The new sstable is added to the special young level (level-0).
+Level-0 contains a set of files (up to 4 by default).  However, unlike
+other levels, these files do not cover disjoint ranges, but may
+overlap each other.
+

Compactions

+ +

+When the size of level L exceeds its limit, we compact it in a +background thread. The compaction picks a file from level L and all +overlapping files from the next level L+1. Note that if a level-L +file overlaps only part of a level-(L+1) file, the entire file at +level-(L+1) is used as an input to the compaction and will be +discarded after the compaction. Aside: because level-0 is special +(files in it may overlap each other), we treat compactions from +level-0 to level-1 specially: a level-0 compaction may pick more than +one level-0 file in case some of these files overlap each other. + +

+A compaction merges the contents of the picked files to produce a +sequence of level-(L+1) files. We switch to producing a new +level-(L+1) file after the current output file has reached the target +file size (2MB). The old files are discarded and the new files are +added to the serving state. + +

+Compactions for a particular level rotate through the key space. In +more detail, for each level L, we remember the ending key of the last +compaction at level L. The next compaction for level L will pick the +first file that starts after this key (wrapping around to the +beginning of the key space if there is no such file). + +

+Compactions drop overwritten values. They also drop deletion markers +if there are no higher numbered levels that contain a file whose range +overlaps the current key. + +

Timing

+ +Level-0 compactions will read up to four 1MB files from level-0, and +at worst all the level-1 files (10MB). I.e., we will read 14MB and +write 14MB. + +

+Other than the special level-0 compactions, we will pick one 2MB file
+from level L.  In the worst case, this will overlap ~12 files from
+level L+1 (10 because level-(L+1) is ten times the size of level-L,
+and another two at the boundaries since the file ranges at level-L
+will usually not be aligned with the file ranges at level-L+1).  The
+compaction will therefore read 26MB and write 26MB.  Assuming a disk
+IO rate of 100MB/s (a ballpark figure for modern drives), the worst
+compaction cost will be approximately 0.5 seconds.
+

+If we throttle the background writing to something small, say 10% of
+the full 100MB/s speed, a compaction may take up to 5 seconds.  If the
+user is writing at 10MB/s, we might build up lots of level-0 files
+(~50 to hold the 5*10MB).  This may significantly increase the cost of
+reads due to the overhead of merging more files together on every
+read.
+

+Solution 1: To reduce this problem, we might want to increase the log
+switching threshold when the number of level-0 files is large.  The
+downside is that the larger this threshold, the larger the delay we
+add to write latency when a write triggers a log switch.
+

+Solution 2: We might want to decrease write rate artificially when the +number of level-0 files goes up. + +

+Solution 3: We could work on reducing the cost of very wide merges.
+Perhaps most of the level-0 files will have their blocks sitting
+uncompressed in the cache, and we would only need to worry about the
+O(N) complexity of the merging iterator.
+

Number of files

+ +Instead of always making 2MB files, we could make larger files for +larger levels to reduce the total file count, though at the expense of +more bursty compactions. Alternatively, we could shard the set of +files into multiple directories. + +

+An experiment on an ext3 filesystem on Feb 04, 2011 shows
+the following timings to do 100K file opens in directories with
+varying number of files:
+
+  Files in directory | Microseconds to open a file
+  -------------------+----------------------------
+                1000 | 9
+               10000 | 10
+              100000 | 16
+So maybe even the sharding is not necessary on modern filesystems? + +

Recovery

+ +
    +
  • Read CURRENT to find name of the latest committed MANIFEST +
  • Read the named MANIFEST file +
  • Clean up stale files +
  • We could open all sstables here, but it is probably better to be lazy... +
  • Convert log chunk to a new level-0 sstable +
  • Start directing new writes to a new log file with recovered sequence# +
+ +

Garbage collection of files

+ +DeleteObsoleteFiles() is called at the end of every +compaction and at the end of recovery. It finds the names of all +files in the database. It deletes all log files that are not the +current log file. It deletes all table files that are not referenced +from some level and are not the output of an active compaction. It +deletes all large value files that are not referenced from any live +table or log file. + + + diff --git a/doc/index.html b/doc/index.html new file mode 100644 index 0000000..53471d2 --- /dev/null +++ b/doc/index.html @@ -0,0 +1,508 @@ + + + + +Leveldb + + + +

Leveldb

+
Jeff Dean, Sanjay Ghemawat
+

+The leveldb library provides a persistent key value store. Keys and +values are arbitrary byte arrays. The keys are ordered within the key +value store according to a user-specified comparator function. + +

+

Opening A Database

+

+A leveldb database has a name which corresponds to a file system
+directory.  All of the contents of the database are stored in this
+directory.  The following example shows how to open a database,
+creating it if necessary:
+

+

+  #include <cassert>
+  #include "leveldb/include/db.h"
+
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+  assert(status.ok());
+  ...
+
+If you want to raise an error if the database already exists, add +the following line before the leveldb::DB::Open call: +
+  options.error_if_exists = true;
+
+

Status

+

+You may have noticed the leveldb::Status type above. Values of this +type are returned by most functions in leveldb that may encounter an +error. You can check if such a result is ok, and also print an +associated error message: +

+

+   leveldb::Status s = ...;
+   if (!s.ok()) cerr << s.ToString() << endl;
+
+

Closing A Database

+

+When you are done with a database, just delete the database object. +Example: +

+

+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+
+

Reads And Writes

+

+The database provides Put, Delete, and Get methods to +modify/query the database. For example, the following code +moves the value stored under key1 to key2. +

+

+  std::string value;
+  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+  if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
+  if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
+
+See important performance note below for how to +speed up writes significantly. + +

Atomic Updates

+

+Note that if the process dies after the Put of key2 but before the +delete of key1, the same value may be left stored under multiple keys. +Such problems can be avoided by using the WriteBatch class to +atomically apply a set of updates: +

+

+  #include "leveldb/include/write_batch.h"
+  ...
+  std::string value;
+  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+  if (s.ok()) {
+    leveldb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db->Write(leveldb::WriteOptions(), &batch);
+  }
+
+The WriteBatch holds a sequence of edits to be made to the database, +and these edits within the batch are applied in order. Note that we +called Delete before Put so that if key1 is identical to key2, +we do not end up erroneously dropping the value entirely. +

+Apart from its atomicity benefits, WriteBatch may also be used to +speed up bulk updates by placing lots of individual mutations into the +same batch. +
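+For example, a bulk update loop might apply a batch every 1000
+mutations (the batch size and the parallel keys/values vectors here
+are made up for illustration):
+
+  leveldb::WriteBatch batch;
+  leveldb::Status s;
+  for (size_t i = 0; s.ok() && i < keys.size(); i++) {
+    batch.Put(keys[i], values[i]);
+    if ((i + 1) % 1000 == 0) {   // apply every 1000 updates together
+      s = db->Write(leveldb::WriteOptions(), &batch);
+      batch.Clear();
+    }
+  }
+  if (s.ok()) s = db->Write(leveldb::WriteOptions(), &batch);  // remainder
+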

+

Concurrency

+

+A database may only be opened by one process at a time. The leveldb +implementation acquires a lock from the operating system to prevent +misuse. Within a single process, the same leveldb::DB object may +be safely used by multiple concurrent threads. +

+

Iteration

+

+The following example demonstrates how to print all key,value pairs +in a database. +

+

+  leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+  delete it;
+
+The following variation shows how to process just the keys in the +range [start,limit): +

+

+  for (it->Seek(start);
+       it->Valid() && it->key().ToString() < limit;
+       it->Next()) {
+    ...
+  }
+
+You can also process entries in reverse order. (Caveat: reverse +iteration is currently a factor of two or three slower than forward +iteration.) +

+

+  for (it->SeekToLast(); it->Valid(); it->Prev()) {
+    ...
+  }
+
+

Snapshots

+

+Snapshots provide consistent read-only views over the entire state of +the key-value store. ReadOptions::snapshot may be non-NULL to indicate +that a read should operate on a particular version of the DB state. +If ReadOptions::snapshot is NULL, the read will operate on an +implicit snapshot of the current state. +

+Snapshots typically are created by the DB::GetSnapshot() method: +

+

+  leveldb::ReadOptions options;
+  options.snapshot = db->GetSnapshot();
+  ... apply some updates to db ...
+  leveldb::Iterator* iter = db->NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db->ReleaseSnapshot(options.snapshot);
+
+Note that when a snapshot is no longer needed, it should be released +using the DB::ReleaseSnapshot interface. This allows the +implementation to get rid of state that was being maintained just to +support reading as of that snapshot. +

+A Write operation can also return a snapshot that +represents the state of the database just after applying a particular +set of updates: +

+

+  leveldb::Snapshot* snapshot;
+  leveldb::WriteOptions write_options;
+  write_options.post_write_snapshot = &snapshot;
+  leveldb::Status status = db->Write(write_options, ...);
+  ... perform other mutations to db ...
+
+  leveldb::ReadOptions read_options;
+  read_options.snapshot = snapshot;
+  leveldb::Iterator* iter = db->NewIterator(read_options);
+  ... read as of the state just after the Write call returned ...
+  delete iter;
+
+  db->ReleaseSnapshot(snapshot);
+
+

Slice

+

+The return values of the it->key() and it->value() calls above
+are instances of the leveldb::Slice type.  Slice is a simple
+structure that contains a length and a pointer to an external byte
+array.  Returning a Slice is a cheaper alternative to returning a
+std::string since we do not need to copy potentially large keys and
+values.  In addition, leveldb methods do not return null-terminated
+C-style strings since leveldb keys and values are allowed to
+contain '\0' bytes.
+

+C++ strings and null-terminated C-style strings can be easily converted +to a Slice: +

+

+   leveldb::Slice s1 = "hello";
+
+   std::string str("world");
+   leveldb::Slice s2 = str;
+
+A Slice can be easily converted back to a C++ string: +
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+
+Be careful when using Slices since it is up to the caller to ensure that +the external byte array into which the Slice points remains live while +the Slice is in use. For example, the following is buggy: +

+

+   leveldb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+
+When the if statement goes out of scope, str will be destroyed and the +backing storage for slice will disappear. +
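+One way to avoid the bug is to make a copy whose lifetime encloses
+every use of the slice:
+
+   std::string str;
+   leveldb::Slice slice;
+   if (...) {
+     str = ...;     // str now outlives the if statement
+     slice = str;
+   }
+   Use(slice);      // safe: the backing storage is still alive
+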

+

Comparators

+

+The preceding examples used the default ordering function for keys,
+which orders bytes lexicographically.  You can however supply a custom
+comparator when opening a database.  For example, suppose each
+database key consists of two numbers and we sort by the first
+number, breaking ties by the second number.  First, define a proper
+subclass of leveldb::Comparator that expresses these rules:
+

+

+  class TwoPartComparator : public leveldb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a < b: negative result
+    //   if a > b: positive result
+    //   else: zero result
+    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &a1, &a2);
+      ParseKey(b, &b1, &b2);
+      if (a1 < b1) return -1;
+      if (a1 > b1) return +1;
+      if (a2 < b2) return -1;
+      if (a2 > b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+
+Now create a database using this custom comparator: +

+

+  TwoPartComparator cmp;
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &cmp;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+  ...
+
+

Backwards compatibility

+

+The result of the comparator's Name method is attached to the +database when it is created, and is checked on every subsequent +database open. If the name changes, the leveldb::DB::Open call will +fail. Therefore, change the name if and only if the new key format +and comparison function are incompatible with existing databases, and +it is ok to discard the contents of all existing databases. +

+You can however still gradually evolve your key format over time with
+a little bit of pre-planning.  For example, you could store a version
+number at the end of each key (one byte should suffice for most uses).
+When you wish to switch to a new key format (e.g., adding an optional
+third part to the keys processed by TwoPartComparator),
+(a) keep the same comparator name, (b) increment the version number
+for new keys, and (c) change the comparator function so it uses the
+version numbers found in the keys to decide how to interpret them.
+
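+A sketch of step (c), assuming the version number is stored in the
+last byte of each key as suggested above; ParseKeyV1 and ParseKeyV2
+are hypothetical helpers for the old and new formats, and a missing
+third part decodes as zero:
+
+  int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
+    int a1, a2, a3, b1, b2, b3;
+    // Dispatch on each key's own version byte (its last byte).
+    if (a[a.size()-1] == 1) { ParseKeyV1(a, &a1, &a2); a3 = 0; }
+    else                    { ParseKeyV2(a, &a1, &a2, &a3); }
+    if (b[b.size()-1] == 1) { ParseKeyV1(b, &b1, &b2); b3 = 0; }
+    else                    { ParseKeyV2(b, &b1, &b2, &b3); }
+    if (a1 != b1) return a1 < b1 ? -1 : +1;
+    if (a2 != b2) return a2 < b2 ? -1 : +1;
+    if (a3 != b3) return a3 < b3 ? -1 : +1;
+    return 0;
+  }
+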

+

Performance

+

+Performance can be tuned by changing the default values of the
+fields in the types defined in leveldb/include/options.h.
+
+

+

Asynchronous Writes

+
+By default, each write to leveldb is synchronous: it does
+not return until the write has been pushed from memory to persistent
+storage. (On Posix systems, this is implemented by calling either
+fdatasync(...) or msync(..., MS_SYNC).)
+Synchronous writes can be very slow, so synchrony can
+optionally be disabled:
+
+  leveldb::WriteOptions write_options;
+  write_options.sync = false;
+  db->Put(write_options, ...);
+
+Asynchronous writes are often more than a hundred times as fast as +synchronous writes. The downside of asynchronous writes is that a +crash of the machine may cause the last few updates to be lost. Note +that a crash of just the writing process (i.e., not a reboot) will not +cause any loss since even when sync is false, an update +is pushed from the process memory into the operating system before it +is considered done. + +

+Asynchronous writes can be particularly beneficial when loading a +large amount of data into the database since you can mitigate the +problem of lost updates by restarting the bulk load. A hybrid scheme +is also possible where every Nth write is synchronous, and in the +event of a crash, the bulk load is restarted just after the last +synchronous write finished by the previous run. + +
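+A sketch of the hybrid scheme follows. The names num_records, keys,
+values, and db are placeholders for the application's own data:
+
+  leveldb::WriteOptions async_options;
+  async_options.sync = false;
+  leveldb::WriteOptions sync_options;   // sync is true by default
+
+  for (int i = 0; i < num_records; i++) {
+    // Make every 1000th write synchronous; after a crash, the load can
+    // be restarted just past the last multiple of 1000 that completed.
+    const leveldb::WriteOptions& opts =
+        ((i + 1) % 1000 == 0) ? sync_options : async_options;
+    leveldb::Status s = db->Put(opts, keys[i], values[i]);
+    if (!s.ok()) break;   // stop and report the error
+  }
+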

+WriteBatch provides an alternative to asynchronous writes. +Multiple updates may be placed in the same WriteBatch and +applied together using a synchronous write. The extra cost of the +synchronous write will be amortized across all of the writes in the batch. + +
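+For example (key1, value1, etc. are placeholder variables), the
+following applies three updates at the cost of a single synchronous
+write:
+
+  leveldb::WriteBatch batch;
+  batch.Put(key1, value1);
+  batch.Delete(key2);
+  batch.Put(key3, value3);
+
+  leveldb::WriteOptions options;   // options.sync is true by default
+  leveldb::Status s = db->Write(options, &batch);
+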

+

Block size

+

+leveldb groups adjacent keys together into the same block and such a +block is the unit of transfer to and from persistent storage. The +default block size is approximately 8192 uncompressed bytes. +Applications that mostly do bulk scans over the contents of the +database may wish to increase this size. Applications that do a lot +of point reads of small values may wish to switch to a smaller block +size if performance measurements indicate an improvement. There isn't +much benefit in using blocks smaller than one kilobyte, or larger than +a few megabytes. Also note that compression will be more effective +with larger block sizes. +
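+For example, an application that mostly does bulk scans might try a
+larger block size (the exact value should be chosen by benchmarking):
+
+  leveldb::Options options;
+  options.block_size = 64 * 1024;   // 64KB uncompressed blocks
+  ... leveldb::DB::Open(options, name, ...) ....
+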

+

Compression

+

+Each block is individually compressed before being written to +persistent storage. Compression is on by default since the default +compression method is very fast, and is automatically disabled for +uncompressible data. In rare cases, applications may want to disable +compression entirely, but should only do so if benchmarks show a +performance improvement: +

+

+  leveldb::Options options;
+  options.compression = leveldb::kNoCompression;
+  ... leveldb::DB::Open(options, name, ...) ....
+
+

Cache

+

+The contents of the database are stored in a set of files in the
+filesystem, and each file stores a sequence of compressed blocks. If
+options.block_cache is non-NULL, it is used to cache frequently
+used uncompressed block contents.
+

+

+  #include "leveldb/include/cache.h"
+
+  leveldb::Options options;
+  options.block_cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
+  leveldb::DB* db;
+  leveldb::DB::Open(options, name, &db);
+  ... use the db ...
+  delete db;
+  delete options.block_cache;
+
+Note that the cache holds uncompressed data, and therefore it should +be sized according to application level data sizes, without any +reduction from compression. (Caching of compressed blocks is left to +the operating system buffer cache, or any custom Env +implementation provided by the client.) +

+When performing a bulk read, the application may wish to disable +caching so that the data processed by the bulk read does not end up +displacing most of the cached contents. A per-iterator option can be +used to achieve this: +

+

+  leveldb::ReadOptions options;
+  options.fill_cache = false;
+  leveldb::Iterator* it = db->NewIterator(options);
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ...
+  }
+
+

Key Layout

+

+Note that the unit of disk transfer and caching is a block. Adjacent +keys (according to the database sort order) will usually be placed in +the same block. Therefore the application can improve its performance +by placing keys that are accessed together near each other and placing +infrequently used keys in a separate region of the key space. +

+For example, suppose we are implementing a simple file system on top +of leveldb. The types of entries we might wish to store are: +

+

+   filename -> permission-bits, length, list of file_block_ids
+   file_block_id -> data
+
+We might want to prefix filename keys with one letter (say '/') and the +file_block_id keys with a different letter (say '0') so that scans +over just the metadata do not force us to fetch and cache bulky file +contents. +
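+A sketch of this layout (the helper functions below are hypothetical,
+not part of leveldb):
+
+  // '/' prefixes metadata keys; '0' prefixes file-block keys, so a
+  // scan over the '/' range never touches bulky file contents.
+  std::string MetadataKey(const std::string& filename) {
+    return "/" + filename;
+  }
+  std::string BlockKey(const std::string& file_block_id) {
+    return "0" + file_block_id;
+  }
+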

+

Large Values

+

+leveldb has special treatment of large values (by default, a value
+of length greater than or equal to 64K is considered large, though
+the large_value_threshold field in Options can be used to adjust this
+threshold). Each such large value is placed in a separate operating
+system file, and the normal database blocks just contain pointers to
+such files.
+

+Furthermore, if the same large value occurs multiple times in a single +database, it will be stored just once. +

+

Checksums

+

+leveldb associates checksums with all data it stores in the file system. +There are two separate controls provided over how aggressively these +checksums are verified: +

+

    +
  • ReadOptions::verify_checksums may be set to true to force + checksum verification of all data that is read from the file system on + behalf of a particular read. By default, no such verification is + done. +

    +

  • Options::paranoid_checks may be set to true before opening a + database to make the database implementation raise an error as soon as + it detects an internal corruption. Depending on which portion of the + database has been corrupted, the error may be raised when the database + is opened, or later by another database operation. By default, + paranoid checking is off so that the database can be used even if + parts of its persistent storage have been corrupted. +

    + If a database is corrupted (perhaps it cannot be opened when
+ paranoid checking is turned on), the leveldb::RepairDB function
+ may be used to recover as much of the data as possible.
+
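+Both controls in use (a sketch; key, value, and db are placeholders
+for the application's own variables):
+
+  leveldb::ReadOptions read_options;
+  read_options.verify_checksums = true;  // verify all data this read touches
+  std::string value;
+  leveldb::Status s = db->Get(read_options, key, &value);
+
+  leveldb::Options options;
+  options.paranoid_checks = true;  // surface corruption as early as possible
+  leveldb::Status open_status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+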

    +

+

Approximate Sizes

+

+The GetApproximateSizes method can be used to get the approximate
+number of bytes of file system space used by one or more key ranges.
+

+

+   leveldb::Range ranges[2];
+   ranges[0] = leveldb::Range("a", "c");
+   ranges[1] = leveldb::Range("x", "z");
+   uint64_t sizes[2];
+   leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
+
+The preceding call will set sizes[0] to the approximate number of +bytes of file system space used by the key range [a..c) and +sizes[1] to the approximate number of bytes used by the key range +[x..z). +

+

Environment

+

+All file operations (and other operating system calls) issued by the +leveldb implementation are routed through a leveldb::Env object. +Sophisticated clients may wish to provide their own Env +implementation to get better control. For example, an application may +introduce artificial delays in the file IO paths to limit the impact +of leveldb on other activities in the system. +

+

+  class SlowEnv : public leveldb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  leveldb::Options options;
+  options.env = &env;
+  leveldb::Status s = leveldb::DB::Open(options, ...);
+
+

Porting

+

+leveldb may be ported to a new platform by providing platform +specific implementations of the types/methods/functions exported by +leveldb/port/port.h. See leveldb/port/port_example.h for more +details. +

+In addition, the new platform may need a new default leveldb::Env
+implementation. See leveldb/util/env_posix.cc for an example.
+
+

Other Information

+ +

+Details about the leveldb implementation may be found in +the following documents: +

+ + + diff --git a/doc/log_format.txt b/doc/log_format.txt new file mode 100644 index 0000000..9a801d4 --- /dev/null +++ b/doc/log_format.txt @@ -0,0 +1,72 @@ +The log file contents are a sequence of 32KB blocks. The only +exception is that the tail of the file may contain a partial block. + +Each block consists of a sequence of records: + block := record* trailer? + record := + checksum: uint32 // crc32c of type and data[] + length: uint16 + type: uint8 // One of FULL, FIRST, MIDDLE, LAST + data: uint8[length] + +A record never starts within the last seven bytes of a block. Any +leftover bytes here form the trailer, which must consist entirely of +zero bytes and must be skipped by readers. In particular, even if +there are exactly seven bytes left in the block, and a zero-length +user record is added (which will fit in these seven bytes), the writer +must skip these trailer bytes and add the record to the next block. + +More types may be added in the future. Some Readers may skip record +types they do not understand, others may report that some data was +skipped. + +FULL == 1 +FIRST == 2 +MIDDLE == 3 +LAST == 4 + +The FULL record contains the contents of an entire user record. + +FIRST, MIDDLE, LAST are types used for user records that have been +split into multiple fragments (typically because of block boundaries). +FIRST is the type of the first fragment of a user record, LAST is the +type of the last fragment of a user record, and MID is the type of all +interior fragments of a user record. + +Example: consider a sequence of user records: + A: length 1000 + B: length 97270 + C: length 8000 +A will be stored as a FULL record in the first block. + +B will be split into three fragments: first fragment occupies the rest +of the first block, second fragment occupies the entirety of the +second block, and the third fragment occupies a prefix of the third +block. This will leave six bytes free in the third block, which will +be left empty as the trailer. + +C will be stored as a FULL record in the fourth block. + +=================== + +Some benefits over the recordio format: + +(1) We do not need any heuristics for resyncing - just go to next +block boundary and scan. If there is a corruption, skip to the next +block. As a side-benefit, we do not get confused when part of the +contents of one log file are embedded as a record inside another log +file. + +(2) Splitting at approximate boundaries (e.g., for mapreduce) is +simple: find the next block boundary and skip records until we +hit a FULL or FIRST record. + +(3) We do not need extra buffering for large records. + +Some downsides compared to recordio format: + +(1) No packing of tiny records. This could be fixed by adding a new +record type, so it is a shortcoming of the current implementation, +not necessarily the format. + +(2) No compression. Again, this could be fixed by adding new record types. diff --git a/doc/table_format.txt b/doc/table_format.txt new file mode 100644 index 0000000..ad5aa4b --- /dev/null +++ b/doc/table_format.txt @@ -0,0 +1,61 @@ +File format +=========== + + + [data block 1] + [data block 2] + ... + [data block N] + [meta block 1] + ... + [meta block K] + [metaindex block] + [index block] + [Footer] (fixed size; starts at file_size - sizeof(Footer)) + + +The file contains internal pointers. 
Each such pointer is called +a BlockHandle and contains the following information: + offset: varint64 + size: varint64 + +(1) The sequence of key/value pairs in the file are stored in sorted +order and partitioned into a sequence of data blocks. These blocks +come one after another at the beginning of the file. Each data block +is formatted according to the code in block_builder.cc, and then +optionally compressed. + +(2) After the data blocks we store a bunch of meta blocks. The +supported meta block types are described below. More meta block types +may be added in the future. Each meta block is again formatted using +block_builder.cc and then optionally compressed. + +(3) A "metaindex" block. It contains one entry for every other meta +block where the key is the name of the meta block and the value is a +BlockHandle pointing to that meta block. + +(4) An "index" block. This block contains one entry per data block, +where the key is a string >= last key in that data block and before +the first key in the successive data block. The value is the +BlockHandle for the data block. + +(6) At the very end of the file is a fixed length footer that contains +the BlockHandle of the metaindex and index blocks as well as a magic number. + metaindex_handle: char[p]; // Block handle for metaindex + index_handle: char[q]; // Block handle for index + padding: char[40-p-q]; // 0 bytes to make fixed length + // (40==2*BlockHandle::kMaxEncodedLength) + magic: fixed64; // == 0xdb4775248b80fb57 + +"stats" Meta Block +------------------ + +This meta block contains a bunch of stats. The key is the name +of the statistic. The value contains the statistic. +TODO(postrelease): record following stats. + data size + index size + key size (uncompressed) + value size (uncompressed) + number of entries + number of data blocks diff --git a/include/cache.h b/include/cache.h new file mode 100644 index 0000000..6c98cb8 --- /dev/null +++ b/include/cache.h @@ -0,0 +1,99 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Cache is an interface that maps keys to values. It has internal +// synchronization and may be safely accessed concurrently from +// multiple threads. It may automatically evict entries to make room +// for new entries. Values have a specified charge against the cache +// capacity. For example, a cache where the values are variable +// length strings, may use the length of the string as the charge for +// the string. +// +// A builtin cache implementation with a least-recently-used eviction +// policy is provided. Clients may use their own implementations if +// they want something more sophisticated (like scan-resistance, a +// custom eviction policy, variable cache sizing, etc.) + +#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_ +#define STORAGE_LEVELDB_INCLUDE_CACHE_H_ + +#include +#include "include/slice.h" + +namespace leveldb { + +class Cache; + +// Create a new cache with a fixed size capacity. This implementation +// of Cache uses a least-recently-used eviction policy. +extern Cache* NewLRUCache(size_t capacity); + +class Cache { + public: + Cache() { } + + // Destroys all existing entries by calling the "deleter" + // function that was passed to the constructor. + virtual ~Cache(); + + // Opaque handle to an entry stored in the cache. 
+ struct Handle { }; + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // + // Returns a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) = 0; + + // If the cache has no mapping for "key", returns NULL. + // + // Else return a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + virtual Handle* Lookup(const Slice& key) = 0; + + // Release a mapping returned by a previous Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void Release(Handle* handle) = 0; + + // Return the value encapsulated in a handle returned by a + // successful Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void* Value(Handle* handle) = 0; + + // If the cache contains entry for key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + + // Return a new numeric id. May be used by multiple clients who are + // sharing the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. + virtual uint64_t NewId() = 0; + + private: + void LRU_Remove(Handle* e); + void LRU_Append(Handle* e); + void Unref(Handle* e); + + struct Rep; + Rep* rep_; + + // No copying allowed + Cache(const Cache&); + void operator=(const Cache&); +}; + +} + +#endif // STORAGE_LEVELDB_UTIL_CACHE_H_ diff --git a/include/comparator.h b/include/comparator.h new file mode 100644 index 0000000..4e00e4d --- /dev/null +++ b/include/comparator.h @@ -0,0 +1,61 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ +#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ + +#include + +namespace leveldb { + +class Slice; + +// A Comparator object provides a total order across slices that are +// used as keys in an sstable or a database. +class Comparator { + public: + virtual ~Comparator(); + + // Three-way comparison. Returns value: + // < 0 iff "a" < "b", + // == 0 iff "a" == "b", + // > 0 iff "a" > "b" + virtual int Compare(const Slice& a, const Slice& b) const = 0; + + // The name of the comparator. Used to check for comparator + // mismatches (i.e., a DB created with one comparator is + // accessed using a different comparator. + // + // The client of this package should switch to a new name whenever + // the comparator implementation changes in a way that will cause + // the relative ordering of any two keys to change. + // + // Names starting with "leveldb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Advanced functions: these are used to reduce the space requirements + // for internal data structures like index blocks. 
+ + // If *start < limit, changes *start to a short string in [start,limit). + // Simple comparator implementations may return with *start unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const = 0; + + // Changes *key to a short string >= *key. + // Simple comparator implementations may return with *key unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortSuccessor(std::string* key) const = 0; +}; + +// Return a builtin comparator that uses lexicographic byte-wise +// ordering. The result remains the property of this module and +// must not be deleted. +extern const Comparator* BytewiseComparator(); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ diff --git a/include/db.h b/include/db.h new file mode 100644 index 0000000..c4d152d --- /dev/null +++ b/include/db.h @@ -0,0 +1,137 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_ +#define STORAGE_LEVELDB_INCLUDE_DB_H_ + +#include +#include +#include "include/iterator.h" +#include "include/options.h" + +namespace leveldb { + +struct Options; +struct ReadOptions; +struct WriteOptions; + +class Snapshot; +class WriteBatch; + +// Some internal types. Clients should ignore. +class WriteBatchInternal; + +struct Range { + Slice start; + Slice limit; + + Range(const Slice& s, const Slice& l) : start(s), limit(l) { } +}; + +// A DB is a persistent ordered map from keys to values. +class DB { + public: + // Open the database with the specified "name". + // Stores a pointer to a heap-allocated database in *dbptr and returns + // OK on success. + // Stores NULL in *dbptr and returns a non-OK status on error. + // Caller should delete *dbptr when it is no longer needed. + static Status Open(const Options& options, + const std::string& name, + DB** dbptr); + + DB() { } + virtual ~DB(); + + // Set the database entry for "key" to "value". Returns OK on success, + // and a non-OK status on error. + // Note: consider setting options.sync = false. + virtual Status Put(const WriteOptions& options, + const Slice& key, + const Slice& value) = 0; + + // Remove the database entry (if any) for "key". Returns OK on + // success, and a non-OK status on error. It is not an error if "key" + // did not exist in the database. + // Note: consider setting options.sync = false. + virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; + + // Apply the specified updates to the database. + // Returns OK on success, non-OK on failure. + // Note: consider setting options.sync = false. + virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; + + // If the database contains an entry for "key" store the + // corresponding value in *value and return OK. + // + // If there is no entry for "key" leave *value unchanged and return + // a status for which Status::IsNotFound() returns true. + // + // May return some other Status on an error. + virtual Status Get(const ReadOptions& options, + const Slice& key, std::string* value) = 0; + + // Return a heap-allocated iterator over the contents of the database. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). 
+ // + // Caller should delete the iterator when it is no longer needed. + // The returned iterator should be deleted before this db is deleted. + virtual Iterator* NewIterator(const ReadOptions& options) = 0; + + // Return a handle to the current DB state. Iterators created with + // this handle will all observe a stable snapshot of the current DB + // state. The caller must call ReleaseSnapshot(result) when the + // snapshot is no longer needed. + virtual const Snapshot* GetSnapshot() = 0; + + // Release a previously acquired snapshot. The caller must not + // use "snapshot" after this call. + virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; + + // DB implementations can export properties about their state + // via this method. If "property" is a valid property understood by this + // DB implementation, fills "*value" with its current value and returns + // true. Otherwise returns false. + // + // + // Valid property names include: + // + // "leveldb.num-files-at-level" - return the number of files at level , + // where is an ASCII representation of a level number (e.g. "0"). + virtual bool GetProperty(const Slice& property, uint64_t* value) = 0; + + // For each i in [0,n-1], store in "sizes[i]", the approximate + // file system space used by keys in "[range[i].start .. range[i].limit)". + // + // Note that the returned sizes measure file system space usage, so + // if the user data compresses by a factor of ten, the returned + // sizes will be one-tenth the size of the corresponding user data size. + // + // The results may not include the sizes of recently written data. + virtual void GetApproximateSizes(const Range* range, int n, + uint64_t* sizes) = 0; + + // Possible extensions: + // (1) Add a method to compact a range of keys + + private: + // No copying allowed + DB(const DB&); + void operator=(const DB&); +}; + +// Destroy the contents of the specified database. +// Be very careful using this method. +Status DestroyDB(const std::string& name, const Options& options); + +// If a DB cannot be opened, you may attempt to call this method to +// resurrect as much of the contents of the database as possible. +// Some data may be lost, so be careful when calling this function +// on a database that contains important information. +Status RepairDB(const std::string& dbname, const Options& options); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_DB_H_ diff --git a/include/env.h b/include/env.h new file mode 100644 index 0000000..a728f29 --- /dev/null +++ b/include/env.h @@ -0,0 +1,293 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An Env is an interface used by the leveldb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. + +#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ +#define STORAGE_LEVELDB_INCLUDE_ENV_H_ + +#include +#include +#include +#include +#include "include/status.h" + +namespace leveldb { + +class FileLock; +class RandomAccessFile; +class SequentialFile; +class Slice; +class WritableFile; + +class Env { + public: + Env() { } + virtual ~Env(); + + // Return a default environment suitable for the current operating + // system. 
Sophisticated users may wish to provide their own Env + // implementation instead of relying on this default environment. + // + // The result of Default() belongs to leveldb and must never be deleted. + static Env* Default(); + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores NULL in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result) = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores NULL in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) = 0; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores NULL in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) = 0; + + // Returns true iff the named file exists. + virtual bool FileExists(const std::string& fname) = 0; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + virtual Status GetChildren(const std::string& dir, + std::vector* result) = 0; + + // Delete the named file. + virtual Status DeleteFile(const std::string& fname) = 0; + + // Create the specified directory. + virtual Status CreateDir(const std::string& dirname) = 0; + + // Delete the specified directory. + virtual Status DeleteDir(const std::string& dirname) = 0; + + // Store the size of fname in *file_size. + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0; + + // Rename file src to target. + virtual Status RenameFile(const std::string& src, + const std::string& target) = 0; + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores NULL in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + virtual Status LockFile(const std::string& fname, FileLock** lock) = 0; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + virtual Status UnlockFile(FileLock* lock) = 0; + + // Arrange to run "(*function)(arg)" once in a background thread. + // + // "function" may run in an unspecified thread. 
Multiple functions + // added to the same Env may run concurrently in different threads. + // I.e., the caller may not assume that background work items are + // serialized. + virtual void Schedule( + void (*function)(void* arg), + void* arg) = 0; + + // Start a new thread, invoking "function(arg)" within the new thread. + // When "function(arg)" returns, the thread will be destroyed. + virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + + // *path is set to a temporary directory that can be used for testing. It may + // or many not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + virtual Status GetTestDirectory(std::string* path) = 0; + + // Write an entry to the log file with the specified format. + virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0; + + // Returns the number of micro-seconds since some fixed point in time. Only + // useful for computing deltas of time. + virtual uint64_t NowMicros() = 0; + + // Sleep/delay the thread for the perscribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + private: + // No copying allowed + Env(const Env&); + void operator=(const Env&); +}; + +// A file abstraction for reading sequentially through a file +class SequentialFile { + public: + SequentialFile() { } + virtual ~SequentialFile(); + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + virtual Status Read(size_t n, Slice* result, char* scratch) = 0; +}; + +// A file abstraction for randomly reading the contents of a file. +class RandomAccessFile { + public: + RandomAccessFile() { } + virtual ~RandomAccessFile(); + + // Return the length of this file in bytes. + virtual uint64_t Size() const = 0; + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). If an error was encountered, returns a + // non-OK status. + // + // Safe for concurrent use by multiple threads. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class WritableFile { + public: + WritableFile() { } + virtual ~WritableFile(); + + virtual Status Append(const Slice& data) = 0; + virtual Status Close() = 0; + virtual Status Flush() = 0; + virtual Status Sync() = 0; + + private: + // No copying allowed + WritableFile(const WritableFile&); + void operator=(const WritableFile&); +}; + +// Identifies a locked file. +class FileLock { + public: + FileLock() { } + virtual ~FileLock(); + private: + // No copying allowed + FileLock(const FileLock&); + void operator=(const FileLock&); +}; + +// Log the specified data to *info_log if info_log is non-NULL. +extern void Log(Env* env, WritableFile* info_log, const char* format, ...) +# if defined(__GNUC__) || defined(__clang__) + __attribute__((__format__ (__printf__, 3, 4))) +# endif + ; + +// A utility routine: write "data" to the named file. 
+extern Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname); + +// A utility routine: read contents of named file into *data +extern Status ReadFileToString(Env* env, const std::string& fname, + std::string* data); + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. +class EnvWrapper : public Env { + public: + // Initialize an EnvWrapper that delegates all calls to *target + explicit EnvWrapper(Env* target) : target_(target) { } + virtual ~EnvWrapper(); + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, SequentialFile** r) { + return target_->NewSequentialFile(f, r); + } + Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { + return target_->NewRandomAccessFile(f, r); + } + Status NewWritableFile(const std::string& f, WritableFile** r) { + return target_->NewWritableFile(f, r); + } + bool FileExists(const std::string& f) { return target_->FileExists(f); } + Status GetChildren(const std::string& dir, std::vector* r) { + return target_->GetChildren(dir, r); + } + Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); } + Status CreateDir(const std::string& d) { return target_->CreateDir(d); } + Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); } + Status GetFileSize(const std::string& f, uint64_t* s) { + return target_->GetFileSize(f, s); + } + Status RenameFile(const std::string& s, const std::string& t) { + return target_->RenameFile(s, t); + } + Status LockFile(const std::string& f, FileLock** l) { + return target_->LockFile(f, l); + } + Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); } + void Schedule(void (*f)(void*), void* a) { + return target_->Schedule(f, a); + } + void StartThread(void (*f)(void*), void* a) { + return target_->StartThread(f, a); + } + virtual Status GetTestDirectory(std::string* path) { + return target_->GetTestDirectory(path); + } + virtual void Logv(WritableFile* log, const char* format, va_list ap) { + return target_->Logv(log, format, ap); + } + uint64_t NowMicros() { + return target_->NowMicros(); + } + void SleepForMicroseconds(int micros) { + target_->SleepForMicroseconds(micros); + } + private: + Env* target_; +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_ diff --git a/include/iterator.h b/include/iterator.h new file mode 100644 index 0000000..b0872a3 --- /dev/null +++ b/include/iterator.h @@ -0,0 +1,95 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An iterator yields a sequence of key/value pairs from a source. +// The following class defines the interface. Multiple implementations +// are provided by this library. In particular, iterators are provided +// to access the contents of a Table or a DB. + +#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ +#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ + +#include "include/slice.h" +#include "include/status.h" + +namespace leveldb { + +class Iterator { + public: + Iterator(); + virtual ~Iterator(); + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. 
+ virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. + virtual void SeekToLast() = 0; + + // Position at the first key in the source that at or past target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + virtual void Seek(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the previous entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the first entry in source. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: !AtEnd() && !AtStart() + virtual Slice value() const = 0; + + // If an error has occurred, return it. Else return an ok status. + virtual Status status() const = 0; + + // Clients are allowed to register function/arg1/arg2 triples that + // will be invoked when this iterator is destroyed. + // + // Note that unlike all of the preceding methods, this method is + // not abstract and therefore clients should not override it. + typedef void (*CleanupFunction)(void* arg1, void* arg2); + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + + private: + struct Cleanup { + CleanupFunction function; + void* arg1; + void* arg2; + Cleanup* next; + }; + Cleanup cleanup_; + + // No copying allowed + Iterator(const Iterator&); + void operator=(const Iterator&); +}; + +// Return an empty iterator (yields nothing). +extern Iterator* NewEmptyIterator(); + +// Return an empty iterator with the specified status. +extern Iterator* NewErrorIterator(const Status& status); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ diff --git a/include/options.h b/include/options.h new file mode 100644 index 0000000..1105570 --- /dev/null +++ b/include/options.h @@ -0,0 +1,203 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ +#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ + +#include + +namespace leveldb { + +class Cache; +class Comparator; +class Env; +class Snapshot; +class WritableFile; + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. +enum CompressionType { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. 
+ kNoCompression = 0x0, + kLightweightCompression = 0x1, +}; + +// Options to control the behavior of a database (passed to DB::Open) +struct Options { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // Default: false + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be to written to info_log if it is non-NULL, or to a file stored + // in the same directory as the DB contents if info_log is NULL. + // Default: NULL + WritableFile* info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory before converting to an + // on-disk file. + // + // Some DB operations may encounter a delay proportional to the size + // of this parameter. Therefore we recommend against increasing + // this parameter unless you are willing to live with an occasional + // slow operation in exchange for faster bulk loading throughput. + // + // Default: 1MB + size_t write_buffer_size; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Handle values larger than "large_value_threshold" bytes + // specially, by writing them into their own files (to avoid + // compaction overhead) and doing content-based elimination of + // duplicate values to save space. + // + // We recommend against changing this value. + // + // Default: 64K + size_t large_value_threshold; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). + + // Use the specified cache for blocks (if non-NULL). + // Default: NULL + Cache* block_cache; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 8K + int block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. 
+ // + // Default: kLightweightCompression, which gives lightweight but fast + // compression. + // + // Typical speeds of kLightweightCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kLightweightCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // Create an Options object with default values for all fields. + Options(); +}; + +// Options that control read operations +struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: false + bool verify_checksums; + + // Should the data read for this iteration be cached in memory? + // Callers may wish to set this field to false for bulk scans. + // Default: true + bool fill_cache; + + // If "snapshot" is non-NULL, read as of the supplied snapshot + // (which must belong to the DB that is being read and which must + // not have been released). If "snapshot" is NULL, use an impliicit + // snapshot of the state at the beginning of this read operation. + // Default: NULL + const Snapshot* snapshot; + + ReadOptions() + : verify_checksums(false), + fill_cache(true), + snapshot(NULL) { + } +}; + +// Options that control write operations +struct WriteOptions { + // If true, the write will be flushed from the operating system + // buffer cache (by calling WritableFile::Sync()) before the write + // is considered complete. If this flag is true, writes will be + // slower. + // + // If this flag is false, and the machine crashes, some recent + // writes may be lost. Note that if it is just the process that + // crashes (i.e., the machine does not reboot), no writes will be + // lost even if sync==false. + // + // Default: true + bool sync; + + // If "post_write_snapshot" is non-NULL, and the write succeeds, + // *post_write_snapshot will be modified to point to a snapshot of + // the DB state immediately after this write. The caller must call + // DB::ReleaseSnapshot(*post_write_snapshotsnapshot) when the + // snapshot is no longer needed. + // + // If "post_write_snapshot" is non-NULL, and the write fails, + // *post_write_snapshot will be set to NULL. + // + // Default: NULL + const Snapshot** post_write_snapshot; + + WriteOptions() + : sync(true), + post_write_snapshot(NULL) { + } +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ diff --git a/include/slice.h b/include/slice.h new file mode 100644 index 0000000..62cb894 --- /dev/null +++ b/include/slice.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. + +#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ +#define STORAGE_LEVELDB_INCLUDE_SLICE_H_ + +#include +#include +#include +#include + +namespace leveldb { + +class Slice { + public: + // Create an empty slice. + Slice() : data_(""), size_(0) { } + + // Create a slice that refers to data[0,n-1]. 
+ Slice(const char* data, size_t n) : data_(data), size_(n) { } + + // Create a slice that refers to the contents of "s" + Slice(const std::string& s) : data_(s.data()), size_(s.size()) { } + + // Create a slice that refers to s[0,strlen(s)-1] + Slice(const char* s) : data_(s), size_(strlen(s)) { } + + // Return a pointer to the beginning of the referenced data + const char* data() const { return data_; } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + char operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { data_ = ""; size_ = 0; } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + // Return a string that contains the copy of the referenced data. + std::string ToString() const { return std::string(data_, size_); } + + // Three-way comparison. Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_, x.data_, x.size_) == 0)); + } + + private: + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (memcmp(x.data(), y.data(), x.size()) == 0)); +} + +inline bool operator!=(const Slice& x, const Slice& y) { + return !(x == y); +} + +inline int Slice::compare(const Slice& b) const { + const int min_len = (size_ < b.size_) ? size_ : b.size_; + int r = memcmp(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) r = -1; + else if (size_ > b.size_) r = +1; + } + return r; +} + +} + + +#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_ diff --git a/include/status.h b/include/status.h new file mode 100644 index 0000000..cd148f6 --- /dev/null +++ b/include/status.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. + +#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ +#define STORAGE_LEVELDB_INCLUDE_STATUS_H_ + +#include +#include +#include "include/slice.h" + +namespace leveldb { + +class Status { + public: + // Create a success status. + Status() : state_(NULL) { } + ~Status() { delete state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. 
+ static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotFound, msg, Slice()); + } + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kCorruption, msg, msg2); + } + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotSupported, msg, msg2); + } + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, msg, msg2); + } + static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, msg, msg2); + } + + // Returns true iff the status indicates success. + bool ok() const { return (state_ == NULL); } + + // Returns true iff the status indicates a NotFound error. + bool IsNotFound() const { return code() == kNotFound; } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + private: + enum Code { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5, + }; + Code code() const { return (state_ == NULL) ? kOk : state_->first; } + + Status(Code code, const Slice& msg, const Slice& msg2); + + typedef std::pair State; + State* state_; +}; + +inline Status::Status(const Status& s) { + state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); +} +inline void Status::operator=(const Status& s) { + if (this != &s) { + delete state_; + state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); + } +} + +} + +#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_ diff --git a/include/table.h b/include/table.h new file mode 100644 index 0000000..96b2196 --- /dev/null +++ b/include/table.h @@ -0,0 +1,67 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_ +#define STORAGE_LEVELDB_INCLUDE_TABLE_H_ + +#include +#include "include/iterator.h" + +namespace leveldb { + +class Block; +class BlockHandle; +struct Options; +class RandomAccessFile; +struct ReadOptions; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. +class Table { + public: + // Attempt to open the table that is stored in "file", and read the + // metadata entries necessary to allow retrieving data from the table. + // + // If successful, returns ok and sets "*table" to the newly opened + // table. The client should delete "*table" when no longer needed. + // If there was an error while initializing the table, sets "*table" + // to NULL and returns a non-ok status. Does not take ownership of + // "*source", but the client must ensure that "source" remains live + // for the duration of the returned table's lifetime. + // + // *file must remain live while this Table is in use. + static Status Open(const Options& options, + RandomAccessFile* file, + Table** table); + + ~Table(); + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + Iterator* NewIterator(const ReadOptions&) const; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). 
The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + uint64_t ApproximateOffsetOf(const Slice& key) const; + + private: + struct Rep; + Rep* rep_; + + explicit Table(Rep* rep) { rep_ = rep; } + static Iterator* BlockReader(void*, const ReadOptions&, const Slice&); + + // No copying allowed + Table(const Table&); + void operator=(const Table&); +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_TABLE_H_ diff --git a/include/table_builder.h b/include/table_builder.h new file mode 100644 index 0000000..ecd852e --- /dev/null +++ b/include/table_builder.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). + +#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ +#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ + +#include +#include "include/options.h" +#include "include/status.h" + +namespace leveldb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; + +class TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). + TableBuilder(const Options& options, WritableFile* file); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~TableBuilder(); + + // Change the options used by this builder. Note: only some of the + // option fields can be changed after construction. If a field is + // not allowed to change dynamically and its value in the structure + // passed to the constructor is different from its value in the + // structure passed to this method, this method will return an error + // without changing any fields. + Status ChangeOptions(const Options& options); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value); + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // Return non-ok iff some error has been detected. + Status status() const; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish(); + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon(); + + // Number of calls to Add() so far. + uint64_t NumEntries() const; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. 
+ uint64_t FileSize() const; + + private: + bool ok() const { return status().ok(); } + void WriteBlock(BlockBuilder* block, BlockHandle* handle); + + struct Rep; + Rep* rep_; + + // No copying allowed + TableBuilder(const TableBuilder&); + void operator=(const TableBuilder&); +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ diff --git a/include/write_batch.h b/include/write_batch.h new file mode 100644 index 0000000..3411952 --- /dev/null +++ b/include/write_batch.h @@ -0,0 +1,49 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); + +#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ +#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ + +#include + +namespace leveldb { + +class Slice; + +class WriteBatch { + public: + WriteBatch(); + ~WriteBatch(); + + // Store the mapping "key->value" in the database. + void Put(const Slice& key, const Slice& value); + + // If the database contains a mapping for "key", erase it. Else do nothing. + void Delete(const Slice& key); + + // Clear all updates buffered in this batch. + void Clear(); + + private: + friend class WriteBatchInternal; + + std::string rep_; // See comment in write_batch.cc for the format of rep_ + + // Intentionally copyable +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ diff --git a/leveldb.gyp b/leveldb.gyp new file mode 100644 index 0000000..81dd523 --- /dev/null +++ b/leveldb.gyp @@ -0,0 +1,329 @@ +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. + +{ + 'variables': { + 'use_snappy%': 0, + }, + 'target_defaults': { + 'defines': [ + 'LEVELDB_PLATFORM_CHROMIUM=1', + ], + 'include_dirs': [ + # MOE:begin_strip + '../..', + # MOE:end_strip_and_replace '.', + ], + 'conditions': [ + ['OS == "win"', { + 'include_dirs': [ + 'port/win', + ], + }], + ['use_snappy', { + 'defines': [ + 'USE_SNAPPY=1', + ], + }], + ], + }, + 'targets': [ + { + 'target_name': 'leveldb', + 'type': '<(library)', + 'dependencies': [ + # The base libary is a lightweight abstraction layer for things like + # threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/ + # MOE:begin_strip + '../../../../base/base.gyp:base', + # MOE:end_strip_and_replace '../../base/base.gyp:base', + ], + 'conditions': [ + ['use_snappy', { + 'dependencies': [ + '../../../../third_party/snappy/snappy.gyp:snappy', + ], + }], + ], + 'sources': [ + # Include and then exclude so that all files show up in IDEs, even if + # they don't build. 
+ 'db/builder.cc', + 'db/builder.h', + 'db/db_impl.cc', + 'db/db_impl.h', + 'db/db_iter.cc', + 'db/db_iter.h', + 'db/filename.cc', + 'db/filename.h', + 'db/dbformat.cc', + 'db/dbformat.h', + 'db/log_format.h', + 'db/log_reader.cc', + 'db/log_reader.h', + 'db/log_writer.cc', + 'db/log_writer.h', + 'db/memtable.cc', + 'db/memtable.h', + 'db/repair.cc', + 'db/skiplist.h', + 'db/snapshot.h', + 'db/table_cache.cc', + 'db/table_cache.h', + 'db/version_edit.cc', + 'db/version_edit.h', + 'db/version_set.cc', + 'db/version_set.h', + 'db/write_batch.cc', + 'db/write_batch_internal.h', + 'include/cache.h', + 'include/comparator.h', + 'include/db.h', + 'include/env.h', + 'include/iterator.h', + 'include/options.h', + 'include/slice.h', + 'include/status.h', + 'include/table.h', + 'include/table_builder.h', + 'include/write_batch.h', + 'port/port.h', + 'port/port_chromium.cc', + 'port/port_chromium.h', + 'port/port_example.h', + 'port/port_posix.cc', + 'port/port_posix.h', + 'port/sha1_portable.cc', + 'port/sha1_portable.h', + 'table/block.cc', + 'table/block.h', + 'table/block_builder.cc', + 'table/block_builder.h', + 'table/format.cc', + 'table/format.h', + 'table/iterator.cc', + 'table/iterator_wrapper.h', + 'table/merger.cc', + 'table/merger.h', + 'table/table.cc', + 'table/table_builder.cc', + 'table/two_level_iterator.cc', + 'table/two_level_iterator.h', + 'util/arena.cc', + 'util/arena.h', + 'util/cache.cc', + 'util/coding.cc', + 'util/coding.h', + 'util/comparator.cc', + 'util/crc32c.cc', + 'util/crc32c.h', + 'util/env.cc', + 'util/env_chromium.cc', + 'util/env_posix.cc', + 'util/hash.cc', + 'util/hash.h', + 'util/logging.cc', + 'util/logging.h', + 'util/mutexlock.h', + 'util/options.cc', + 'util/random.h', + 'util/status.cc', + ], + 'sources/': [ + ['exclude', '_(android|example|portable|posix)\\.cc$'], + ], + }, + { + 'target_name': 'leveldb_testutil', + 'type': '<(library)', + 'dependencies': [ + # MOE:begin_strip + '../../../../base/base.gyp:base', + # MOE:end_strip_and_replace '../../base/base.gyp:base', + 'leveldb', + ], + 'export_dependent_settings': [ + # The tests use include directories from these projects. 
+ # MOE:begin_strip + '../../../../base/base.gyp:base', + # MOE:end_strip_and_replace '../../base/base.gyp:base', + 'leveldb', + ], + 'sources': [ + 'util/histogram.cc', + 'util/histogram.h', + 'util/testharness.cc', + 'util/testharness.h', + 'util/testutil.cc', + 'util/testutil.h', + ], + }, + { + 'target_name': 'leveldb_arena_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/arena_test.cc', + ], + }, + { + 'target_name': 'leveldb_cache_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/cache_test.cc', + ], + }, + { + 'target_name': 'leveldb_coding_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/coding_test.cc', + ], + }, + { + 'target_name': 'leveldb_corruption_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/corruption_test.cc', + ], + }, + { + 'target_name': 'leveldb_crc32c_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/crc32c_test.cc', + ], + }, + { + 'target_name': 'leveldb_db_bench', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/db_bench.cc', + ], + }, + { + 'target_name': 'leveldb_db_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/db_test.cc', + ], + }, + { + 'target_name': 'leveldb_dbformat_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/dbformat_test.cc', + ], + }, + { + 'target_name': 'leveldb_env_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/env_test.cc', + ], + }, + { + 'target_name': 'leveldb_filename_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/filename_test.cc', + ], + }, + { + 'target_name': 'leveldb_log_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/log_test.cc', + ], + }, + { + 'target_name': 'leveldb_sha1_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'port/sha1_test.cc', + ], + }, + { + 'target_name': 'leveldb_skiplist_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/skiplist_test.cc', + ], + }, + { + 'target_name': 'leveldb_table_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'table/table_test.cc', + ], + }, + { + 'target_name': 'leveldb_version_edit_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/version_edit_test.cc', + ], + }, + { + 'target_name': 'leveldb_write_batch_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/write_batch_test.cc', + ], + }, + ], +} + +# Local Variables: +# tab-width:2 +# indent-tabs-mode:nil +# End: +# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/port/README b/port/README new file mode 100644 index 0000000..422563e --- /dev/null +++ b/port/README @@ -0,0 +1,10 @@ +This directory contains interfaces and implementations that isolate the +rest of the package from platform details. + +Code in the rest of the package includes "port.h" from this directory. +"port.h" in turn includes a platform specific "port_.h" file +that provides the platform specific implementation. 
+ +See port_posix.h for an example of what must be provided in a platform +specific header file. + diff --git a/port/port.h b/port/port.h new file mode 100644 index 0000000..816826b --- /dev/null +++ b/port/port.h @@ -0,0 +1,21 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_PORT_PORT_H_ +#define STORAGE_LEVELDB_PORT_PORT_H_ + +#include + +// Include the appropriate platform specific file below. If you are +// porting to a new platform, see "port_example.h" for documentation +// of what the new port_.h file must provide. +#if defined(LEVELDB_PLATFORM_POSIX) +# include "port/port_posix.h" +#elif defined(LEVELDB_PLATFORM_CHROMIUM) +# include "port/port_chromium.h" +#elif defined(LEVELDB_PLATFORM_ANDROID) +# include "port/port_android.h" +#endif + +#endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/port/port_android.cc b/port/port_android.cc new file mode 100644 index 0000000..8a74111 --- /dev/null +++ b/port/port_android.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "port/port_android.h" + +#include + +extern "C" { +size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { + return fread(a, b, c, d); +} + +size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { + return fwrite(a, b, c, d); +} + +int fflush_unlocked(FILE *f) { + return fflush(f); +} + +int fdatasync(int fd) { + return fsync(fd); +} +} + +// TODO(gabor): This is copied from port_posix.cc - not sure if I should do this? +namespace leveldb { +namespace port { + +static void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } +void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } +void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); +} + +CondVar::~CondVar() { + PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); +} + +void CondVar::Wait() { + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +} + +void CondVar::Signal(){ + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +} +} diff --git a/port/port_android.h b/port/port_android.h new file mode 100644 index 0000000..2770a0c --- /dev/null +++ b/port/port_android.h @@ -0,0 +1,131 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. 
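Every port supplies the same Mutex/CondVar pair, so a brief sketch of the intended usage pattern may help (names are invented for illustration; the semantics follow the specification in port_example.h below):

    #include "port/port.h"

    leveldb::port::Mutex mu;
    leveldb::port::CondVar cv(&mu);
    bool ready = false;  // protected by mu

    void Producer() {
      mu.Lock();
      ready = true;
      cv.Signal();   // wakes at least one waiting thread
      mu.Unlock();
    }

    void Consumer() {
      mu.Lock();
      while (!ready) {
        cv.Wait();   // atomically releases mu while blocked, reacquires on return
      }
      mu.Unlock();
    }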
+ +#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ +#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ + +#include +#include +#include +#include +#include +#include +#include + +extern "C" { + size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); + size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); + int fflush_unlocked(FILE *f); + int fdatasync (int fd); +} + +namespace leveldb { +namespace port { + +static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN; + +class CondVar; + +class Mutex { + public: + Mutex(); + ~Mutex(); + + void Lock(); + void Unlock(); + void AssertHeld() { + //TODO(gabor): How can I implement this? + } + + private: + friend class CondVar; + pthread_mutex_t mu_; + + // No copying + Mutex(const Mutex&); + void operator=(const Mutex&); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + void Signal(); + void SignalAll(); + private: + Mutex* mu_; + pthread_cond_t cv_; +}; + +// Storage for a lock-free pointer +class AtomicPointer { + private: + std::atomic rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* v) : rep_(v) { } + inline void* Acquire_Load() const { + return rep_.load(std::memory_order_acquire); + } + inline void Release_Store(void* v) { + rep_.store(v, std::memory_order_release); + } + inline void* NoBarrier_Load() const { + return rep_.load(std::memory_order_relaxed); + } + inline void NoBarrier_Store(void* v) { + rep_.store(v, std::memory_order_relaxed); + } +}; + +/** + * TODO(gabor): Implement actual compress + * This is a hack - it just copies input to output. + * No actual compression occurs. + */ +inline void Lightweight_Compress( + const char* input, + size_t input_length, + std::string* output) { + output->copy((char*)input,0,input_length); +} + +/** + * TODO(gabor): Implement actual compress + * This is a hack - it just copies input to output. + * No actual uncompression occurs. + */ +inline bool Lightweight_Uncompress( + const char* input_data, + size_t input_length, + std::string* output) { + output->copy((char*)input_data,0,input_length); + return (bool)1; +} + +inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { + SHA1_CTX sha1_ctx; + SHA1Init(&sha1_ctx); + SHA1Update(&sha1_ctx, (const u_char*)data, len); + SHA1Final((u_char*)hash_array, &sha1_ctx); +} + +inline uint64_t ThreadIdentifier() { + pthread_t tid = pthread_self(); + uint64_t r = 0; + memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? sizeof(r) : sizeof(tid)); + return r; +} + +inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { + return false; +} + +} +} + +#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ diff --git a/port/port_chromium.cc b/port/port_chromium.cc new file mode 100644 index 0000000..c022ec4 --- /dev/null +++ b/port/port_chromium.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "port/port_chromium.h" + +#include "util/logging.h" + +#if defined(USE_SNAPPY) +# include "third_party/snappy/src/snappy.h" +# include "third_party/snappy/src/snappy-stubs.h" +#endif + +namespace leveldb { +namespace port { + +Mutex::Mutex() { +} + +Mutex::~Mutex() { +} + +void Mutex::Lock() { + mu_.Acquire(); +} + +void Mutex::Unlock() { + mu_.Release(); +} + +void Mutex::AssertHeld() { + mu_.AssertAcquired(); +} + +CondVar::CondVar(Mutex* mu) + : cv_(&mu->mu_) { +} + +CondVar::~CondVar() { } + +void CondVar::Wait() { + cv_.Wait(); +} + +void CondVar::Signal(){ + cv_.Signal(); +} + +void CondVar::SignalAll() { + cv_.Broadcast(); +} + +void Lightweight_Compress(const char* input, size_t input_length, + std::string* output) { +#if defined(USE_SNAPPY) + output->resize(snappy::MaxCompressedLength(input_length)); + size_t outlen; + snappy::RawCompress(snappy::StringPiece(input, input_length), + &(*output)[0], &outlen); + output->resize(outlen); +#else + output->assign(input, input_length); +#endif +} + +bool Lightweight_Uncompress(const char* input_data, size_t input_length, + std::string* output) { +#if defined(USE_SNAPPY) + snappy::StringPiece input(input_data, input_length); + size_t ulength; + if (!snappy::GetUncompressedLength(input, &ulength)) { + return false; + } + output->resize(ulength); + return snappy::RawUncompress(input, &(*output)[0]); +#else + output->assign(input_data, input_length); + return true; +#endif +} + +} +} diff --git a/port/port_chromium.h b/port/port_chromium.h new file mode 100644 index 0000000..b33bdde --- /dev/null +++ b/port/port_chromium.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ +#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ + +#include +#include +#include +#include "base/atomicops.h" +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/sha1.h" +#include "base/synchronization/condition_variable.h" +#include "base/synchronization/lock.h" + +// Linux's ThreadIdentifier() needs this. +#if defined(OS_LINUX) +# include +#endif + +#if defined(OS_WIN) +#define snprintf _snprintf +#define va_copy(a, b) do { (a) = (b); } while (0) +#endif + +namespace leveldb { +namespace port { + +// Chromium only supports little endian. 
+static const bool kLittleEndian = true; + +class Mutex { + public: + Mutex(); + ~Mutex(); + void Lock(); + void Unlock(); + void AssertHeld(); + + private: + base::Lock mu_; + + friend class CondVar; + DISALLOW_COPY_AND_ASSIGN(Mutex); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + void Signal(); + void SignalAll(); + + private: + base::ConditionVariable cv_; + + DISALLOW_COPY_AND_ASSIGN(CondVar); +}; + +class AtomicPointer { + private: + typedef base::subtle::AtomicWord Rep; + Rep rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* p) : rep_(reinterpret_cast(p)) {} + inline void* Acquire_Load() const { + return reinterpret_cast(::base::subtle::Acquire_Load(&rep_)); + } + inline void Release_Store(void* v) { + ::base::subtle::Release_Store(&rep_, reinterpret_cast(v)); + } + inline void* NoBarrier_Load() const { + return reinterpret_cast(::base::subtle::NoBarrier_Load(&rep_)); + } + inline void NoBarrier_Store(void* v) { + ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast(v)); + } +}; + +inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { + return ::base::SHA1HashBytes(reinterpret_cast(data), + len, + reinterpret_cast(hash_array)); +} + +void Lightweight_Compress(const char* input, size_t input_length, + std::string* output); +bool Lightweight_Uncompress(const char* input_data, size_t input_length, + std::string* output); + +inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { + return false; +} + +} +} + +#endif // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ diff --git a/port/port_example.h b/port/port_example.h new file mode 100644 index 0000000..ee25a01 --- /dev/null +++ b/port/port_example.h @@ -0,0 +1,119 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This file contains the specification, but not the implementations, +// of the types/operations/etc. that should be defined by a platform +// specific port_.h file. Use this file as a reference for +// how to port this package to a new platform. + +#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ +#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ + +namespace leveldb { +namespace port { + +// TODO(jorlow): Many of these belong more in the environment class rather than +// here. We should try moving them and see if it affects perf. + +// The following boolean constant must be true on a little-endian machine +// and false otherwise. +static const bool kLittleEndian = true /* or some other expression */; + +// ------------------ Threading ------------------- + +// A Mutex represents an exclusive lock. +class Mutex { + public: + Mutex(); + ~Mutex(); + + // Lock the mutex. Waits until other lockers have exited. + // Will deadlock if the mutex is already locked by this thread. + void Lock(); + + // Unlock the mutex. + // REQUIRES: This mutex was locked by this thread. + void Unlock(); + + // Optionally crash if this thread does not hold this mutex. + // The implementation must be fast, especially if NDEBUG is + // defined. The implementation is allowed to skip all checks. + void AssertHeld(); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + + // Atomically release *mu and block on this condition variable until + // either a call to SignalAll(), or a call to Signal() that picks + // this thread to wakeup. 
+  // REQUIRES: this thread holds *mu
+  void Wait();
+
+  // If there are some threads waiting, wake up at least one of them.
+  void Signal();
+
+  // Wake up all waiting threads.
+  void SignalAll();
+};
+
+// A type that holds a pointer that can be read or written atomically
+// (i.e., without word-tearing.)
+class AtomicPointer {
+ private:
+  intptr_t rep_;
+ public:
+  // Initialize to arbitrary value
+  AtomicPointer();
+
+  // Initialize to hold v
+  explicit AtomicPointer(void* v) : rep_(reinterpret_cast<intptr_t>(v)) { }
+
+  // Read and return the stored pointer with the guarantee that no
+  // later memory access (read or write) by this thread can be
+  // reordered ahead of this read.
+  void* Acquire_Load() const;
+
+  // Set v as the stored pointer with the guarantee that no earlier
+  // memory access (read or write) by this thread can be reordered
+  // after this store.
+  void Release_Store(void* v);
+
+  // Read the stored pointer with no ordering guarantees.
+  void* NoBarrier_Load() const;
+
+  // Set v as the stored pointer with no ordering guarantees.
+  void NoBarrier_Store(void* v);
+};
+
+// ------------------ Checksumming -------------------
+
+// Store a 160-bit hash of "data[0..len-1]" in "hash_array[0]..hash_array[19]"
+extern void SHA1_Hash(const char* data, size_t len, char* hash_array);
+
+// ------------------ Compression -------------------
+
+// Store the lightweight compression of "input[0,input_length-1]" in *output.
+extern void Lightweight_Compress(const char* input, size_t input_length,
+                                 std::string* output);
+
+// Attempt to lightweight uncompress input[0,input_length-1] into *output.
+// Returns true if successful, false if the input is invalid lightweight
+// compressed data.
+extern bool Lightweight_Uncompress(const char* input_data, size_t input_length,
+                                   std::string* output);
+
+// ------------------ Miscellaneous -------------------
+
+// If heap profiling is not supported, returns false.
+// Else repeatedly calls (*func)(arg, data, n) and then returns true.
+// The concatenation of all "data[0,n-1]" fragments is the heap profile.
+extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
diff --git a/port/port_posix.cc b/port/port_posix.cc
new file mode 100644
index 0000000..e75da8b
--- /dev/null
+++ b/port/port_posix.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
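The Acquire_Load/Release_Store contract specified in port_example.h above is exactly what the classic publication pattern needs; a sketch (the Node type is invented for illustration):

    #include "port/port.h"

    struct Node { int payload; };
    leveldb::port::AtomicPointer shared(NULL);

    void Publisher() {
      Node* n = new Node;
      n->payload = 42;          // writes before the release store...
      shared.Release_Store(n);  // ...cannot be reordered after it
    }

    void Observer() {
      Node* n = reinterpret_cast<Node*>(shared.Acquire_Load());
      if (n != NULL) {
        // Reads after the acquire load cannot be reordered ahead of it,
        // so n->payload is visible as 42 here.
        int v = n->payload;
        (void)v;
      }
    }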
+ +#include "port/port_posix.h" + +#include +#include +#include +#include "util/logging.h" + +namespace leveldb { +namespace port { + +static void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } + +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } + +void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } + +void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); +} + +CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } + +void CondVar::Wait() { + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +} + +void CondVar::Signal() { + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +} +} diff --git a/port/port_posix.h b/port/port_posix.h new file mode 100644 index 0000000..e7bc5b8 --- /dev/null +++ b/port/port_posix.h @@ -0,0 +1,108 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ +#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ + +#include +#include +#include +#include +#include +#include +#include "port/sha1_portable.h" + +namespace leveldb { +namespace port { + +static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN); + +class CondVar; + +class Mutex { + public: + Mutex(); + ~Mutex(); + + void Lock(); + void Unlock(); + void AssertHeld() { } + + private: + friend class CondVar; + pthread_mutex_t mu_; + + // No copying + Mutex(const Mutex&); + void operator=(const Mutex&); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + void Signal(); + void SignalAll(); + private: + pthread_cond_t cv_; + Mutex* mu_; +}; + +// Storage for a lock-free pointer +class AtomicPointer { + private: + std::atomic rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* v) : rep_(v) { } + inline void* Acquire_Load() const { + return rep_.load(std::memory_order_acquire); + } + inline void Release_Store(void* v) { + rep_.store(v, std::memory_order_release); + } + inline void* NoBarrier_Load() const { + return rep_.load(std::memory_order_relaxed); + } + inline void NoBarrier_Store(void* v) { + rep_.store(v, std::memory_order_relaxed); + } +}; + +inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { + SHA1_Hash_Portable(data, len, hash_array); +} + +/** + * TODO(gabor): Implement actual compress + * This is a hack - it just copies input to output. + * No actual compression occurs. + */ +inline void Lightweight_Compress(const char* input, size_t input_length, + std::string* output) { + output->assign(input, input_length); +} + +/** + * TODO(gabor): Implement actual uncompress + * This is a hack - it just copies input to output. + * No actual uncompression occurs. 
+ */ +inline bool Lightweight_Uncompress(const char* input_data, size_t input_length, + std::string* output) { + output->assign(input_data, input_length); + return true; +} + +inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { + return false; +} + +} +} + +#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ diff --git a/port/sha1_portable.cc b/port/sha1_portable.cc new file mode 100644 index 0000000..8fa7277 --- /dev/null +++ b/port/sha1_portable.cc @@ -0,0 +1,298 @@ +// Portions copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This module provides a slow but portable implementation of +// the SHA1 hash function. +// +// It is adapted from free code written by Paul E. Jones +// . See http://www.packetizer.com/security/sha1/ +// +// The license for the original code is: +/* + Copyright (C) 1998, 2009 + Paul E. Jones + + Freeware Public License (FPL) + + This software is licensed as "freeware." Permission to distribute + this software in source and binary forms, including incorporation + into other products, is hereby granted without a fee. THIS SOFTWARE + IS PROVIDED 'AS IS' AND WITHOUT ANY EXPRESSED OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR A PARTICULAR PURPOSE. THE AUTHOR SHALL NOT BE HELD + LIABLE FOR ANY DAMAGES RESULTING FROM THE USE OF THIS SOFTWARE, EITHER + DIRECTLY OR INDIRECTLY, INCLUDING, BUT NOT LIMITED TO, LOSS OF DATA + OR DATA BEING RENDERED INACCURATE. +*/ + +#include "port/sha1_portable.h" +#include +#include +#include + +namespace leveldb { +namespace port { + +/* + * Description: + * This class implements the Secure Hashing Standard as defined + * in FIPS PUB 180-1 published April 17, 1995. + */ + +/* + * This structure will hold context information for the hashing + * operation + */ +typedef struct SHA1Context { + unsigned Message_Digest[5]; /* Message Digest (output) */ + + unsigned Length_Low; /* Message length in bits */ + unsigned Length_High; /* Message length in bits */ + + unsigned char Message_Block[64]; /* 512-bit message blocks */ + int Message_Block_Index; /* Index into message block array */ + + bool Computed; /* Is the digest computed? */ + bool Corrupted; /* Is the message digest corruped? */ +} SHA1Context; + +/* + * Portability Issues: + * SHA-1 is defined in terms of 32-bit "words". This code was + * written with the expectation that the processor has at least + * a 32-bit machine word size. If the machine word size is larger, + * the code should still function properly. One caveat to that + * is that the input functions taking characters and character + * arrays assume that only 8 bits of information are stored in each + * character. + */ + +/* + * Define the circular shift macro + */ +#define SHA1CircularShift(bits,word) \ + ((((word) << (bits)) & 0xFFFFFFFF) | \ + ((word) >> (32-(bits)))) + +/* Function prototypes */ +static void SHA1ProcessMessageBlock(SHA1Context *); +static void SHA1PadMessage(SHA1Context *); + +// Initialize the SHA1Context in preparation for computing a new +// message digest. 
+static void SHA1Reset(SHA1Context* context) { + context->Length_Low = 0; + context->Length_High = 0; + context->Message_Block_Index = 0; + + context->Message_Digest[0] = 0x67452301; + context->Message_Digest[1] = 0xEFCDAB89; + context->Message_Digest[2] = 0x98BADCFE; + context->Message_Digest[3] = 0x10325476; + context->Message_Digest[4] = 0xC3D2E1F0; + + context->Computed = false; + context->Corrupted = false; +} + +// This function will return the 160-bit message digest into the +// Message_Digest array within the SHA1Context provided +static bool SHA1Result(SHA1Context *context) { + if (context->Corrupted) { + return false; + } + + if (!context->Computed) { + SHA1PadMessage(context); + context->Computed = true; + } + return true; +} + +// This function accepts an array of bytes as the next portion of +// the message. +static void SHA1Input(SHA1Context *context, + const unsigned char *message_array, + unsigned length) { + if (!length) return; + + if (context->Computed || context->Corrupted) { + context->Corrupted = true; + return; + } + + while(length-- && !context->Corrupted) { + context->Message_Block[context->Message_Block_Index++] = + (*message_array & 0xFF); + + context->Length_Low += 8; + /* Force it to 32 bits */ + context->Length_Low &= 0xFFFFFFFF; + if (context->Length_Low == 0) { + context->Length_High++; + /* Force it to 32 bits */ + context->Length_High &= 0xFFFFFFFF; + if (context->Length_High == 0) + { + /* Message is too long */ + context->Corrupted = true; + } + } + + if (context->Message_Block_Index == 64) + { + SHA1ProcessMessageBlock(context); + } + + message_array++; + } +} + +// This function will process the next 512 bits of the message stored +// in the Message_Block array. +static void SHA1ProcessMessageBlock(SHA1Context *context) { + const unsigned K[] = // Constants defined in SHA-1 + { + 0x5A827999, + 0x6ED9EBA1, + 0x8F1BBCDC, + 0xCA62C1D6 + }; + int t; // Loop counter + unsigned temp; // Temporary word value + unsigned W[80]; // Word sequence + unsigned A, B, C, D, E; // Word buffers + + // Initialize the first 16 words in the array W + for(t = 0; t < 16; t++) { + W[t] = ((unsigned) context->Message_Block[t * 4]) << 24; + W[t] |= ((unsigned) context->Message_Block[t * 4 + 1]) << 16; + W[t] |= ((unsigned) context->Message_Block[t * 4 + 2]) << 8; + W[t] |= ((unsigned) context->Message_Block[t * 4 + 3]); + } + + for(t = 16; t < 80; t++) { + W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]); + } + + A = context->Message_Digest[0]; + B = context->Message_Digest[1]; + C = context->Message_Digest[2]; + D = context->Message_Digest[3]; + E = context->Message_Digest[4]; + + for(t = 0; t < 20; t++) { + temp = SHA1CircularShift(5,A) + + ((B & C) | ((~B) & D)) + E + W[t] + K[0]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 20; t < 40; t++) { + temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 40; t < 60; t++) { + temp = SHA1CircularShift(5,A) + + ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 60; t < 80; t++) { + temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + context->Message_Digest[0] = (context->Message_Digest[0] + A) & 0xFFFFFFFF; + 
context->Message_Digest[1] = (context->Message_Digest[1] + B) & 0xFFFFFFFF; + context->Message_Digest[2] = (context->Message_Digest[2] + C) & 0xFFFFFFFF; + context->Message_Digest[3] = (context->Message_Digest[3] + D) & 0xFFFFFFFF; + context->Message_Digest[4] = (context->Message_Digest[4] + E) & 0xFFFFFFFF; + + context->Message_Block_Index = 0; +} + +// According to the standard, the message must be padded to an even +// 512 bits. The first padding bit must be a '1'. The last 64 bits +// represent the length of the original message. All bits in between +// should be 0. This function will pad the message according to those +// rules by filling the Message_Block array accordingly. It will also +// call SHA1ProcessMessageBlock() appropriately. When it returns, it +// can be assumed that the message digest has been computed. +static void SHA1PadMessage(SHA1Context *context) { + // Check to see if the current message block is too small to hold + // the initial padding bits and length. If so, we will pad the + // block, process it, and then continue padding into a second block. + if (context->Message_Block_Index > 55) { + context->Message_Block[context->Message_Block_Index++] = 0x80; + while(context->Message_Block_Index < 64) { + context->Message_Block[context->Message_Block_Index++] = 0; + } + + SHA1ProcessMessageBlock(context); + + while(context->Message_Block_Index < 56) { + context->Message_Block[context->Message_Block_Index++] = 0; + } + } else { + context->Message_Block[context->Message_Block_Index++] = 0x80; + while(context->Message_Block_Index < 56) { + context->Message_Block[context->Message_Block_Index++] = 0; + } + } + + // Store the message length as the last 8 octets + context->Message_Block[56] = (context->Length_High >> 24) & 0xFF; + context->Message_Block[57] = (context->Length_High >> 16) & 0xFF; + context->Message_Block[58] = (context->Length_High >> 8) & 0xFF; + context->Message_Block[59] = (context->Length_High) & 0xFF; + context->Message_Block[60] = (context->Length_Low >> 24) & 0xFF; + context->Message_Block[61] = (context->Length_Low >> 16) & 0xFF; + context->Message_Block[62] = (context->Length_Low >> 8) & 0xFF; + context->Message_Block[63] = (context->Length_Low) & 0xFF; + + SHA1ProcessMessageBlock(context); +} + + +void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array) { + SHA1Context context; + SHA1Reset(&context); + SHA1Input(&context, reinterpret_cast(data), len); + bool ok = SHA1Result(&context); + if (!ok) { + fprintf(stderr, "Unexpected error in SHA1_Hash_Portable code\n"); + exit(1); + } + for (int i = 0; i < 5; i++) { + uint32_t value = context.Message_Digest[i]; + hash_array[i*4 + 0] = (value >> 24) & 0xff; + hash_array[i*4 + 1] = (value >> 16) & 0xff; + hash_array[i*4 + 2] = (value >> 8) & 0xff; + hash_array[i*4 + 3] = value & 0xff; + } +} + +} +} diff --git a/port/sha1_portable.h b/port/sha1_portable.h new file mode 100644 index 0000000..31db305 --- /dev/null +++ b/port/sha1_portable.h @@ -0,0 +1,25 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_ +#define STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_ + +#include + +namespace leveldb { +namespace port { + +// Compute the SHA1 hash value of "data[0..len-1]" and store it in +// "hash_array[0..19]". hash_array must have 20 bytes of space available. 
+// +// This function is portable but may not be as fast as a version +// optimized for your platform. It is provided as a default method +// that can be used when porting leveldb to a new platform if no +// better SHA1 hash implementation is available. +void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array); + +} +} + +#endif // STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_ diff --git a/port/sha1_test.cc b/port/sha1_test.cc new file mode 100644 index 0000000..46bbeba --- /dev/null +++ b/port/sha1_test.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "port/port.h" +#include "util/testharness.h" + +namespace leveldb { +namespace port { + +class SHA1 { }; + +static std::string TestSHA1(const char* data, size_t len) { + char hash_val[20]; + SHA1_Hash(data, len, hash_val); + char buf[41]; + for (int i = 0; i < 20; i++) { + snprintf(buf + i * 2, 41 - i * 2, + "%02x", + static_cast(static_cast( + hash_val[i]))); + } + return std::string(buf, 40); +} + +TEST(SHA1, Simple) { + ASSERT_EQ("da39a3ee5e6b4b0d3255bfef95601890afd80709", TestSHA1("", 0)); + ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", TestSHA1("hello", 5)); + std::string x(10000, 'x'); + ASSERT_EQ("f8c5cde791c5056cf515881e701c8a9ecb439a75", + TestSHA1(x.data(), x.size())); +} + +TEST(SHA1, Benchmark) { + std::string data(1048576 * 100, 'x'); + double start = Env::Default()->NowMicros() * 1e-6; + static const int kIters = 10; + uint32_t sha1 = 0; + for (int i = 0; i < kIters; i++) { + char hash_val[20]; + SHA1_Hash(data.data(), data.size(), hash_val); + sha1 |= hash_val[0]; + } + double finish = Env::Default()->NowMicros() * 1e-6; + double mb = (static_cast(data.size()) * kIters) / 1048576.0; + fprintf(stderr, "SHA1 %0.0f MB: %.3f secs; %.1f MB/s, dummy=0x%02x\n", + mb, (finish - start), mb / (finish - start), sha1); +} + +} +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/table/block.cc b/table/block.cc new file mode 100644 index 0000000..351eb48 --- /dev/null +++ b/table/block.cc @@ -0,0 +1,261 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Decodes the blocks generated by block_builder.cc. + +#include "table/block.h" + +#include +#include +#include "include/comparator.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +inline uint32_t Block::NumRestarts() const { + assert(size_ >= 2*sizeof(uint32_t)); + return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); +} + +Block::Block(const char* data, size_t size) + : data_(data), + size_(size) { + if (size_ < sizeof(uint32_t)) { + size_ = 0; // Error marker + } else { + restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. + size_ = 0; + } + } +} + +Block::~Block() { + delete[] data_; +} + +// Helper routine: decode the next block entry starting at "p", +// storing the number of shared key bytes, non_shared key bytes, +// and the length of the value in "*shared", "*non_shared", and +// "*value_length", respectively. Will not derefence past "limit". 
+// +// If any errors are detected, returns NULL. Otherwise, returns a +// pointer to the key delta (just past the three decoded values). +static inline const char* DecodeEntry(const char* p, const char* limit, + uint32_t* shared, + uint32_t* non_shared, + uint32_t* value_length) { + if (limit - p < 3) return NULL; + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + *value_length = reinterpret_cast(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL; + if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL; + } + + if (limit - p < (*non_shared + *value_length)) return NULL; + return p; +} + +class Block::Iter : public Iterator { + private: + const Comparator* const comparator_; + const char* const data_; // underlying block contents + uint32_t const restarts_; // Offset of restart array (list of fixed32) + uint32_t const num_restarts_; // Number of uint32_t entries in restart array + + // current_ is offset in data_ of current entry. >= restarts_ if !Valid + uint32_t current_; + uint32_t restart_index_; // Index of restart block in which current_ falls + std::string key_; + Slice value_; + Status status_; + + inline int Compare(const Slice& a, const Slice& b) const { + return comparator_->Compare(a, b); + } + + // Return the offset in data_ just past the end of the current entry. + inline uint32_t NextEntryOffset() const { + return (value_.data() + value_.size()) - data_; + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + key_.clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + public: + Iter(const Comparator* comparator, + const char* data, + uint32_t restarts, + uint32_t num_restarts) + : comparator_(comparator), + data_(data), + restarts_(restarts), + num_restarts_(num_restarts), + current_(restarts_), + restart_index_(num_restarts_) { + assert(num_restarts_ > 0); + } + + virtual bool Valid() const { return current_ < restarts_; } + virtual Status status() const { return status_; } + virtual Slice key() const { + assert(Valid()); + return key_; + } + virtual Slice value() const { + assert(Valid()); + return value_; + } + + virtual void Next() { + assert(Valid()); + ParseNextKey(); + } + + virtual void Prev() { + assert(Valid()); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + do { + // Loop until end of current entry hits the start of original entry + } while (ParseNextKey() && NextEntryOffset() < original); + } + + virtual void Seek(const Slice& target) { + // Binary search in restart array to find the first restart point + // with a key >= target + uint32_t left = 0; + uint32_t right = num_restarts_ - 1; + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t 
region_offset = GetRestartPoint(mid); + uint32_t shared, non_shared, value_length; + const char* key_ptr = DecodeEntry(data_ + region_offset, + data_ + restarts_, + &shared, &non_shared, &value_length); + if (key_ptr == NULL || (shared != 0)) { + CorruptionError(); + return; + } + Slice mid_key(key_ptr, non_shared); + if (Compare(mid_key, target) < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } + } + + // Linear search (within restart block) for first key >= target + SeekToRestartPoint(left); + while (true) { + if (!ParseNextKey()) { + return; + } + if (Compare(key_, target) >= 0) { + return; + } + } + } + + virtual void SeekToFirst() { + SeekToRestartPoint(0); + ParseNextKey(); + } + + virtual void SeekToLast() { + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } + } + + private: + void CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + key_.clear(); + value_.clear(); + } + + bool ParseNextKey() { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. + current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (p == NULL || key_.size() < shared) { + CorruptionError(); + return false; + } else { + key_.resize(shared); + key_.append(p, non_shared); + value_ = Slice(p + non_shared, value_length); + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + return true; + } + } +}; + +Iterator* Block::NewIterator(const Comparator* cmp) { + if (size_ < 2*sizeof(uint32_t)) { + return NewErrorIterator(Status::Corruption("bad block contents")); + } + const uint32_t num_restarts = NumRestarts(); + if (num_restarts == 0) { + return NewEmptyIterator(); + } else { + return new Iter(cmp, data_, restart_offset_, num_restarts); + } +} + +} diff --git a/table/block.h b/table/block.h new file mode 100644 index 0000000..9372001 --- /dev/null +++ b/table/block.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_ +#define STORAGE_LEVELDB_TABLE_BLOCK_H_ + +#include +#include +#include "include/iterator.h" + +namespace leveldb { + +class Comparator; + +class Block { + public: + // Initialize the block with the specified contents. + // Takes ownership of data[] and will delete[] it when done. 
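A sketch of the read side, given raw block bytes produced by BlockBuilder (the helper name is hypothetical; note the ownership rule in the comment above):

    #include "include/comparator.h"
    #include "table/block.h"

    // data must come from new char[size]; Block takes ownership and
    // will delete[] it in its destructor.
    void ScanBlock(const char* data, size_t size) {
      leveldb::Block block(data, size);
      leveldb::Iterator* iter = block.NewIterator(leveldb::BytewiseComparator());
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        // iter->key() and iter->value() remain valid only while the
        // block (and hence data) is alive.
      }
      delete iter;  // delete the iterator before the block it reads from
    }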
+ Block(const char* data, size_t size); + + ~Block(); + + size_t size() const { return size_; } + Iterator* NewIterator(const Comparator* comparator); + + private: + uint32_t NumRestarts() const; + + const char* data_; + size_t size_; + uint32_t restart_offset_; // Offset in data_ of restart array + + // No copying allowed + Block(const Block&); + void operator=(const Block&); + + class Iter; +}; + +} + +#endif // STORAGE_LEVELDB_TABLE_BLOCK_H_ diff --git a/table/block_builder.cc b/table/block_builder.cc new file mode 100644 index 0000000..2c33492 --- /dev/null +++ b/table/block_builder.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// BlockBuilder generates blocks where keys are prefix-compressed: +// +// When we store a key, we drop the prefix shared with the previous +// string. This helps reduce the space requirement significantly. +// Furthermore, once every K keys, we do not apply the prefix +// compression and store the entire key. We call this a "restart +// point". The tail end of the block stores the offsets of all of the +// restart points, and can be used to do a binary search when looking +// for a particular key. Values are stored as-is (without compression) +// immediately following the corresponding key. +// +// An entry for a particular key-value pair has the form: +// shared_bytes: varint32 +// unshared_bytes: varint32 +// value_length: varint32 +// key_delta: char[unshared_bytes] +// value: char[value_length] +// shared_bytes == 0 for restart points. +// +// The trailer of the block has the form: +// restarts: uint32[num_restarts] +// num_restarts: uint32 +// restarts[i] contains the offset within the block of the ith restart point. + +#include "table/block_builder.h" + +#include +#include +#include "include/comparator.h" +#include "include/table_builder.h" +#include "util/coding.h" + +namespace leveldb { + +BlockBuilder::BlockBuilder(const Options* options) + : options_(options), + restarts_(), + counter_(0), + finished_(false) { + assert(options->block_restart_interval >= 1); + restarts_.push_back(0); // First restart point is at offset 0 +} + +void BlockBuilder::Reset() { + buffer_.clear(); + restarts_.clear(); + restarts_.push_back(0); // First restart point is at offset 0 + counter_ = 0; + finished_ = false; + last_key_.clear(); +} + +size_t BlockBuilder::CurrentSizeEstimate() const { + return (buffer_.size() + // Raw data buffer + restarts_.size() * sizeof(uint32_t) + // Restart array + sizeof(uint32_t)); // Restart array length +} + +Slice BlockBuilder::Finish() { + // Append restart array + for (int i = 0; i < restarts_.size(); i++) { + PutFixed32(&buffer_, restarts_[i]); + } + PutFixed32(&buffer_, restarts_.size()); + finished_ = true; + return Slice(buffer_); +} + +void BlockBuilder::Add(const Slice& key, const Slice& value) { + Slice last_key_piece(last_key_); + assert(!finished_); + assert(counter_ <= options_->block_restart_interval); + assert(buffer_.empty() // No values yet? 
+ || options_->comparator->Compare(key, last_key_piece) > 0); + size_t shared = 0; + if (counter_ < options_->block_restart_interval) { + // See how much sharing to do with previous string + const size_t min_length = std::min(last_key_piece.size(), key.size()); + while ((shared < min_length) && (last_key_[shared] == key[shared])) { + shared++; + } + } else { + // Restart compression + restarts_.push_back(buffer_.size()); + counter_ = 0; + } + const size_t non_shared = key.size() - shared; + + // Add "" to buffer_ + PutVarint32(&buffer_, shared); + PutVarint32(&buffer_, non_shared); + PutVarint32(&buffer_, value.size()); + + // Add string delta to buffer_ followed by value + buffer_.append(key.data() + shared, non_shared); + buffer_.append(value.data(), value.size()); + + // Update state + last_key_.resize(shared); + last_key_.append(key.data() + shared, non_shared); + assert(Slice(last_key_) == key); + counter_++; +} + +} diff --git a/table/block_builder.h b/table/block_builder.h new file mode 100644 index 0000000..beab168 --- /dev/null +++ b/table/block_builder.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ +#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ + +#include + +#include +#include "include/slice.h" + +namespace leveldb { + +struct Options; + +class BlockBuilder { + public: + explicit BlockBuilder(const Options* options); + + // Reset the contents as if the BlockBuilder was just constructed. + void Reset(); + + // REQUIRES: Finish() has not been callled since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + void Add(const Slice& key, const Slice& value); + + // Finish building the block and return a slice that refers to the + // block contents. The returned slice will remain valid for the + // lifetime of this builder or until Reset() is called. + Slice Finish(); + + // Returns an estimate of the current (uncompressed) size of the block + // we are building. + size_t CurrentSizeEstimate() const; + + // Return true iff no entries have been added since the last Reset() + bool empty() const { + return buffer_.empty(); + } + + private: + const Options* options_; + std::string buffer_; // Destination buffer + std::vector restarts_; // Restart points + int counter_; // Number of entries emitted since restart + bool finished_; // Has Finish() been called? + std::string last_key_; + + // No copying allowed + BlockBuilder(const BlockBuilder&); + void operator=(const BlockBuilder&); +}; + +} + +#endif // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ diff --git a/table/format.cc b/table/format.cc new file mode 100644 index 0000000..d292dad --- /dev/null +++ b/table/format.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
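To make the entry format described at the top of block_builder.cc concrete, a worked example (assuming a restart interval large enough that no restart point falls between the two entries): adding "apple" -> "one" and then "applied" -> "two" emits

    shared=0  non_shared=5  value_length=3  key_delta="apple"  value="one"
    shared=4  non_shared=3  value_length=3  key_delta="ied"    value="two"

The second entry shares the 4-byte prefix "appl" with its predecessor, so only the suffix "ied" is stored. The block trailer would then hold the single restart offset 0 followed by num_restarts=1.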
+ +#include "table/format.h" + +#include "include/env.h" +#include "port/port.h" +#include "table/block.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace leveldb { + +void BlockHandle::EncodeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~static_cast(0)); + assert(size_ != ~static_cast(0)); + PutVarint64(dst, offset_); + PutVarint64(dst, size_); +} + +Status BlockHandle::DecodeFrom(Slice* input) { + if (GetVarint64(input, &offset_) && + GetVarint64(input, &size_)) { + return Status::OK(); + } else { + return Status::Corruption("bad block handle"); + } +} + +void Footer::EncodeTo(std::string* dst) const { +#ifndef NDEBUG + const size_t original_size = dst->size(); +#endif + metaindex_handle_.EncodeTo(dst); + index_handle_.EncodeTo(dst); + dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding + PutFixed32(dst, static_cast(kTableMagicNumber)); + PutFixed32(dst, static_cast(kTableMagicNumber >> 32)); + assert(dst->size() == original_size + kEncodedLength); +} + +Status Footer::DecodeFrom(Slice* input) { + const char* magic_ptr = input->data() + kEncodedLength - 8; + const uint32_t magic_lo = DecodeFixed32(magic_ptr); + const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); + const uint64_t magic = ((static_cast(magic_hi) << 32) | + (static_cast(magic_lo))); + if (magic != kTableMagicNumber) { + return Status::InvalidArgument("not an sstable (bad magic number)"); + } + + Status result = metaindex_handle_.DecodeFrom(input); + if (result.ok()) { + result = index_handle_.DecodeFrom(input); + } + if (result.ok()) { + // We skip over any leftover data (just padding for now) in "input" + const char* end = magic_ptr + 8; + *input = Slice(end, input->data() + input->size() - end); + } + return result; +} + +Status ReadBlock(RandomAccessFile* file, + const ReadOptions& options, + const BlockHandle& handle, + Block** block) { + *block = NULL; + + // Read the block contents as well as the type/crc footer. + // See table_builder.cc for the code that built this structure. + size_t n = handle.size(); + char* buf = new char[n + kBlockTrailerSize]; + Slice contents; + Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); + if (!s.ok()) { + delete[] buf; + return s; + } + if (contents.size() != n + kBlockTrailerSize) { + delete[] buf; + return Status::Corruption("truncated block read"); + } + + // Check the crc of the type and the block contents + const char* data = contents.data(); // Pointer to where Read put the data + if (options.verify_checksums) { + const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1)); + const uint32_t actual = crc32c::Value(data, n + 1); + if (actual != crc) { + delete[] buf; + s = Status::Corruption("block checksum mismatch"); + return s; + } + } + + switch (data[n]) { + case kNoCompression: + if (data != buf) { + // File implementation gave us pointer to some other data. + // Copy into buf[]. 
+ memcpy(buf, data, n + kBlockTrailerSize); + } + + // Ok + break; + case kLightweightCompression: { + std::string decompressed; + if (!port::Lightweight_Uncompress(data, n, &decompressed)) { + delete[] buf; + s = Status::Corruption("corrupted compressed block contents"); + return s; + } + delete[] buf; // Done with uncompressed data + buf = new char[decompressed.size()]; + memcpy(buf, decompressed.data(), decompressed.size()); + n = decompressed.size(); + break; + } + default: + delete[] buf; + return Status::Corruption("bad block type"); + } + + *block = new Block(buf, n); // Block takes ownership of buf[] + return Status::OK(); +} + +} diff --git a/table/format.h b/table/format.h new file mode 100644 index 0000000..03e3ee2 --- /dev/null +++ b/table/format.h @@ -0,0 +1,103 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_ +#define STORAGE_LEVELDB_TABLE_FORMAT_H_ + +#include +#include +#include "include/slice.h" +#include "include/status.h" +#include "include/table_builder.h" + +namespace leveldb { + +class Block; +class RandomAccessFile; +struct ReadOptions; + +// BlockHandle is a pointer to the extent of a file that stores a data +// block or a meta block. +class BlockHandle { + public: + BlockHandle(); + + // The offset of the block in the file. + uint64_t offset() const { return offset_; } + void set_offset(uint64_t offset) { offset_ = offset; } + + // The size of the stored block + uint64_t size() const { return size_; } + void set_size(uint64_t size) { size_ = size; } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + + // Maximum encoding length of a BlockHandle + enum { kMaxEncodedLength = 10 + 10 }; + + private: + uint64_t offset_; + uint64_t size_; +}; + +// Footer encapsulates the fixed information stored at the tail +// end of every table file. +class Footer { + public: + Footer() { } + + // The block handle for the metaindex block of the table + const BlockHandle& metaindex_handle() const { return metaindex_handle_; } + void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } + + // The block handle for the index block of the table + const BlockHandle& index_handle() const { + return index_handle_; + } + void set_index_handle(const BlockHandle& h) { + index_handle_ = h; + } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + + // Encoded length of a Footer. Note that the serialization of a + // Footer will always occupy exactly this many bytes. It consists + // of two block handles and a magic number. + enum { + kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8 + }; + + private: + BlockHandle metaindex_handle_; + BlockHandle index_handle_; +}; + +// kTableMagicNumber was picked by running +// echo http://code.google.com/p/leveldb/ | sha1sum +// and taking the leading 64 bits. +static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; + +// 1-byte type + 32-bit crc +static const size_t kBlockTrailerSize = 5; + +// Read the block identified by "handle" from "file". On success, +// store a pointer to the heap-allocated result in *block and return +// OK. On failure store NULL in *block and return non-OK. +extern Status ReadBlock(RandomAccessFile* file, + const ReadOptions& options, + const BlockHandle& handle, + Block** block); + +// Implementation details follow. 
Clients should ignore.
+
+inline BlockHandle::BlockHandle()
+    : offset_(~static_cast<uint64_t>(0)),
+      size_(~static_cast<uint64_t>(0)) {
+}
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_FORMAT_H_
diff --git a/table/iterator.cc b/table/iterator.cc
new file mode 100644
index 0000000..f3c0856
--- /dev/null
+++ b/table/iterator.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/iterator.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+Iterator::Iterator() {
+  cleanup_.function = NULL;
+  cleanup_.next = NULL;
+}
+
+Iterator::~Iterator() {
+  if (cleanup_.function != NULL) {
+    (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+    for (Cleanup* c = cleanup_.next; c != NULL; ) {
+      (*c->function)(c->arg1, c->arg2);
+      Cleanup* next = c->next;
+      delete c;
+      c = next;
+    }
+  }
+}
+
+void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
+  assert(func != NULL);
+  Cleanup* c;
+  if (cleanup_.function == NULL) {
+    c = &cleanup_;
+  } else {
+    c = new Cleanup;
+    c->next = cleanup_.next;
+    cleanup_.next = c;
+  }
+  c->function = func;
+  c->arg1 = arg1;
+  c->arg2 = arg2;
+}
+
+namespace {
+class EmptyIterator : public Iterator {
+ public:
+  EmptyIterator(const Status& s) : status_(s) { }
+  virtual bool Valid() const { return false; }
+  virtual void Seek(const Slice& target) { }
+  virtual void SeekToFirst() { }
+  virtual void SeekToLast() { }
+  virtual void Next() { assert(false); }
+  virtual void Prev() { assert(false); }
+  Slice key() const { assert(false); return Slice(); }
+  Slice value() const { assert(false); return Slice(); }
+  virtual Status status() const { return status_; }
+ private:
+  Status status_;
+};
+}
+
+Iterator* NewEmptyIterator() {
+  return new EmptyIterator(Status::OK());
+}
+
+Iterator* NewErrorIterator(const Status& status) {
+  return new EmptyIterator(status);
+}
+
+}
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
new file mode 100644
index 0000000..158d3a7
--- /dev/null
+++ b/table/iterator_wrapper.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
+#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
+
+namespace leveldb {
+
+// An internal wrapper class with an interface similar to Iterator that
+// caches the valid() and key() results for an underlying iterator.
+// This can help avoid virtual function calls and also gives better
+// cache locality.
+class IteratorWrapper {
+ private:
+  Iterator* iter_;
+  bool valid_;
+  Slice key_;
+ public:
+  IteratorWrapper(): iter_(NULL), valid_(false) { }
+  explicit IteratorWrapper(Iterator* iter): iter_(NULL) {
+    Set(iter);
+  }
+  ~IteratorWrapper() { delete iter_; }
+  Iterator* iter() const { return iter_; }
+
+  // Takes ownership of "iter" and will delete it when destroyed, or
+  // when Set() is invoked again.
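+  // A usage sketch (illustrative only, not part of the interface): wrap a
+  // heap-allocated child iterator so hot-path Valid()/key() calls stay
+  // non-virtual:
+  //     IteratorWrapper w(NewEmptyIterator());
+  //     for (w.SeekToFirst(); w.Valid(); w.Next()) { /* use w.key() */ }
+  // The wrapper deletes the child in its destructor.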
+  void Set(Iterator* iter) {
+    delete iter_;
+    iter_ = iter;
+    if (iter_ == NULL) {
+      valid_ = false;
+    } else {
+      Update();
+    }
+  }
+
+
+  // Iterator interface methods
+  bool Valid() const { return valid_; }
+  Slice key() const { assert(Valid()); return key_; }
+  Slice value() const { assert(Valid()); return iter_->value(); }
+  // Methods below require iter() != NULL
+  Status status() const { assert(iter_); return iter_->status(); }
+  void Next() { assert(iter_); iter_->Next(); Update(); }
+  void Prev() { assert(iter_); iter_->Prev(); Update(); }
+  void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); }
+  void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); }
+  void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); }
+
+ private:
+  void Update() {
+    valid_ = iter_->Valid();
+    if (valid_) {
+      key_ = iter_->key();
+    }
+  }
+};
+
+}
+
+
+#endif  // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
diff --git a/table/merger.cc b/table/merger.cc
new file mode 100644
index 0000000..74c1aaa
--- /dev/null
+++ b/table/merger.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/merger.h"
+
+#include "include/comparator.h"
+#include "include/iterator.h"
+#include "table/iterator_wrapper.h"
+
+namespace leveldb {
+
+namespace {
+class MergingIterator : public Iterator {
+ public:
+  MergingIterator(const Comparator* comparator, Iterator** children, int n)
+      : comparator_(comparator),
+        children_(new IteratorWrapper[n]),
+        n_(n),
+        current_(NULL) {
+    for (int i = 0; i < n; i++) {
+      children_[i].Set(children[i]);
+    }
+  }
+
+  virtual ~MergingIterator() {
+    delete[] children_;
+  }
+
+  virtual bool Valid() const {
+    return (current_ != NULL);
+  }
+
+  virtual void SeekToFirst() {
+    for (int i = 0; i < n_; i++) {
+      children_[i].SeekToFirst();
+    }
+    FindSmallest();
+  }
+
+  virtual void SeekToLast() {
+    for (int i = 0; i < n_; i++) {
+      children_[i].SeekToLast();
+    }
+    FindLargest();
+  }
+
+  virtual void Seek(const Slice& target) {
+    for (int i = 0; i < n_; i++) {
+      children_[i].Seek(target);
+    }
+    FindSmallest();
+  }
+
+  virtual void Next() {
+    assert(Valid());
+    current_->Next();
+    FindSmallest();
+  }
+
+  virtual void Prev() {
+    assert(Valid());
+    current_->Prev();
+    FindLargest();
+  }
+
+  virtual Slice key() const {
+    assert(Valid());
+    return current_->key();
+  }
+
+  virtual Slice value() const {
+    assert(Valid());
+    return current_->value();
+  }
+
+  virtual Status status() const {
+    Status status;
+    for (int i = 0; i < n_; i++) {
+      status = children_[i].status();
+      if (!status.ok()) {
+        break;
+      }
+    }
+    return status;
+  }
+
+ private:
+  void FindSmallest();
+  void FindLargest();
+
+  // We might want to use a heap in case there are lots of children.
+  // For now we use a simple array since we expect a very small number
+  // of children in leveldb.
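+  // (With the array representation, FindSmallest()/FindLargest() rescan
+  // all n_ children after every step, i.e. O(n) comparisons per Next();
+  // a heap would make that O(log n) at the cost of extra bookkeeping.)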
+  const Comparator* comparator_;
+  IteratorWrapper* children_;
+  int n_;
+  IteratorWrapper* current_;
+};
+
+void MergingIterator::FindSmallest() {
+  IteratorWrapper* smallest = NULL;
+  for (int i = 0; i < n_; i++) {
+    IteratorWrapper* child = &children_[i];
+    if (child->Valid()) {
+      if (smallest == NULL) {
+        smallest = child;
+      } else if (comparator_->Compare(child->key(), smallest->key()) < 0) {
+        smallest = child;
+      }
+    }
+  }
+  current_ = smallest;
+}
+
+void MergingIterator::FindLargest() {
+  IteratorWrapper* largest = NULL;
+  for (int i = n_-1; i >= 0; i--) {
+    IteratorWrapper* child = &children_[i];
+    if (child->Valid()) {
+      if (largest == NULL) {
+        largest = child;
+      } else if (comparator_->Compare(child->key(), largest->key()) > 0) {
+        largest = child;
+      }
+    }
+  }
+  current_ = largest;
+}
+}
+
+Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
+  assert(n >= 0);
+  if (n == 0) {
+    return NewEmptyIterator();
+  } else if (n == 1) {
+    return list[0];
+  } else {
+    return new MergingIterator(cmp, list, n);
+  }
+}
+
+}
diff --git a/table/merger.h b/table/merger.h
new file mode 100644
index 0000000..71d9dc5
--- /dev/null
+++ b/table/merger.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_
+#define STORAGE_LEVELDB_TABLE_MERGER_H_
+
+namespace leveldb {
+
+class Comparator;
+class Iterator;
+
+// Return an iterator that provides the union of the data in
+// children[0,n-1].  Takes ownership of the child iterators and
+// will delete them when the result iterator is deleted.
+//
+// The result does no duplicate suppression.  I.e., if a particular
+// key is present in K child iterators, it will be yielded K times.
+//
+// REQUIRES: n >= 0
+extern Iterator* NewMergingIterator(
+    const Comparator* comparator, Iterator** children, int n);
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_MERGER_H_
diff --git a/table/table.cc b/table/table.cc
new file mode 100644
index 0000000..dffc217
--- /dev/null
+++ b/table/table.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "include/table.h" + +#include "include/cache.h" +#include "include/env.h" +#include "table/block.h" +#include "table/format.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" + +namespace leveldb { + +struct Table::Rep { + ~Rep() { + delete index_block; + } + + Options options; + Status status; + RandomAccessFile* file; + uint64_t cache_id; + + BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer + Block* index_block; +}; + +Status Table::Open(const Options& options, + RandomAccessFile* file, + Table** table) { + *table = NULL; + const uint64_t size = file->Size(); + if (size < Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + char footer_space[Footer::kEncodedLength]; + Slice footer_input; + Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength, + &footer_input, footer_space); + if (!s.ok()) return s; + + Footer footer; + s = footer.DecodeFrom(&footer_input); + if (!s.ok()) return s; + + // Read the index block + Block* index_block = NULL; + if (s.ok()) { + s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block); + } + + if (s.ok()) { + // We've successfully read the footer and the index block: we're + // ready to serve requests. + Rep* rep = new Table::Rep; + rep->options = options; + rep->file = file; + rep->metaindex_handle = footer.metaindex_handle(); + rep->index_block = index_block; + rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0); + *table = new Table(rep); + } else { + if (index_block) delete index_block; + } + + return s; +} + +Table::~Table() { + delete rep_; +} + +static void DeleteBlock(void* arg, void* ignored) { + delete reinterpret_cast(arg); +} + +static void DeleteCachedBlock(const Slice& key, void* value) { + Block* block = reinterpret_cast(value); + delete block; +} + +static void ReleaseBlock(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle); +} + +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. +Iterator* Table::BlockReader(void* arg, + const ReadOptions& options, + const Slice& index_value) { + Table* table = reinterpret_cast(arg); + Cache* block_cache = table->rep_->options.block_cache; + Block* block = NULL; + Cache::Handle* cache_handle = NULL; + + BlockHandle handle; + Slice input = index_value; + Status s = handle.DecodeFrom(&input); + // We intentionally allow extra stuff in index_value so that we + // can add more features in the future. 
+
+  if (s.ok()) {
+    if (block_cache != NULL) {
+      char cache_key_buffer[16];
+      EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
+      EncodeFixed64(cache_key_buffer+8, handle.offset());
+      Slice key(cache_key_buffer, sizeof(cache_key_buffer));
+      cache_handle = block_cache->Lookup(key);
+      if (cache_handle != NULL) {
+        block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
+      } else {
+        s = ReadBlock(table->rep_->file, options, handle, &block);
+        if (s.ok() && options.fill_cache) {
+          cache_handle = block_cache->Insert(
+              key, block, block->size(), &DeleteCachedBlock);
+        }
+      }
+    } else {
+      s = ReadBlock(table->rep_->file, options, handle, &block);
+    }
+  }
+
+  Iterator* iter;
+  if (block != NULL) {
+    iter = block->NewIterator(table->rep_->options.comparator);
+    if (cache_handle == NULL) {
+      iter->RegisterCleanup(&DeleteBlock, block, NULL);
+    } else {
+      iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
+    }
+  } else {
+    iter = NewErrorIterator(s);
+  }
+  return iter;
+}
+
+Iterator* Table::NewIterator(const ReadOptions& options) const {
+  return NewTwoLevelIterator(
+      rep_->index_block->NewIterator(rep_->options.comparator),
+      &Table::BlockReader, const_cast<Table*>(this), options);
+}
+
+uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
+  Iterator* index_iter =
+      rep_->index_block->NewIterator(rep_->options.comparator);
+  index_iter->Seek(key);
+  uint64_t result;
+  if (index_iter->Valid()) {
+    BlockHandle handle;
+    Slice input = index_iter->value();
+    Status s = handle.DecodeFrom(&input);
+    if (s.ok()) {
+      result = handle.offset();
+    } else {
+      // Strange: we can't decode the block handle in the index block.
+      // We'll just return the offset of the metaindex block, which is
+      // close to the whole file size for this case.
+      result = rep_->metaindex_handle.offset();
+    }
+  } else {
+    // key is past the last key in the file.  Approximate the offset
+    // by returning the offset of the metaindex block (which is
+    // right near the end of the file).
+    result = rep_->metaindex_handle.offset();
+  }
+  delete index_iter;
+  return result;
+}
+
+}
diff --git a/table/table_builder.cc b/table/table_builder.cc
new file mode 100644
index 0000000..38ad392
--- /dev/null
+++ b/table/table_builder.cc
@@ -0,0 +1,224 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/table_builder.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include "include/comparator.h"
+#include "include/env.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+struct TableBuilder::Rep {
+  Options options;
+  Options index_block_options;
+  WritableFile* file;
+  uint64_t offset;
+  Status status;
+  BlockBuilder data_block;
+  BlockBuilder index_block;
+  std::string last_key;
+  int64_t num_entries;
+  bool closed;          // Either Finish() or Abandon() has been called.
+
+  // We do not emit the index entry for a block until we have seen the
+  // first key for the next data block.  This allows us to use shorter
+  // keys in the index block.  For example, consider a block boundary
+  // between the keys "the quick brown fox" and "the who".  We can use
+  // "the r" as the key for the index block entry since it is >= all
+  // entries in the first block and < all entries in subsequent
+  // blocks.
+  //
+  // Invariant: r->pending_index_entry is true only if data_block is empty.
+  bool pending_index_entry;
+  BlockHandle pending_handle;  // Handle to add to index block
+
+  std::string compressed_output;
+
+  Rep(const Options& opt, WritableFile* f)
+      : options(opt),
+        index_block_options(opt),
+        file(f),
+        offset(0),
+        data_block(&options),
+        index_block(&index_block_options),
+        num_entries(0),
+        closed(false),
+        pending_index_entry(false) {
+    index_block_options.block_restart_interval = 1;
+  }
+};
+
+TableBuilder::TableBuilder(const Options& options, WritableFile* file)
+    : rep_(new Rep(options, file)) {
+}
+
+TableBuilder::~TableBuilder() {
+  assert(rep_->closed);  // Catch errors where caller forgot to call Finish()
+  delete rep_;
+}
+
+Status TableBuilder::ChangeOptions(const Options& options) {
+  // Note: if more fields are added to Options, update
+  // this function to catch changes that should not be allowed to
+  // change in the middle of building a Table.
+  if (options.comparator != rep_->options.comparator) {
+    return Status::InvalidArgument("changing comparator while building table");
+  }
+
+  // Note that any live BlockBuilders point to rep_->options and therefore
+  // will automatically pick up the updated options.
+  rep_->options = options;
+  rep_->index_block_options = options;
+  rep_->index_block_options.block_restart_interval = 1;
+  return Status::OK();
+}
+
+void TableBuilder::Add(const Slice& key, const Slice& value) {
+  Rep* r = rep_;
+  assert(!r->closed);
+  if (!ok()) return;
+  if (r->num_entries > 0) {
+    assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
+  }
+
+  if (r->pending_index_entry) {
+    assert(r->data_block.empty());
+    r->options.comparator->FindShortestSeparator(&r->last_key, key);
+    std::string handle_encoding;
+    r->pending_handle.EncodeTo(&handle_encoding);
+    r->index_block.Add(r->last_key, Slice(handle_encoding));
+    r->pending_index_entry = false;
+  }
+
+  r->last_key.assign(key.data(), key.size());
+  r->num_entries++;
+  r->data_block.Add(key, value);
+
+  const size_t estimated_block_size = r->data_block.CurrentSizeEstimate();
+  if (estimated_block_size >= r->options.block_size) {
+    Flush();
+  }
+}
+
+void TableBuilder::Flush() {
+  Rep* r = rep_;
+  assert(!r->closed);
+  if (!ok()) return;
+  if (r->data_block.empty()) return;
+  assert(!r->pending_index_entry);
+  WriteBlock(&r->data_block, &r->pending_handle);
+  if (ok()) {
+    r->pending_index_entry = true;
+    r->status = r->file->Flush();
+  }
+}
+
+void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
+  // File format contains a sequence of blocks where each block has:
+  //    block_data: uint8[n]
+  //    type: uint8
+  //    crc: uint32
+  assert(ok());
+  Rep* r = rep_;
+  Slice raw = block->Finish();
+
+  Slice block_contents;
+  CompressionType type = r->options.compression;
+  // TODO(postrelease): Support more compression options: zlib?
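+  // Worked example of the policy below: for a 4096-byte raw block the
+  // compressed form is kept only if it is smaller than
+  // 4096 - 4096/8 = 3584 bytes; unless compression saves at least 12.5%,
+  // the block is stored uncompressed.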
+  switch (type) {
+    case kNoCompression:
+      block_contents = raw;
+      break;
+
+    case kLightweightCompression: {
+      port::Lightweight_Compress(raw.data(), raw.size(), &r->compressed_output);
+      block_contents = r->compressed_output;
+      if (block_contents.size() >= raw.size() - (raw.size() / 8u)) {
+        // Compressed less than 12.5%, so just store uncompressed form
+        block_contents = raw;
+        type = kNoCompression;
+      }
+      break;
+    }
+  }
+  handle->set_offset(r->offset);
+  handle->set_size(block_contents.size());
+  r->status = r->file->Append(block_contents);
+  if (r->status.ok()) {
+    char trailer[kBlockTrailerSize];
+    trailer[0] = type;
+    uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size());
+    crc = crc32c::Extend(crc, trailer, 1);  // Extend crc to cover block type
+    EncodeFixed32(trailer+1, crc32c::Mask(crc));
+    r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
+    if (r->status.ok()) {
+      r->offset += block_contents.size() + kBlockTrailerSize;
+    }
+  }
+  r->compressed_output.clear();
+  block->Reset();
+}
+
+Status TableBuilder::status() const {
+  return rep_->status;
+}
+
+Status TableBuilder::Finish() {
+  Rep* r = rep_;
+  Flush();
+  assert(!r->closed);
+  r->closed = true;
+  BlockHandle metaindex_block_handle;
+  BlockHandle index_block_handle;
+  if (ok()) {
+    BlockBuilder meta_index_block(&r->options);
+    // TODO(postrelease): Add stats and other meta blocks
+    WriteBlock(&meta_index_block, &metaindex_block_handle);
+  }
+  if (ok()) {
+    if (r->pending_index_entry) {
+      r->options.comparator->FindShortSuccessor(&r->last_key);
+      std::string handle_encoding;
+      r->pending_handle.EncodeTo(&handle_encoding);
+      r->index_block.Add(r->last_key, Slice(handle_encoding));
+      r->pending_index_entry = false;
+    }
+    WriteBlock(&r->index_block, &index_block_handle);
+  }
+  if (ok()) {
+    Footer footer;
+    footer.set_metaindex_handle(metaindex_block_handle);
+    footer.set_index_handle(index_block_handle);
+    std::string footer_encoding;
+    footer.EncodeTo(&footer_encoding);
+    r->status = r->file->Append(footer_encoding);
+    if (r->status.ok()) {
+      r->offset += footer_encoding.size();
+    }
+  }
+  return r->status;
+}
+
+void TableBuilder::Abandon() {
+  Rep* r = rep_;
+  assert(!r->closed);
+  r->closed = true;
+}
+
+uint64_t TableBuilder::NumEntries() const {
+  return rep_->num_entries;
+}
+
+uint64_t TableBuilder::FileSize() const {
+  return rep_->offset;
+}
+
+}
diff --git a/table/table_test.cc b/table/table_test.cc
new file mode 100644
index 0000000..f4bd7c7
--- /dev/null
+++ b/table/table_test.cc
@@ -0,0 +1,808 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/table.h"
+
+#include <map>
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "include/db.h"
+#include "include/env.h"
+#include "include/iterator.h"
+#include "include/table_builder.h"
+#include "table/block.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "util/random.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+// Return reverse of "key".
+// Used to test non-lexicographic comparators.
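+// For example, Reverse("abc") == "cba", so the ReverseKeyComparator below
+// orders keys by their byte-reversed form.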
+static std::string Reverse(const Slice& key) {
+  std::string str(key.ToString());
+  std::string rev(str.rbegin(), str.rend());
+  return rev;
+}
+
+namespace {
+class ReverseKeyComparator : public Comparator {
+ public:
+  virtual const char* Name() const {
+    return "leveldb.ReverseBytewiseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    std::string s = Reverse(*start);
+    std::string l = Reverse(limit);
+    BytewiseComparator()->FindShortestSeparator(&s, l);
+    *start = Reverse(s);
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {
+    std::string s = Reverse(*key);
+    BytewiseComparator()->FindShortSuccessor(&s);
+    *key = Reverse(s);
+  }
+};
+}
+static ReverseKeyComparator reverse_key_comparator;
+
+static void Increment(const Comparator* cmp, std::string* key) {
+  if (cmp == BytewiseComparator()) {
+    key->push_back('\0');
+  } else {
+    assert(cmp == &reverse_key_comparator);
+    std::string rev = Reverse(*key);
+    rev.push_back('\0');
+    *key = Reverse(rev);
+  }
+}
+
+// An STL comparator that uses a Comparator
+namespace {
+struct STLLessThan {
+  const Comparator* cmp;
+
+  STLLessThan() : cmp(BytewiseComparator()) { }
+  STLLessThan(const Comparator* c) : cmp(c) { }
+  bool operator()(const std::string& a, const std::string& b) const {
+    return cmp->Compare(Slice(a), Slice(b)) < 0;
+  }
+};
+}
+
+class StringSink: public WritableFile {
+ public:
+  ~StringSink() { }
+
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+  virtual Status Append(const Slice& data) {
+    contents_.append(data.data(), data.size());
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+class StringSource: public RandomAccessFile {
+ public:
+  StringSource(const Slice& contents)
+      : contents_(contents.data(), contents.size()) {
+  }
+
+  virtual ~StringSource() { }
+
+  virtual uint64_t Size() const { return contents_.size(); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    if (offset > contents_.size()) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - offset;
+    }
+    memcpy(scratch, &contents_[offset], n);
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+typedef std::map<std::string, std::string, STLLessThan> KVMap;
+
+// Helper class for tests to unify the interface between
+// BlockBuilder/TableBuilder and Block/Table.
+class Constructor {
+ public:
+  explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { }
+  virtual ~Constructor() { }
+
+  void Add(const std::string& key, const Slice& value) {
+    data_[key] = value.ToString();
+  }
+
+  // Finish constructing the data structure with all the keys that have
+  // been added so far.
Returns the keys in sorted order in "*keys"
+  // and stores the key/value pairs in "*kvmap"
+  void Finish(const Options& options,
+              std::vector<std::string>* keys,
+              KVMap* kvmap) {
+    *kvmap = data_;
+    keys->clear();
+    for (KVMap::const_iterator it = data_.begin();
+         it != data_.end();
+         ++it) {
+      keys->push_back(it->first);
+    }
+    data_.clear();
+    Status s = FinishImpl(options, *kvmap);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  // Construct the data structure from the data in "data"
+  virtual Status FinishImpl(const Options& options, const KVMap& data) = 0;
+
+  virtual size_t NumBytes() const = 0;
+
+  virtual Iterator* NewIterator() const = 0;
+
+  virtual const KVMap& data() { return data_; }
+
+ private:
+  KVMap data_;
+};
+
+class BlockConstructor: public Constructor {
+ public:
+  explicit BlockConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        comparator_(cmp),
+        block_size_(-1),
+        block_(NULL) { }
+  ~BlockConstructor() {
+    delete block_;
+  }
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    delete block_;
+    block_ = NULL;
+    BlockBuilder builder(&options);
+
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      builder.Add(it->first, it->second);
+    }
+    // Open the block
+    Slice block_data = builder.Finish();
+    block_size_ = block_data.size();
+    char* block_data_copy = new char[block_size_];
+    memcpy(block_data_copy, block_data.data(), block_size_);
+    block_ = new Block(block_data_copy, block_size_);
+    return Status::OK();
+  }
+  virtual size_t NumBytes() const { return block_size_; }
+
+  virtual Iterator* NewIterator() const {
+    return block_->NewIterator(comparator_);
+  }
+
+ private:
+  const Comparator* comparator_;
+  int block_size_;
+  Block* block_;
+
+  BlockConstructor();
+};
+
+class TableConstructor: public Constructor {
+ public:
+  TableConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        source_(NULL), table_(NULL) {
+  }
+  ~TableConstructor() {
+    Reset();
+  }
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    Reset();
+    StringSink sink;
+    TableBuilder builder(options, &sink);
+
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      builder.Add(it->first, it->second);
+      ASSERT_TRUE(builder.status().ok());
+    }
+    Status s = builder.Finish();
+    ASSERT_TRUE(s.ok()) << s.ToString();
+
+    ASSERT_EQ(sink.contents().size(), builder.FileSize());
+
+    // Open the table
+    source_ = new StringSource(sink.contents());
+    Options table_options;
+    table_options.comparator = options.comparator;
+    return Table::Open(table_options, source_, &table_);
+  }
+  virtual size_t NumBytes() const { return source_->Size(); }
+
+  virtual Iterator* NewIterator() const {
+    return table_->NewIterator(ReadOptions());
+  }
+
+  uint64_t ApproximateOffsetOf(const Slice& key) const {
+    return table_->ApproximateOffsetOf(key);
+  }
+
+ private:
+  void Reset() {
+    delete table_;
+    delete source_;
+    table_ = NULL;
+    source_ = NULL;
+  }
+
+  StringSource* source_;
+  Table* table_;
+
+  TableConstructor();
+};
+
+// A helper class that converts internal format keys into user keys
+class KeyConvertingIterator: public Iterator {
+ public:
+  explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { }
+  virtual ~KeyConvertingIterator() { delete iter_; }
+  virtual bool Valid() const { return iter_->Valid(); }
+  virtual void Seek(const Slice& target) {
+    ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
+    std::string encoded;
+    AppendInternalKey(&encoded, ikey);
+    iter_->Seek(encoded);
+  }
+  virtual void
SeekToFirst() { iter_->SeekToFirst(); }
+  virtual void SeekToLast() { iter_->SeekToLast(); }
+  virtual void Next() { iter_->Next(); }
+  virtual void Prev() { iter_->Prev(); }
+
+  virtual Slice key() const {
+    assert(Valid());
+    ParsedInternalKey key;
+    if (!ParseInternalKey(iter_->key(), &key)) {
+      status_ = Status::Corruption("malformed internal key");
+      return Slice("corrupted key");
+    }
+    return key.user_key;
+  }
+
+  virtual Slice value() const { return iter_->value(); }
+  virtual Status status() const {
+    return status_.ok() ? iter_->status() : status_;
+  }
+
+ private:
+  mutable Status status_;
+  Iterator* iter_;
+
+  // No copying allowed
+  KeyConvertingIterator(const KeyConvertingIterator&);
+  void operator=(const KeyConvertingIterator&);
+};
+
+class MemTableConstructor: public Constructor {
+ public:
+  explicit MemTableConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        internal_comparator_(cmp) {
+    memtable_ = new MemTable(internal_comparator_);
+  }
+  ~MemTableConstructor() {
+    delete memtable_;
+  }
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    delete memtable_;
+    memtable_ = new MemTable(internal_comparator_);
+    int seq = 1;
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      memtable_->Add(seq, kTypeValue, it->first, it->second);
+      seq++;
+    }
+    return Status::OK();
+  }
+  virtual size_t NumBytes() const {
+    return memtable_->ApproximateMemoryUsage();
+  }
+
+  virtual Iterator* NewIterator() const {
+    return new KeyConvertingIterator(memtable_->NewIterator());
+  }
+
+ private:
+  InternalKeyComparator internal_comparator_;
+  MemTable* memtable_;
+};
+
+class DBConstructor: public Constructor {
+ public:
+  explicit DBConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        comparator_(cmp) {
+    db_ = NULL;
+    NewDB();
+  }
+  ~DBConstructor() {
+    delete db_;
+  }
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    delete db_;
+    db_ = NULL;
+    NewDB();
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      WriteBatch batch;
+      batch.Put(it->first, it->second);
+      ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok());
+    }
+    return Status::OK();
+  }
+  virtual size_t NumBytes() const {
+    Range r("", "\xff\xff");
+    uint64_t size;
+    db_->GetApproximateSizes(&r, 1, &size);
+    return size;
+  }
+
+  virtual Iterator* NewIterator() const {
+    return db_->NewIterator(ReadOptions());
+  }
+
+ private:
+  void NewDB() {
+    std::string name = test::TmpDir() + "/table_testdb";
+
+    Options options;
+    options.comparator = comparator_;
+    Status status = DestroyDB(name, options);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+
+    options.create_if_missing = true;
+    options.error_if_exists = true;
+    status = DB::Open(options, name, &db_);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+  }
+
+  const Comparator* comparator_;
+  DB* db_;
+};
+
+enum TestType {
+  TABLE_TEST,
+  BLOCK_TEST,
+  MEMTABLE_TEST,
+  DB_TEST,
+};
+
+struct TestArgs {
+  TestType type;
+  bool reverse_compare;
+  int restart_interval;
+};
+
+static const TestArgs kTestArgList[] = {
+  { TABLE_TEST, false, 16 },
+  { TABLE_TEST, false, 1 },
+  { TABLE_TEST, false, 1024 },
+  { TABLE_TEST, true, 16 },
+  { TABLE_TEST, true, 1 },
+  { TABLE_TEST, true, 1024 },
+
+  { BLOCK_TEST, false, 16 },
+  { BLOCK_TEST, false, 1 },
+  { BLOCK_TEST, false, 1024 },
+  { BLOCK_TEST, true, 16 },
+  { BLOCK_TEST, true, 1 },
+  { BLOCK_TEST, true, 1024 },
+
+  // Restart interval does not matter for memtables
+  { MEMTABLE_TEST, false, 16 },
+  { MEMTABLE_TEST, true, 16 },
+
+  // Do not bother with restart interval variations for DB
+  { DB_TEST, false, 16 },
+  { DB_TEST, true, 16 },
+};
+static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]);
+
+class Harness {
+ public:
+  Harness() : constructor_(NULL) { }
+
+  void Init(const TestArgs& args) {
+    delete constructor_;
+    constructor_ = NULL;
+    options_ = Options();
+
+    options_.block_restart_interval = args.restart_interval;
+    // Use shorter block size for tests to exercise block boundary
+    // conditions more.
+    options_.block_size = 256;
+    if (args.reverse_compare) {
+      options_.comparator = &reverse_key_comparator;
+    }
+    switch (args.type) {
+      case TABLE_TEST:
+        constructor_ = new TableConstructor(options_.comparator);
+        break;
+      case BLOCK_TEST:
+        constructor_ = new BlockConstructor(options_.comparator);
+        break;
+      case MEMTABLE_TEST:
+        constructor_ = new MemTableConstructor(options_.comparator);
+        break;
+      case DB_TEST:
+        constructor_ = new DBConstructor(options_.comparator);
+        break;
+    }
+  }
+
+  ~Harness() {
+    delete constructor_;
+  }
+
+  void Add(const std::string& key, const std::string& value) {
+    constructor_->Add(key, value);
+  }
+
+  void Test(Random* rnd) {
+    std::vector<std::string> keys;
+    KVMap data;
+    constructor_->Finish(options_, &keys, &data);
+
+    TestForwardScan(keys, data);
+    TestBackwardScan(keys, data);
+    TestRandomAccess(rnd, keys, data);
+  }
+
+  void TestForwardScan(const std::vector<std::string>& keys,
+                       const KVMap& data) {
+    Iterator* iter = constructor_->NewIterator();
+    ASSERT_TRUE(!iter->Valid());
+    iter->SeekToFirst();
+    for (KVMap::const_iterator model_iter = data.begin();
+         model_iter != data.end();
+         ++model_iter) {
+      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+      iter->Next();
+    }
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  }
+
+  void TestBackwardScan(const std::vector<std::string>& keys,
+                        const KVMap& data) {
+    Iterator* iter = constructor_->NewIterator();
+    ASSERT_TRUE(!iter->Valid());
+    iter->SeekToLast();
+    for (KVMap::const_reverse_iterator model_iter = data.rbegin();
+         model_iter != data.rend();
+         ++model_iter) {
+      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+      iter->Prev();
+    }
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  }
+
+  void TestRandomAccess(Random* rnd,
+                        const std::vector<std::string>& keys,
+                        const KVMap& data) {
+    static const bool kVerbose = false;
+    Iterator* iter = constructor_->NewIterator();
+    ASSERT_TRUE(!iter->Valid());
+    KVMap::const_iterator model_iter = data.begin();
+    if (kVerbose) fprintf(stderr, "---\n");
+    for (int i = 0; i < 200; i++) {
+      const int toss = rnd->Uniform(5);
+      switch (toss) {
+        case 0: {
+          if (iter->Valid()) {
+            if (kVerbose) fprintf(stderr, "Next\n");
+            iter->Next();
+            ++model_iter;
+            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          }
+          break;
+        }
+
+        case 1: {
+          if (kVerbose) fprintf(stderr, "SeekToFirst\n");
+          iter->SeekToFirst();
+          model_iter = data.begin();
+          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          break;
+        }
+
+        case 2: {
+          std::string key = PickRandomKey(rnd, keys);
+          model_iter = data.lower_bound(key);
+          if (kVerbose) fprintf(stderr, "Seek '%s'\n",
+                                EscapeString(key).c_str());
+          iter->Seek(Slice(key));
+          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          break;
+        }
+
+        case 3: {
+          if (iter->Valid()) {
+            if (kVerbose) fprintf(stderr, "Prev\n");
+            iter->Prev();
+            if (model_iter == data.begin()) {
+              model_iter = data.end();  // Wrap around to invalid value
+            } else {
+              --model_iter;
+            }
+            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          }
+          break;
+        }
+
+        case 4: {
+          if (kVerbose)
fprintf(stderr, "SeekToLast\n");
+          iter->SeekToLast();
+          if (keys.empty()) {
+            model_iter = data.end();
+          } else {
+            std::string last = data.rbegin()->first;
+            model_iter = data.lower_bound(last);
+          }
+          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          break;
+        }
+      }
+    }
+    delete iter;
+  }
+
+  std::string ToString(const KVMap& data, const KVMap::const_iterator& it) {
+    if (it == data.end()) {
+      return "END";
+    } else {
+      return "'" + it->first + "->" + it->second + "'";
+    }
+  }
+
+  std::string ToString(const KVMap& data,
+                       const KVMap::const_reverse_iterator& it) {
+    if (it == data.rend()) {
+      return "END";
+    } else {
+      return "'" + it->first + "->" + it->second + "'";
+    }
+  }
+
+  std::string ToString(const Iterator* it) {
+    if (!it->Valid()) {
+      return "END";
+    } else {
+      return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
+    }
+  }
+
+  std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
+    if (keys.empty()) {
+      return "foo";
+    } else {
+      const int index = rnd->Uniform(keys.size());
+      std::string result = keys[index];
+      switch (rnd->Uniform(3)) {
+        case 0:
+          // Return an existing key
+          break;
+        case 1: {
+          // Attempt to return something smaller than an existing key
+          if (result.size() > 0 && result[result.size()-1] > '\0') {
+            result[result.size()-1]--;
+          }
+          break;
+        }
+        case 2: {
+          // Return something larger than an existing key
+          Increment(options_.comparator, &result);
+          break;
+        }
+      }
+      return result;
+    }
+  }
+
+ private:
+  Options options_;
+  Constructor* constructor_;
+};
+
+// Test the empty key
+TEST(Harness, SimpleEmptyKey) {
+  for (int i = 0; i < kNumTestArgs; i++) {
+    Init(kTestArgList[i]);
+    Random rnd(test::RandomSeed() + 1);
+    Add("", "v");
+    Test(&rnd);
+  }
+}
+
+TEST(Harness, SimpleSingle) {
+  for (int i = 0; i < kNumTestArgs; i++) {
+    Init(kTestArgList[i]);
+    Random rnd(test::RandomSeed() + 2);
+    Add("abc", "v");
+    Test(&rnd);
+  }
+}
+
+TEST(Harness, SimpleMulti) {
+  for (int i = 0; i < kNumTestArgs; i++) {
+    Init(kTestArgList[i]);
+    Random rnd(test::RandomSeed() + 3);
+    Add("abc", "v");
+    Add("abcd", "v");
+    Add("ac", "v2");
+    Test(&rnd);
+  }
+}
+
+TEST(Harness, SimpleSpecialKey) {
+  for (int i = 0; i < kNumTestArgs; i++) {
+    Init(kTestArgList[i]);
+    Random rnd(test::RandomSeed() + 4);
+    Add("\xff\xff", "v3");
+    Test(&rnd);
+  }
+}
+
+TEST(Harness, Randomized) {
+  for (int i = 0; i < kNumTestArgs; i++) {
+    Init(kTestArgList[i]);
+    Random rnd(test::RandomSeed() + 5);
+    for (int num_entries = 0; num_entries < 2000;
+         num_entries += (num_entries < 50 ?
1 : 200)) {
+      if ((num_entries % 10) == 0) {
+        fprintf(stderr, "case %d of %d: num_entries = %d\n",
+                (i + 1), int(kNumTestArgs), num_entries);
+      }
+      for (int e = 0; e < num_entries; e++) {
+        std::string v;
+        Add(test::RandomKey(&rnd, rnd.Skewed(4)),
+            test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
+      }
+      Test(&rnd);
+    }
+  }
+}
+
+class MemTableTest { };
+
+TEST(MemTableTest, Simple) {
+  InternalKeyComparator cmp(BytewiseComparator());
+  MemTable memtable(cmp);
+  WriteBatch batch;
+  WriteBatchInternal::SetSequence(&batch, 100);
+  batch.Put(std::string("k1"), std::string("v1"));
+  batch.Put(std::string("k2"), std::string("v2"));
+  batch.Put(std::string("k3"), std::string("v3"));
+  batch.Put(std::string("largekey"), std::string("vlarge"));
+  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok());
+
+  Iterator* iter = memtable.NewIterator();
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    fprintf(stderr, "key: '%s' -> '%s'\n",
+            iter->key().ToString().c_str(),
+            iter->value().ToString().c_str());
+    iter->Next();
+  }
+
+  delete iter;
+}
+
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  bool result = (val >= low) && (val <= high);
+  if (!result) {
+    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+            (unsigned long long)(val),
+            (unsigned long long)(low),
+            (unsigned long long)(high));
+  }
+  return result;
+}
+
+class TableTest { };
+
+TEST(TableTest, ApproximateOffsetOfPlain) {
+  TableConstructor c(BytewiseComparator());
+  c.Add("k01", "hello");
+  c.Add("k02", "hello2");
+  c.Add("k03", std::string(10000, 'x'));
+  c.Add("k04", std::string(200000, 'x'));
+  c.Add("k05", std::string(300000, 'x'));
+  c.Add("k06", "hello3");
+  c.Add("k07", std::string(100000, 'x'));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  options.block_size = 1024;
+  options.compression = kNoCompression;
+  c.Finish(options, &keys, &kvmap);
+
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000));
+}
+
+TEST(TableTest, ApproximateOffsetOfCompressed) {
+#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM)
+  // Compression not supported yet, so skip this test.
+  // TODO(sanjay) Reenable after compression support is added
+  return;
+#endif
+
+  Random rnd(301);
+  TableConstructor c(BytewiseComparator());
+  std::string tmp;
+  c.Add("k01", "hello");
+  c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+  c.Add("k03", "hello3");
+  c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  options.block_size = 1024;
+  options.compression = kLightweightCompression;
+  c.Finish(options, &keys, &kvmap);
+
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000));
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc
new file mode 100644
index 0000000..9b081f4
--- /dev/null
+++ b/table/two_level_iterator.cc
@@ -0,0 +1,182 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/two_level_iterator.h"
+
+#include "include/table.h"
+#include "table/block.h"
+#include "table/format.h"
+#include "table/iterator_wrapper.h"
+
+namespace leveldb {
+
+namespace {
+
+typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&);
+
+class TwoLevelIterator: public Iterator {
+ public:
+  TwoLevelIterator(
+      Iterator* index_iter,
+      BlockFunction block_function,
+      void* arg,
+      const ReadOptions& options);
+
+  virtual ~TwoLevelIterator();
+
+  virtual void Seek(const Slice& target);
+  virtual void SeekToFirst();
+  virtual void SeekToLast();
+  virtual void Next();
+  virtual void Prev();
+
+  virtual bool Valid() const {
+    return data_iter_.Valid();
+  }
+  virtual Slice key() const {
+    assert(Valid());
+    return data_iter_.key();
+  }
+  virtual Slice value() const {
+    assert(Valid());
+    return data_iter_.value();
+  }
+  virtual Status status() const {
+    // It'd be nice if status() returned a const Status& instead of a Status
+    if (!index_iter_.status().ok()) {
+      return index_iter_.status();
+    } else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) {
+      return data_iter_.status();
+    } else {
+      return status_;
+    }
+  }
+
+ private:
+  void SaveError(const Status& s) {
+    if (status_.ok() && !s.ok()) status_ = s;
+  }
+  void SkipEmptyDataBlocksForward();
+  void SkipEmptyDataBlocksBackward();
+  void SetDataIterator(Iterator* data_iter);
+  void InitDataBlock();
+
+  BlockFunction block_function_;
+  void* arg_;
+  const ReadOptions options_;
+  Status status_;
+  IteratorWrapper index_iter_;
+  IteratorWrapper data_iter_; // May be NULL
+  // If data_iter_ is non-NULL, then "data_block_handle_" holds the
+  // "index_value" passed to block_function_ to create the data_iter_.
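+  // Caching the raw handle bytes lets InitDataBlock() below skip
+  // re-creating data_iter_ when a seek lands in the block it is already
+  // positioned in: the new index value is simply compared against
+  // data_block_handle_ before calling block_function_.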
+  std::string data_block_handle_;
+};
+
+TwoLevelIterator::TwoLevelIterator(
+    Iterator* index_iter,
+    BlockFunction block_function,
+    void* arg,
+    const ReadOptions& options)
+    : block_function_(block_function),
+      arg_(arg),
+      options_(options),
+      index_iter_(index_iter),
+      data_iter_(NULL) {
+}
+
+TwoLevelIterator::~TwoLevelIterator() {
+}
+
+void TwoLevelIterator::Seek(const Slice& target) {
+  index_iter_.Seek(target);
+  InitDataBlock();
+  if (data_iter_.iter() != NULL) data_iter_.Seek(target);
+  SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIterator::SeekToFirst() {
+  index_iter_.SeekToFirst();
+  InitDataBlock();
+  if (data_iter_.iter() != NULL) data_iter_.SeekToFirst();
+  SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIterator::SeekToLast() {
+  index_iter_.SeekToLast();
+  InitDataBlock();
+  if (data_iter_.iter() != NULL) data_iter_.SeekToLast();
+  SkipEmptyDataBlocksBackward();
+}
+
+void TwoLevelIterator::Next() {
+  assert(Valid());
+  data_iter_.Next();
+  SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIterator::Prev() {
+  assert(Valid());
+  data_iter_.Prev();
+  SkipEmptyDataBlocksBackward();
+}
+
+
+void TwoLevelIterator::SkipEmptyDataBlocksForward() {
+  while (data_iter_.iter() == NULL || !data_iter_.Valid()) {
+    // Move to next block
+    if (!index_iter_.Valid()) {
+      SetDataIterator(NULL);
+      return;
+    }
+    index_iter_.Next();
+    InitDataBlock();
+    if (data_iter_.iter() != NULL) data_iter_.SeekToFirst();
+  }
+}
+
+void TwoLevelIterator::SkipEmptyDataBlocksBackward() {
+  while (data_iter_.iter() == NULL || !data_iter_.Valid()) {
+    // Move to preceding block
+    if (!index_iter_.Valid()) {
+      SetDataIterator(NULL);
+      return;
+    }
+    index_iter_.Prev();
+    InitDataBlock();
+    if (data_iter_.iter() != NULL) data_iter_.SeekToLast();
+  }
+}
+
+void TwoLevelIterator::SetDataIterator(Iterator* data_iter) {
+  if (data_iter_.iter() != NULL) SaveError(data_iter_.status());
+  data_iter_.Set(data_iter);
+}
+
+void TwoLevelIterator::InitDataBlock() {
+  if (!index_iter_.Valid()) {
+    SetDataIterator(NULL);
+  } else {
+    Slice handle = index_iter_.value();
+    if (data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0) {
+      // data_iter_ is already constructed with this iterator, so
+      // no need to change anything
+    } else {
+      Iterator* iter = (*block_function_)(arg_, options_, handle);
+      data_block_handle_.assign(handle.data(), handle.size());
+      SetDataIterator(iter);
+    }
+  }
+}
+
+}
+
+Iterator* NewTwoLevelIterator(
+    Iterator* index_iter,
+    BlockFunction block_function,
+    void* arg,
+    const ReadOptions& options) {
+  return new TwoLevelIterator(index_iter, block_function, arg, options);
+}
+
+}
diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h
new file mode 100644
index 0000000..57e439c
--- /dev/null
+++ b/table/two_level_iterator.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
+#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
+
+#include "include/iterator.h"
+
+namespace leveldb {
+
+struct ReadOptions;
+
+// Return a new two level iterator.  A two-level iterator contains an
+// index iterator whose values point to a sequence of blocks where
+// each block is itself a sequence of key,value pairs.  The returned
+// two-level iterator yields the concatenation of all key/value pairs
+// in the sequence of blocks.
Takes ownership of "index_iter" and
+// will delete it when no longer needed.
+//
+// Uses a supplied function to convert an index_iter value into
+// an iterator over the contents of the corresponding block.
+extern Iterator* NewTwoLevelIterator(
+    Iterator* index_iter,
+    Iterator* (*block_function)(
+        void* arg,
+        const ReadOptions& options,
+        const Slice& index_value),
+    void* arg,
+    const ReadOptions& options);
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
diff --git a/util/arena.cc b/util/arena.cc
new file mode 100644
index 0000000..4bf6e36
--- /dev/null
+++ b/util/arena.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/arena.h"
+#include <assert.h>
+
+namespace leveldb {
+
+static const int kBlockSize = 4096;
+
+Arena::Arena() {
+  blocks_memory_ = 0;
+  alloc_ptr_ = NULL;  // First allocation will allocate a block
+  alloc_bytes_remaining_ = 0;
+}
+
+Arena::~Arena() {
+  for (size_t i = 0; i < blocks_.size(); i++) {
+    delete[] blocks_[i];
+  }
+}
+
+char* Arena::AllocateFallback(size_t bytes) {
+  if (bytes > kBlockSize / 4) {
+    // Object is more than a quarter of our block size.  Allocate it separately
+    // to avoid wasting too much space in leftover bytes.
+    char* result = AllocateNewBlock(bytes);
+    return result;
+  }
+
+  // We waste the remaining space in the current block.
+  alloc_ptr_ = AllocateNewBlock(kBlockSize);
+  alloc_bytes_remaining_ = kBlockSize;
+
+  char* result = alloc_ptr_;
+  alloc_ptr_ += bytes;
+  alloc_bytes_remaining_ -= bytes;
+  return result;
+}
+
+char* Arena::AllocateAligned(size_t bytes) {
+  const int align = sizeof(void*);    // We'll align to pointer size
+  assert((align & (align-1)) == 0);   // Pointer size should be a power of 2
+  size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1);
+  size_t slop = (current_mod == 0 ? 0 : align - current_mod);
+  size_t needed = bytes + slop;
+  char* result;
+  if (needed <= alloc_bytes_remaining_) {
+    result = alloc_ptr_ + slop;
+    alloc_ptr_ += needed;
+    alloc_bytes_remaining_ -= needed;
+  } else {
+    // AllocateFallback always returns aligned memory
+    result = AllocateFallback(bytes);
+  }
+  assert((reinterpret_cast<uintptr_t>(result) & (align-1)) == 0);
+  return result;
+}
+
+char* Arena::AllocateNewBlock(size_t block_bytes) {
+  char* result = new char[block_bytes];
+  blocks_memory_ += block_bytes;
+  blocks_.push_back(result);
+  return result;
+}
+
+}
diff --git a/util/arena.h b/util/arena.h
new file mode 100644
index 0000000..fcb5d5b
--- /dev/null
+++ b/util/arena.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_
+#define STORAGE_LEVELDB_UTIL_ARENA_H_
+
+#include <cstddef>
+#include <vector>
+#include <assert.h>
+#include <stdint.h>
+
+namespace leveldb {
+
+class Arena {
+ public:
+  Arena();
+  ~Arena();
+
+  // Return a pointer to a newly allocated memory block of "bytes" bytes.
+  char* Allocate(size_t bytes);
+
+  // Allocate memory with the normal alignment guarantees provided by malloc
+  char* AllocateAligned(size_t bytes);
+
+  // Returns an estimate of the total memory usage of data allocated
+  // by the arena (including space allocated but not yet used for user
+  // allocations).
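+  // Usage sketch (illustrative only):
+  //     Arena arena;
+  //     char* p = arena.Allocate(100);        // bump-pointer allocation
+  //     char* q = arena.AllocateAligned(16);  // pointer-size aligned
+  //     size_t used = arena.MemoryUsage();
+  // Nothing is freed individually; all blocks are reclaimed when the
+  // Arena is destroyed.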
+  size_t MemoryUsage() const {
+    return blocks_memory_ + blocks_.capacity() * sizeof(char*);
+  }
+
+ private:
+  char* AllocateFallback(size_t bytes);
+  char* AllocateNewBlock(size_t block_bytes);
+
+  // Allocation state
+  char* alloc_ptr_;
+  size_t alloc_bytes_remaining_;
+
+  // Array of new[] allocated memory blocks
+  std::vector<char*> blocks_;
+
+  // Bytes of memory in blocks allocated so far
+  size_t blocks_memory_;
+
+  // No copying allowed
+  Arena(const Arena&);
+  void operator=(const Arena&);
+};
+
+inline char* Arena::Allocate(size_t bytes) {
+  // The semantics of what to return are a bit messy if we allow
+  // 0-byte allocations, so we disallow them here (we don't need
+  // them for our internal use).
+  assert(bytes > 0);
+  if (bytes <= alloc_bytes_remaining_) {
+    char* result = alloc_ptr_;
+    alloc_ptr_ += bytes;
+    alloc_bytes_remaining_ -= bytes;
+    return result;
+  }
+  return AllocateFallback(bytes);
+}
+
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_ARENA_H_
diff --git a/util/arena_test.cc b/util/arena_test.cc
new file mode 100644
index 0000000..c33b552
--- /dev/null
+++ b/util/arena_test.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/arena.h"
+
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class ArenaTest { };
+
+TEST(ArenaTest, Empty) {
+  Arena arena;
+}
+
+TEST(ArenaTest, Simple) {
+  std::vector<std::pair<size_t, char*> > allocated;
+  Arena arena;
+  const int N = 100000;
+  size_t bytes = 0;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    size_t s;
+    if (i % (N / 10) == 0) {
+      s = i;
+    } else {
+      s = rnd.OneIn(4000) ? rnd.Uniform(6000) :
+          (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
+    }
+    if (s == 0) {
+      // Our arena disallows size 0 allocations.
+      s = 1;
+    }
+    char* r;
+    if (rnd.OneIn(10)) {
+      r = arena.AllocateAligned(s);
+    } else {
+      r = arena.Allocate(s);
+    }
+
+    for (int b = 0; b < s; b++) {
+      // Fill the "i"th allocation with a known bit pattern
+      r[b] = i % 256;
+    }
+    bytes += s;
+    allocated.push_back(std::make_pair(s, r));
+    ASSERT_GE(arena.MemoryUsage(), bytes);
+    if (i > N/10) {
+      ASSERT_LE(arena.MemoryUsage(), bytes * 1.10);
+    }
+  }
+  for (int i = 0; i < allocated.size(); i++) {
+    size_t num_bytes = allocated[i].first;
+    const char* p = allocated[i].second;
+    for (int b = 0; b < num_bytes; b++) {
+      // Check the "i"th allocation for the known bit pattern
+      ASSERT_EQ(int(p[b]) & 0xff, i % 256);
+    }
+  }
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/util/cache.cc b/util/cache.cc
new file mode 100644
index 0000000..958de66
--- /dev/null
+++ b/util/cache.cc
@@ -0,0 +1,253 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID)
+#include <unordered_set>
+#elif defined(LEVELDB_PLATFORM_CHROMIUM)
+#include "base/hash_tables.h"
+#else
+#include <ext/hash_set> // TODO(sanjay): Switch to unordered_set when possible.
+#endif
+
+#include <stdlib.h>
+
+#include "include/cache.h"
+#include "port/port.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+Cache::~Cache() {
+}
+
+namespace {
+
+// LRU cache implementation
+
+// An entry is a variable length heap-allocated structure.
Entries
+// are kept in a circular doubly linked list ordered by access time.
+struct LRUHandle {
+  void* value;
+  void (*deleter)(const Slice&, void* value);
+  LRUHandle* next;
+  LRUHandle* prev;
+  size_t charge;      // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  size_t refs;        // TODO(opt): Pack with "key_length"?
+  char key_data[1];   // Beginning of key
+
+  Slice key() const {
+    // For cheaper lookups, we allow a temporary Handle object
+    // to store a pointer to a key in "value".
+    if (next == this) {
+      return *(reinterpret_cast<Slice*>(value));
+    } else {
+      return Slice(key_data, key_length);
+    }
+  }
+};
+
+// Pick a platform specific hash_set instantiation
+#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN)
+  // Microsoft's hash_set deviates from the standard. See
+  // http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx
+  // for details. Basically the 2 param () operator is a less than and
+  // the 1 param () operator is a hash function.
+  struct HandleHashCompare : public stdext::hash_compare<LRUHandle*> {
+    size_t operator() (LRUHandle* h) const {
+      Slice k = h->key();
+      return Hash(k.data(), k.size(), 0);
+    }
+    bool operator() (LRUHandle* a, LRUHandle* b) const {
+      return a->key().compare(b->key()) < 0;
+    }
+  };
+  typedef base::hash_set<LRUHandle*, HandleHashCompare> HandleTable;
+#else
+  struct HandleHash {
+    inline size_t operator()(LRUHandle* h) const {
+      Slice k = h->key();
+      return Hash(k.data(), k.size(), 0);
+    }
+  };
+
+  struct HandleEq {
+    inline bool operator()(LRUHandle* a, LRUHandle* b) const {
+      return a->key() == b->key();
+    }
+  };
+# if defined(LEVELDB_PLATFORM_CHROMIUM)
+  typedef base::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID)
+  typedef std::unordered_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# else
+  typedef __gnu_cxx::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# endif
+#endif
+
+class LRUCache : public Cache {
+ public:
+  explicit LRUCache(size_t capacity);
+  virtual ~LRUCache();
+
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value));
+  virtual Handle* Lookup(const Slice& key);
+  virtual void Release(Handle* handle);
+  virtual void* Value(Handle* handle);
+  virtual void Erase(const Slice& key);
+  virtual uint64_t NewId();
+
+ private:
+  void LRU_Remove(LRUHandle* e);
+  void LRU_Append(LRUHandle* e);
+  void Unref(LRUHandle* e);
+
+  // Constructor parameters
+  const size_t capacity_;
+
+  // mutex_ protects the following state.
+  port::Mutex mutex_;
+  size_t usage_;
+  uint64_t last_id_;
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
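+  // Shape of the circular list described above:
+  //     lru_.next -> oldest -> ... -> newest -> &lru_
+  // Eviction in Insert() pops lru_.next; LRU_Append() splices a handle
+  // in just before &lru_, making it the newest entry.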
+  LRUHandle lru_;
+
+  HandleTable table_;
+};
+
+LRUCache::LRUCache(size_t capacity)
+    : capacity_(capacity),
+      usage_(0),
+      last_id_(0) {
+  // Make empty circular linked list
+  lru_.next = &lru_;
+  lru_.prev = &lru_;
+}
+
+LRUCache::~LRUCache() {
+  table_.clear();
+  for (LRUHandle* e = lru_.next; e != &lru_; ) {
+    LRUHandle* next = e->next;
+    assert(e->refs == 1);  // Error if caller has an unreleased handle
+    Unref(e);
+    e = next;
+  }
+}
+
+void LRUCache::Unref(LRUHandle* e) {
+  assert(e->refs > 0);
+  e->refs--;
+  if (e->refs <= 0) {
+    usage_ -= e->charge;
+    (*e->deleter)(e->key(), e->value);
+    free(e);
+  }
+}
+
+void LRUCache::LRU_Remove(LRUHandle* e) {
+  e->next->prev = e->prev;
+  e->prev->next = e->next;
+}
+
+void LRUCache::LRU_Append(LRUHandle* e) {
+  // Make "e" newest entry by inserting just before lru_
+  e->next = &lru_;
+  e->prev = lru_.prev;
+  e->prev->next = e;
+  e->next->prev = e;
+}
+
+Cache::Handle* LRUCache::Lookup(const Slice& key) {
+  MutexLock l(&mutex_);
+
+  LRUHandle dummy;
+  dummy.next = &dummy;
+  dummy.value = const_cast<Slice*>(&key);
+  HandleTable::iterator iter = table_.find(&dummy);
+  if (iter == table_.end()) {
+    return NULL;
+  } else {
+    LRUHandle* e = const_cast<LRUHandle*>(*iter);
+    e->refs++;
+    LRU_Remove(e);
+    LRU_Append(e);
+    return reinterpret_cast<Cache::Handle*>(e);
+  }
+}
+
+void* LRUCache::Value(Handle* handle) {
+  return reinterpret_cast<LRUHandle*>(handle)->value;
+}
+
+void LRUCache::Release(Handle* handle) {
+  MutexLock l(&mutex_);
+  Unref(reinterpret_cast<LRUHandle*>(handle));
+}
+
+Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
+                                void (*deleter)(const Slice& key, void* value)) {
+  MutexLock l(&mutex_);
+
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(
+      malloc(sizeof(LRUHandle)-1 + key.size()));
+  e->value = value;
+  e->deleter = deleter;
+  e->charge = charge;
+  e->key_length = key.size();
+  e->refs = 2;  // One from LRUCache, one for the returned handle
+  memcpy(e->key_data, key.data(), key.size());
+  LRU_Append(e);
+  usage_ += charge;
+
+  std::pair<HandleTable::iterator,bool> p = table_.insert(e);
+  if (!p.second) {
+    // Kill existing entry
+    LRUHandle* old = const_cast<LRUHandle*>(*(p.first));
+    LRU_Remove(old);
+    table_.erase(p.first);
+    table_.insert(e);
+    Unref(old);
+  }
+
+  while (usage_ > capacity_ && lru_.next != &lru_) {
+    LRUHandle* old = lru_.next;
+    LRU_Remove(old);
+    table_.erase(old);
+    Unref(old);
+  }
+
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+void LRUCache::Erase(const Slice& key) {
+  MutexLock l(&mutex_);
+
+  LRUHandle dummy;
+  dummy.next = &dummy;
+  dummy.value = const_cast<Slice*>(&key);
+  HandleTable::iterator iter = table_.find(&dummy);
+  if (iter != table_.end()) {
+    LRUHandle* e = const_cast<LRUHandle*>(*iter);
+    LRU_Remove(e);
+    table_.erase(iter);
+    Unref(e);
+  }
+}
+
+uint64_t LRUCache::NewId() {
+  MutexLock l(&mutex_);
+  return ++(last_id_);
+}
+
+}  // end anonymous namespace
+
+Cache* NewLRUCache(size_t capacity) {
+  return new LRUCache(capacity);
+}
+
+}
diff --git a/util/cache_test.cc b/util/cache_test.cc
new file mode 100644
index 0000000..05de5d9
--- /dev/null
+++ b/util/cache_test.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/cache.h"
+
+#include <vector>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Conversions between numeric keys/values and the types expected by Cache.
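+// For instance, EncodeKey(1) is the 4-byte fixed32 encoding
+// "\x01\x00\x00\x00" (least significant byte first), and
+// EncodeValue/DecodeValue round-trip a small int through a void*
+// without any heap allocation.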
diff --git a/util/cache_test.cc b/util/cache_test.cc
new file mode 100644
index 0000000..05de5d9
--- /dev/null
+++ b/util/cache_test.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/cache.h"
+
+#include <vector>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Conversions between numeric keys/values and the types expected by Cache.
+static std::string EncodeKey(int k) {
+  std::string result;
+  PutFixed32(&result, k);
+  return result;
+}
+static int DecodeKey(const Slice& k) {
+  assert(k.size() == 4);
+  return DecodeFixed32(k.data());
+}
+static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
+static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); }
+
+class CacheTest {
+ public:
+  static CacheTest* current_;
+
+  static void Deleter(const Slice& key, void* v) {
+    current_->deleted_keys_.push_back(DecodeKey(key));
+    current_->deleted_values_.push_back(DecodeValue(v));
+  }
+
+  static const int kCacheSize = 100;
+  std::vector<int> deleted_keys_;
+  std::vector<int> deleted_values_;
+  Cache* cache_;
+
+  CacheTest() : cache_(NewLRUCache(kCacheSize)) {
+    current_ = this;
+  }
+
+  ~CacheTest() {
+    delete cache_;
+  }
+
+  int Lookup(int key) {
+    Cache::Handle* handle = cache_->Lookup(EncodeKey(key));
+    const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle));
+    if (handle != NULL) {
+      cache_->Release(handle);
+    }
+    return r;
+  }
+
+  void Insert(int key, int value, int charge = 1) {
+    cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
+                                   &CacheTest::Deleter));
+  }
+
+  void Erase(int key) {
+    cache_->Erase(EncodeKey(key));
+  }
+};
+CacheTest* CacheTest::current_;
+
+TEST(CacheTest, HitAndMiss) {
+  ASSERT_EQ(-1, Lookup(100));
+
+  Insert(100, 101);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  Insert(200, 201);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  Insert(100, 102);
+  ASSERT_EQ(102, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  ASSERT_EQ(1, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+}
+
+TEST(CacheTest, Erase) {
+  Erase(200);
+  ASSERT_EQ(0, deleted_keys_.size());
+
+  Insert(100, 101);
+  Insert(200, 201);
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1, deleted_keys_.size());
+}
+
+TEST(CacheTest, EntriesArePinned) {
+  Insert(100, 101);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
+
+  Insert(100, 102);
+  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
+  ASSERT_EQ(0, deleted_keys_.size());
+
+  cache_->Release(h1);
+  ASSERT_EQ(1, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(1, deleted_keys_.size());
+
+  cache_->Release(h2);
+  ASSERT_EQ(2, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[1]);
+  ASSERT_EQ(102, deleted_values_[1]);
+}
+
+TEST(CacheTest, EvictionPolicy) {
+  Insert(100, 101);
+  Insert(200, 201);
+
+  // Frequently used entry must be kept around
+  for (int i = 0; i < kCacheSize; i++) {
+    Insert(1000+i, 2000+i);
+    ASSERT_EQ(2000+i, Lookup(1000+i));
+    ASSERT_EQ(101, Lookup(100));
+  }
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(2, deleted_keys_.size());
+  ASSERT_EQ(200, deleted_keys_[0]);
+  ASSERT_EQ(201, deleted_values_[0]);
+}
+
+TEST(CacheTest, HeavyEntry) {
+  Insert(100, 101);
+  Insert(200, 201, kCacheSize);
+  ASSERT_EQ(1, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+}
+TEST(CacheTest, NewId) {
+  uint64_t a = cache_->NewId();
+  uint64_t b = cache_->NewId();
+  ASSERT_NE(a, b);
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/util/coding.cc b/util/coding.cc
new file mode 100644
index 0000000..680e2ad
--- /dev/null
+++ b/util/coding.cc
@@ -0,0 +1,194 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+namespace leveldb {
+
+void EncodeFixed32(char* buf, uint32_t value) {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+#endif
+}
+
+void EncodeFixed64(char* buf, uint64_t value) {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+  buf[4] = (value >> 32) & 0xff;
+  buf[5] = (value >> 40) & 0xff;
+  buf[6] = (value >> 48) & 0xff;
+  buf[7] = (value >> 56) & 0xff;
+#endif
+}
+
+void PutFixed32(std::string* dst, uint32_t value) {
+  char buf[sizeof(value)];
+  EncodeFixed32(buf, value);
+  dst->append(buf, sizeof(buf));
+}
+
+void PutFixed64(std::string* dst, uint64_t value) {
+  char buf[sizeof(value)];
+  EncodeFixed64(buf, value);
+  dst->append(buf, sizeof(buf));
+}
+
+char* EncodeVarint32(char* dst, uint32_t v) {
+  // Operate on characters as unsigneds
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  static const int B = 128;
+  if (v < (1<<7)) {
+    *(ptr++) = v;
+  } else if (v < (1<<14)) {
+    *(ptr++) = v | B;
+    *(ptr++) = v>>7;
+  } else if (v < (1<<21)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v>>7) | B;
+    *(ptr++) = v>>14;
+  } else if (v < (1<<28)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v>>7) | B;
+    *(ptr++) = (v>>14) | B;
+    *(ptr++) = v>>21;
+  } else {
+    *(ptr++) = v | B;
+    *(ptr++) = (v>>7) | B;
+    *(ptr++) = (v>>14) | B;
+    *(ptr++) = (v>>21) | B;
+    *(ptr++) = v>>28;
+  }
+  return reinterpret_cast<char*>(ptr);
+}
+
+void PutVarint32(std::string* dst, uint32_t v) {
+  char buf[5];
+  char* ptr = EncodeVarint32(buf, v);
+  dst->append(buf, ptr - buf);
+}
+
+char* EncodeVarint64(char* dst, uint64_t v) {
+  static const int B = 128;
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  while (v >= B) {
+    *(ptr++) = (v & (B-1)) | B;
+    v >>= 7;
+  }
+  *(ptr++) = v;
+  return reinterpret_cast<char*>(ptr);
+}
+
+void PutVarint64(std::string* dst, uint64_t v) {
+  char buf[10];
+  char* ptr = EncodeVarint64(buf, v);
+  dst->append(buf, ptr - buf);
+}
+
+void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
+  PutVarint32(dst, value.size());
+  dst->append(value.data(), value.size());
+}
+
+int VarintLength(uint64_t v) {
+  int len = 1;
+  while (v >= 128) {
+    v >>= 7;
+    len++;
+  }
+  return len;
+}
+
+const char* GetVarint32PtrFallback(const char* p,
+                                   const char* limit,
+                                   uint32_t* value) {
+  uint32_t result = 0;
+  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
+    uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return NULL;
+}
+
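+// Worked example of the wire format handled above: 300 (binary 100101100)
+// is emitted low-order 7 bits first, with the high bit of each byte used
+// as a continuation flag:
+//
+//   char buf[5];
+//   char* end = EncodeVarint32(buf, 300);     // buf = { 0xAC, 0x02 }
+//   uint32_t v;
+//   GetVarint32Ptr(buf, end, &v);             // v == 300
+//   assert(VarintLength(300) == end - buf);   // two bytes
+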
+bool GetVarint32(Slice* input, uint32_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint32Ptr(p, limit, value);
+  if (q == NULL) {
+    return false;
+  } else {
+    *input = Slice(q, limit - q);
+    return true;
+  }
+}
+
+const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
+  uint64_t result = 0;
+  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
+    uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return NULL;
+}
+
+bool GetVarint64(Slice* input, uint64_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint64Ptr(p, limit, value);
+  if (q == NULL) {
+    return false;
+  } else {
+    *input = Slice(q, limit - q);
+    return true;
+  }
+}
+
+const char* GetLengthPrefixedSlice(const char* p, const char* limit,
+                                   Slice* result) {
+  uint32_t len;
+  p = GetVarint32Ptr(p, limit, &len);
+  if (p == NULL) return NULL;
+  if (p + len > limit) return NULL;
+  *result = Slice(p, len);
+  return p + len;
+}
+
+bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
+  uint32_t len;
+  if (GetVarint32(input, &len) &&
+      input->size() >= len) {
+    *result = Slice(input->data(), len);
+    input->remove_prefix(len);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}
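PutLengthPrefixedSlice and the two GetLengthPrefixedSlice overloads above round-trip arbitrary byte strings; a short sketch of how they compose (buffer contents shown for illustration):

    std::string buf;
    PutLengthPrefixedSlice(&buf, Slice("foo"));   // appends 0x03 'f' 'o' 'o'
    PutLengthPrefixedSlice(&buf, Slice("ba"));    // appends 0x02 'b' 'a'

    Slice input(buf);
    Slice v;
    GetLengthPrefixedSlice(&input, &v);   // v == "foo"; input advances past it
    GetLengthPrefixedSlice(&input, &v);   // v == "ba"; input is now empty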
diff --git a/util/coding.h b/util/coding.h
new file mode 100644
index 0000000..a42e714
--- /dev/null
+++ b/util/coding.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Endian-neutral encoding:
+// * Fixed-length numbers are encoded with least-significant byte first
+// * In addition we support variable length "varint" encoding
+// * Strings are encoded prefixed by their length in varint format
+
+#ifndef STORAGE_LEVELDB_UTIL_CODING_H_
+#define STORAGE_LEVELDB_UTIL_CODING_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <string>
+#include "include/slice.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+// Standard Put... routines append to a string
+extern void PutFixed32(std::string* dst, uint32_t value);
+extern void PutFixed64(std::string* dst, uint64_t value);
+extern void PutVarint32(std::string* dst, uint32_t value);
+extern void PutVarint64(std::string* dst, uint64_t value);
+extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
+
+// Standard Get... routines parse a value from the beginning of a Slice
+// and advance the slice past the parsed value.
+extern bool GetVarint32(Slice* input, uint32_t* value);
+extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+
+// Pointer-based variants of GetVarint...  These either store a value
+// in *v and return a pointer just past the parsed value, or return
+// NULL on error.  These routines only look at bytes in the range
+// [p..limit-1]
+extern const char* GetVarint32Ptr(const char* p, const char* limit, uint32_t* v);
+extern const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* v);
+
+// Returns the length of the varint32 or varint64 encoding of "v"
+extern int VarintLength(uint64_t v);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// REQUIRES: dst has enough space for the value being written
+extern void EncodeFixed32(char* dst, uint32_t value);
+extern void EncodeFixed64(char* dst, uint64_t value);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// and return a pointer just past the last byte written.
+// REQUIRES: dst has enough space for the value being written
+extern char* EncodeVarint32(char* dst, uint32_t value);
+extern char* EncodeVarint64(char* dst, uint64_t value);
+
+// Lower-level versions of Get... that read directly from a character buffer
+// without any bounds checking.
+
+inline uint32_t DecodeFixed32(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint32_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    return ((static_cast<uint32_t>(ptr[0]))
+            | (static_cast<uint32_t>(ptr[1]) << 8)
+            | (static_cast<uint32_t>(ptr[2]) << 16)
+            | (static_cast<uint32_t>(ptr[3]) << 24));
+  }
+}
+
+inline uint64_t DecodeFixed64(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint64_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    uint64_t lo = DecodeFixed32(ptr);
+    uint64_t hi = DecodeFixed32(ptr + 4);
+    return (hi << 32) | lo;
+  }
+}
+
+// Internal routine for use by fallback path of GetVarint32Ptr
+extern const char* GetVarint32PtrFallback(const char* p,
+                                          const char* limit,
+                                          uint32_t* value);
+inline const char* GetVarint32Ptr(const char* p,
+                                  const char* limit,
+                                  uint32_t* value) {
+  if (p < limit) {
+    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
+    if ((result & 128) == 0) {
+      *value = result;
+      return p + 1;
+    }
+  }
+  return GetVarint32PtrFallback(p, limit, value);
+}
+
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_CODING_H_
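Because the fixed-width format is defined as least-significant byte first, bytes produced by PutFixed32 decode identically on little- and big-endian hosts; DecodeFixed32 is a raw memcpy on little-endian machines and reassembles the bytes elsewhere. A small illustration (the value is arbitrary):

    std::string s;
    PutFixed32(&s, 0x04030201u);
    // s now holds the bytes 0x01 0x02 0x03 0x04 on every platform.
    assert(s.size() == 4 && s[0] == 0x01);
    assert(DecodeFixed32(s.data()) == 0x04030201u);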
diff --git a/util/coding_test.cc b/util/coding_test.cc
new file mode 100644
index 0000000..a8dba04
--- /dev/null
+++ b/util/coding_test.cc
@@ -0,0 +1,173 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class Coding { };
+
+TEST(Coding, Fixed32) {
+  std::string s;
+  for (uint32_t v = 0; v < 100000; v++) {
+    PutFixed32(&s, v);
+  }
+
+  const char* p = s.data();
+  for (uint32_t v = 0; v < 100000; v++) {
+    uint32_t actual = DecodeFixed32(p);
+    ASSERT_EQ(v, actual);
+    p += sizeof(uint32_t);
+  }
+}
+
+TEST(Coding, Fixed64) {
+  std::string s;
+  for (int power = 0; power <= 63; power++) {
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    PutFixed64(&s, v - 1);
+    PutFixed64(&s, v + 0);
+    PutFixed64(&s, v + 1);
+  }
+
+  const char* p = s.data();
+  for (int power = 0; power <= 63; power++) {
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    uint64_t actual;
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v-1, actual);
+    p += sizeof(uint64_t);
+
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v+0, actual);
+    p += sizeof(uint64_t);
+
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v+1, actual);
+    p += sizeof(uint64_t);
+  }
+}
+
+TEST(Coding, Varint32) {
+  std::string s;
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t v = (i / 32) << (i % 32);
+    PutVarint32(&s, v);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t expected = (i / 32) << (i % 32);
+    uint32_t actual;
+    const char* start = p;
+    p = GetVarint32Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != NULL);
+    ASSERT_EQ(expected, actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, s.data() + s.size());
+}
+
+TEST(Coding, Varint64) {
+  // Construct the list of values to check
+  std::vector<uint64_t> values;
+  // Some special values
+  values.push_back(0);
+  values.push_back(100);
+  values.push_back(~static_cast<uint64_t>(0));
+  values.push_back(~static_cast<uint64_t>(0) - 1);
+  for (uint32_t k = 0; k < 64; k++) {
+    // Test values near powers of two
+    const uint64_t power = 1ull << k;
+    values.push_back(power);
+    values.push_back(power-1);
+    values.push_back(power+1);
+  }
+
+  std::string s;
+  for (size_t i = 0; i < values.size(); i++) {
+    PutVarint64(&s, values[i]);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (size_t i = 0; i < values.size(); i++) {
+    ASSERT_TRUE(p < limit);
+    uint64_t actual;
+    const char* start = p;
+    p = GetVarint64Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != NULL);
+    ASSERT_EQ(values[i], actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, limit);
+}
+
+TEST(Coding, Varint32Overflow) {
+  uint32_t result;
+  std::string input("\x81\x82\x83\x84\x85\x11");
+  ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result)
+              == NULL);
+}
+
+TEST(Coding, Varint32Truncation) {
+  uint32_t large_value = (1u << 31) + 100;
+  std::string s;
+  PutVarint32(&s, large_value);
+  uint32_t result;
+  for (size_t len = 0; len < s.size() - 1; len++) {
+    ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL);
+  }
+  ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL);
+  ASSERT_EQ(large_value, result);
+}
+
+TEST(Coding, Varint64Overflow) {
+  uint64_t result;
+  std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
+  ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result)
+              == NULL);
+}
+
+TEST(Coding, Varint64Truncation) {
+  uint64_t large_value = (1ull << 63) + 100ull;
+  std::string s;
+  PutVarint64(&s, large_value);
+  uint64_t result;
+  for (size_t len = 0; len < s.size() - 1; len++) {
+    ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL);
+  }
+  ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL);
+  ASSERT_EQ(large_value, result);
+}
+
+TEST(Coding, Strings) {
+  std::string s;
+  PutLengthPrefixedSlice(&s, Slice(""));
+  PutLengthPrefixedSlice(&s, Slice("foo"));
+  PutLengthPrefixedSlice(&s, Slice("bar"));
+  PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x')));
+
+  Slice input(s);
+  Slice v;
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("foo", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("bar", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ(std::string(200, 'x'), v.ToString());
+  ASSERT_EQ("", input.ToString());
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/util/comparator.cc b/util/comparator.cc
new file mode 100644
index 0000000..dca3b4d
--- /dev/null
+++ b/util/comparator.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include "include/comparator.h"
+#include "include/slice.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+Comparator::~Comparator() { }
+
+namespace {
+class BytewiseComparatorImpl : public Comparator {
+ public:
+  BytewiseComparatorImpl() { }
+
+  virtual const char* Name() const {
+    return "leveldb.BytewiseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return a.compare(b);
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    // Find length of common prefix
+    size_t min_length = std::min(start->size(), limit.size());
+    size_t diff_index = 0;
+    while ((diff_index < min_length) &&
+           ((*start)[diff_index] == limit[diff_index])) {
+      diff_index++;
+    }
+
+    if (diff_index >= min_length) {
+      // Do not shorten if one string is a prefix of the other
+    } else {
+      uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]);
+      if (diff_byte < static_cast<uint8_t>(0xff) &&
+          diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) {
+        (*start)[diff_index]++;
+        start->resize(diff_index + 1);
+        assert(Compare(*start, limit) < 0);
+      }
+    }
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {
+    // Find first character that can be incremented
+    size_t n = key->size();
+    for (size_t i = 0; i < n; i++) {
+      const uint8_t byte = (*key)[i];
+      if (byte != static_cast<uint8_t>(0xff)) {
+        (*key)[i] = byte + 1;
+        key->resize(i+1);
+        return;
+      }
+    }
+    // *key is a run of 0xffs.  Leave it alone.
+  }
+};
+}
+static const BytewiseComparatorImpl bytewise;
+
+const Comparator* BytewiseComparator() {
+  return &bytewise;
+}
+
+}
diff --git a/util/crc32c.cc b/util/crc32c.cc
new file mode 100644
index 0000000..28c2401
--- /dev/null
+++ b/util/crc32c.cc
@@ -0,0 +1,332 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A portable implementation of crc32c, optimized to handle
+// four bytes at a time.
+ +#include "util/crc32c.h" + +#include +#include "util/coding.h" + +namespace leveldb { +namespace crc32c { + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 
0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 
0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 
0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + return DecodeFixed32(reinterpret_cast(p)); +} + +uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + const uint8_t *p = reinterpret_cast(buf); + const uint8_t *e = p + size; + uint32_t l = crc ^ 0xffffffffu; + +#define STEP1 do { \ + int c = (l & 0xff) ^ *p++; \ + l = table0_[c] ^ (l >> 8); \ +} while (0) +#define STEP4 do { \ + uint32_t c = l ^ LE_LOAD32(p); \ + p += 4; \ + l = table3_[c & 0xff] ^ \ + table2_[(c >> 8) & 0xff] ^ \ + table1_[(c >> 16) & 0xff] ^ \ + table0_[c >> 24]; \ +} while (0) + + // Point x at first 4-byte aligned byte in string. This might be + // just past the end of the string. 
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t* x = reinterpret_cast<const uint8_t*>(((pval + 3) >> 2) << 2);
+  if (x <= e) {
+    // Process bytes until finished or p is 4-byte aligned
+    while (p != x) {
+      STEP1;
+    }
+  }
+  // Process bytes 16 at a time
+  while ((e-p) >= 16) {
+    STEP4; STEP4; STEP4; STEP4;
+  }
+  // Process bytes 4 at a time
+  while ((e-p) >= 4) {
+    STEP4;
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP4
+#undef STEP1
+  return l ^ 0xffffffffu;
+}
+
+}
+}
diff --git a/util/crc32c.h b/util/crc32c.h
new file mode 100644
index 0000000..938d8ff
--- /dev/null
+++ b/util/crc32c.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_
+#define STORAGE_LEVELDB_UTIL_CRC32C_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace leveldb {
+namespace crc32c {
+
+// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
+// crc32c of some string A.  Extend() is often used to maintain the
+// crc32c of a stream of data.
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
+
+// Return the crc32c of data[0,n-1]
+inline uint32_t Value(const char* data, size_t n) {
+  return Extend(0, data, n);
+}
+
+static const uint32_t kMaskDelta = 0xa282ead8ul;
+
+// Return a masked representation of crc.
+//
+// Motivation: it is problematic to compute the CRC of a string that
+// contains embedded CRCs.  Therefore we recommend that CRCs stored
+// somewhere (e.g., in files) should be masked before being stored.
+inline uint32_t Mask(uint32_t crc) {
+  // Rotate right by 15 bits and add a constant.
+  return ((crc >> 15) | (crc << 17)) + kMaskDelta;
+}
+
+// Return the crc whose masked representation is masked_crc.
+inline uint32_t Unmask(uint32_t masked_crc) {
+  uint32_t rot = masked_crc - kMaskDelta;
+  return ((rot >> 17) | (rot << 15));
+}
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_CRC32C_H_
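As the comments in the header note, storing a raw CRC alongside data that itself contains CRCs is problematic, so stored values are rotated and offset first. A round-trip sketch of Mask/Unmask (the input string is arbitrary):

    uint32_t crc = Value("foo", 3);
    uint32_t stored = Mask(crc);     // the form that should be written to files
    assert(stored != crc);
    assert(Unmask(stored) == crc);   // the 15- and 17-bit rotations sum to 32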
diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc
new file mode 100644
index 0000000..a7fc758
--- /dev/null
+++ b/util/crc32c_test.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/crc32c.h"
+#include "include/env.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace crc32c {
+
+class CRC { };
+
+TEST(CRC, StandardResults) {
+  // From rfc3720 section B.4.
+  char buf[32];
+
+  memset(buf, 0, sizeof(buf));
+  ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf)));
+
+  memset(buf, 0xff, sizeof(buf));
+  ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = i;
+  }
+  ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = 31 - i;
+  }
+  ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf)));
+
+  unsigned char data[48] = {
+    0x01, 0xc0, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x04, 0x00,
+    0x00, 0x00, 0x00, 0x14,
+    0x00, 0x00, 0x00, 0x18,
+    0x28, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+  };
+  ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
+}
+
+TEST(CRC, Values) {
+  ASSERT_NE(Value("a", 1), Value("foo", 3));
+}
+
+TEST(CRC, Extend) {
+  ASSERT_EQ(Value("hello world", 11),
+            Extend(Value("hello ", 6), "world", 5));
+}
+
+TEST(CRC, Mask) {
+  uint32_t crc = Value("foo", 3);
+  ASSERT_NE(crc, Mask(crc));
+  ASSERT_NE(crc, Mask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
+}
+
+TEST(CRC, Benchmark) {
+  std::string data(1048576 * 100, 'x');
+  double start = Env::Default()->NowMicros() * 1e-6;
+  static const int kIters = 10;
+  uint32_t crc = 0;
+  for (int i = 0; i < kIters; i++) {
+    crc |= Value(data.data(), data.size());
+  }
+  double finish = Env::Default()->NowMicros() * 1e-6;
+  double mb = (static_cast<double>(data.size()) * kIters) / 1048576.0;
+  fprintf(stderr, "CRC %0.0f MB: %.3f secs; %.1f MB/s, crc=0x%08x\n",
+          mb, (finish - start), mb / (finish - start), crc);
+}
+
+}
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/util/env.cc b/util/env.cc
new file mode 100644
index 0000000..3c2ca89
--- /dev/null
+++ b/util/env.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/env.h"
+
+namespace leveldb {
+
+Env::~Env() {
+}
+
+SequentialFile::~SequentialFile() {
+}
+
+RandomAccessFile::~RandomAccessFile() {
+}
+
+WritableFile::~WritableFile() {
+}
+
+FileLock::~FileLock() {
+}
+
+void Log(Env* env, WritableFile* info_log, const char* format, ...)
{ + va_list ap; + va_start(ap, format); + env->Logv(info_log, format, ap); + va_end(ap); +} + +Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname) { + WritableFile* file; + Status s = env->NewWritableFile(fname, &file); + if (!s.ok()) { + return s; + } + s = file->Append(data); + if (s.ok()) { + s = file->Close(); + } + delete file; // Will auto-close if we did not close above + if (!s.ok()) { + env->DeleteFile(fname); + } + return s; +} + +Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { + data->clear(); + SequentialFile* file; + Status s = env->NewSequentialFile(fname, &file); + if (!s.ok()) { + return s; + } + static const int kBufferSize = 8192; + char* space = new char[kBufferSize]; + while (true) { + Slice fragment; + s = file->Read(kBufferSize, &fragment, space); + if (!s.ok()) { + break; + } + data->append(fragment.data(), fragment.size()); + if (fragment.empty()) { + break; + } + } + delete[] space; + delete file; + return s; +} + +EnvWrapper::~EnvWrapper() { +} + +} diff --git a/util/env_chromium.cc b/util/env_chromium.cc new file mode 100644 index 0000000..e39ac71 --- /dev/null +++ b/util/env_chromium.cc @@ -0,0 +1,608 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include "base/at_exit.h" +#include "base/file_path.h" +#include "base/file_util.h" +#include "base/lazy_instance.h" +#include "base/message_loop.h" +#include "base/platform_file.h" +#include "base/process_util.h" +#include "base/ref_counted.h" +#include "base/synchronization/lock.h" +#include "base/sys_info.h" +#include "base/task.h" +#include "base/threading/platform_thread.h" +#include "base/threading/thread.h" +#include "base/utf_string_conversions.h" +#include "include/env.h" +#include "include/slice.h" +#include "port/port.h" +#include "util/logging.h" + +#if defined(OS_WIN) +#include +#include "base/win/win_util.h" +#endif + +#if defined(OS_MACOSX) || defined(OS_WIN) +// The following are glibc-specific +extern "C" { +size_t fread_unlocked(void *ptr, size_t size, size_t n, FILE *file) { + return fread(ptr, size, n, file); +} + +size_t fwrite_unlocked(const void *ptr, size_t size, size_t n, FILE *file) { + return fwrite(ptr, size, n, file); +} + +int fflush_unlocked(FILE *file) { + return fflush(file); +} + +int fdatasync(int fildes) { +#if defined(OS_WIN) + return _commit(fildes); +#else + return fsync(fildes); +#endif +} +} +#endif + +namespace leveldb { + +namespace { + +class Thread; + +::FilePath CreateFilePath(const std::string& file_path) { +#if defined(OS_WIN) + return FilePath(UTF8ToUTF16(file_path)); +#else + return FilePath(file_path); +#endif +} + +std::string FilePathToString(const ::FilePath& file_path) { +#if defined(OS_WIN) + return UTF16ToUTF8(file_path.value()); +#else + return file_path.value(); +#endif +} + +// TODO(jorlow): This should be moved into Chromium's base. 
+const char* PlatformFileErrorString(const ::base::PlatformFileError& error) { + switch (error) { + case ::base::PLATFORM_FILE_ERROR_FAILED: + return "Opening file failed."; + case ::base::PLATFORM_FILE_ERROR_IN_USE: + return "File currently in use."; + case ::base::PLATFORM_FILE_ERROR_EXISTS: + return "File already exists."; + case ::base::PLATFORM_FILE_ERROR_NOT_FOUND: + return "File not found."; + case ::base::PLATFORM_FILE_ERROR_ACCESS_DENIED: + return "Access denied."; + case ::base::PLATFORM_FILE_ERROR_TOO_MANY_OPENED: + return "Too many files open."; + case ::base::PLATFORM_FILE_ERROR_NO_MEMORY: + return "Out of memory."; + case ::base::PLATFORM_FILE_ERROR_NO_SPACE: + return "No space left on drive."; + case ::base::PLATFORM_FILE_ERROR_NOT_A_DIRECTORY: + return "Not a directory."; + case ::base::PLATFORM_FILE_ERROR_INVALID_OPERATION: + return "Invalid operation."; + case ::base::PLATFORM_FILE_ERROR_SECURITY: + return "Security error."; + case ::base::PLATFORM_FILE_ERROR_ABORT: + return "File operation aborted."; + case ::base::PLATFORM_FILE_ERROR_NOT_A_FILE: + return "The supplied path was not a file."; + case ::base::PLATFORM_FILE_ERROR_NOT_EMPTY: + return "The file was not empty."; + } + NOTIMPLEMENTED(); + return "Unknown error."; +} + +class ChromiumSequentialFile: public SequentialFile { + private: + std::string filename_; + FILE* file_; + + public: + ChromiumSequentialFile(const std::string& fname, FILE* f) + : filename_(fname), file_(f) { } + virtual ~ChromiumSequentialFile() { fclose(file_); } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = fread_unlocked(scratch, 1, n, file_); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + } else { + // A partial read with an error: return a non-ok status + s = Status::IOError(filename_, strerror(errno)); + } + } + return s; + } +}; + +class ChromiumRandomAccessFile: public RandomAccessFile { + private: + std::string filename_; + uint64_t size_; + ::base::PlatformFile file_; + + public: + ChromiumRandomAccessFile(const std::string& fname, uint64_t size, + ::base::PlatformFile file) + : filename_(fname), size_(size), file_(file) { } + virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); } + + virtual uint64_t Size() const { return size_; } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + int r = ::base::ReadPlatformFile(file_, offset, scratch, n); + *result = Slice(scratch, (r < 0) ? 
0 : r);
+    if (r < 0) {
+      // An error: return a non-ok status
+      s = Status::IOError(filename_, "Could not perform read");
+    }
+    return s;
+  }
+};
+
+class ChromiumWritableFile : public WritableFile {
+ private:
+  std::string filename_;
+  FILE* file_;
+
+ public:
+  ChromiumWritableFile(const std::string& fname, FILE* f)
+      : filename_(fname), file_(f) { }
+
+  ~ChromiumWritableFile() {
+    if (file_ != NULL) {
+      // Ignoring any potential errors
+      fclose(file_);
+    }
+  }
+
+  virtual Status Append(const Slice& data) {
+    size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_);
+    Status result;
+    if (r != data.size()) {
+      result = Status::IOError(filename_, strerror(errno));
+    }
+    return result;
+  }
+
+  virtual Status Close() {
+    Status result;
+    if (fclose(file_) != 0) {
+      result = Status::IOError(filename_, strerror(errno));
+    }
+    file_ = NULL;
+    return result;
+  }
+
+  virtual Status Flush() {
+    Status result;
+    if (fflush_unlocked(file_) != 0) {
+      result = Status::IOError(filename_, strerror(errno));
+    }
+    return result;
+  }
+
+  virtual Status Sync() {
+    Status result;
+    if ((fflush_unlocked(file_) != 0) ||
+        (fdatasync(fileno(file_)) != 0)) {
+      result = Status::IOError(filename_, strerror(errno));
+    }
+    return result;
+  }
+};
+
+class ChromiumFileLock : public FileLock {
+ public:
+  ::base::PlatformFile file_;
+};
+
+class ChromiumEnv : public Env {
+ public:
+  ChromiumEnv();
+  virtual ~ChromiumEnv() {
+    fprintf(stderr, "Destroying Env::Default()\n");
+    exit(1);
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   SequentialFile** result) {
+    FILE* f = fopen(fname.c_str(), "rb");
+    if (f == NULL) {
+      *result = NULL;
+      return Status::IOError(fname, strerror(errno));
+    } else {
+      *result = new ChromiumSequentialFile(fname, f);
+      return Status::OK();
+    }
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     RandomAccessFile** result) {
+    int flags = ::base::PLATFORM_FILE_READ | ::base::PLATFORM_FILE_OPEN;
+    bool created;
+    ::base::PlatformFileError error_code;
+    ::base::PlatformFile file = ::base::CreatePlatformFile(
+        CreateFilePath(fname), flags, &created, &error_code);
+    if (error_code != ::base::PLATFORM_FILE_OK) {
+      *result = NULL;
+      return Status::IOError(fname, PlatformFileErrorString(error_code));
+    }
+    ::base::PlatformFileInfo info;
+    if (!::base::GetPlatformFileInfo(file, &info)) {
+      *result = NULL;
+      ::base::ClosePlatformFile(file);
+      return Status::IOError(fname, PlatformFileErrorString(error_code));
+    }
+    *result = new ChromiumRandomAccessFile(fname, info.size, file);
+    return Status::OK();
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result) {
+    *result = NULL;
+    FILE* f = fopen(fname.c_str(), "wb");
+    if (f == NULL) {
+      return Status::IOError(fname, strerror(errno));
+    } else {
+      *result = new ChromiumWritableFile(fname, f);
+      return Status::OK();
+    }
+  }
+
+  virtual bool FileExists(const std::string& fname) {
+    return ::file_util::PathExists(CreateFilePath(fname));
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    result->clear();
+    ::file_util::FileEnumerator iter(
+        CreateFilePath(dir), false, ::file_util::FileEnumerator::FILES);
+    ::FilePath current = iter.Next();
+    while (!current.empty()) {
+      result->push_back(FilePathToString(current.BaseName()));
+      current = iter.Next();
+    }
+    // TODO(jorlow): Unfortunately, the FileEnumerator swallows errors, so
+    //               we'll always return OK.  Maybe manually check for error
+    //               conditions like the file not existing?
+    return Status::OK();
+  }
+
+  virtual Status DeleteFile(const std::string& fname) {
+    Status result;
+    // TODO(jorlow): Should we assert this is a file?
+    if (!::file_util::Delete(CreateFilePath(fname), false)) {
+      result = Status::IOError(fname, "Could not delete file.");
+    }
+    return result;
+  }
+
+  virtual Status CreateDir(const std::string& name) {
+    Status result;
+    if (!::file_util::CreateDirectory(CreateFilePath(name))) {
+      result = Status::IOError(name, "Could not create directory.");
+    }
+    return result;
+  }
+
+  virtual Status DeleteDir(const std::string& name) {
+    Status result;
+    // TODO(jorlow): Should we assert this is a directory?
+    if (!::file_util::Delete(CreateFilePath(name), false)) {
+      result = Status::IOError(name, "Could not delete directory.");
+    }
+    return result;
+  }
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+    Status s;
+    int64 signed_size;
+    if (!::file_util::GetFileSize(CreateFilePath(fname), &signed_size)) {
+      *size = 0;
+      s = Status::IOError(fname, "Could not determine file size.");
+    } else {
+      *size = static_cast<uint64_t>(signed_size);
+    }
+    return s;
+  }
+
+  virtual Status RenameFile(const std::string& src, const std::string& dst) {
+    Status result;
+    if (!::file_util::ReplaceFile(CreateFilePath(src), CreateFilePath(dst))) {
+      result = Status::IOError(src, "Could not rename file.");
+    }
+    return result;
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = NULL;
+    Status result;
+    int flags = ::base::PLATFORM_FILE_OPEN_ALWAYS |
+                ::base::PLATFORM_FILE_READ |
+                ::base::PLATFORM_FILE_WRITE |
+                ::base::PLATFORM_FILE_EXCLUSIVE_READ |
+                ::base::PLATFORM_FILE_EXCLUSIVE_WRITE;
+    bool created;
+    ::base::PlatformFileError error_code;
+    ::base::PlatformFile file = ::base::CreatePlatformFile(
+        CreateFilePath(fname), flags, &created, &error_code);
+    if (error_code != ::base::PLATFORM_FILE_OK) {
+      result = Status::IOError(fname, PlatformFileErrorString(error_code));
+    } else {
+      ChromiumFileLock* my_lock = new ChromiumFileLock;
+      my_lock->file_ = file;
+      *lock = my_lock;
+    }
+    return result;
+  }
+
+  virtual Status UnlockFile(FileLock* lock) {
+    ChromiumFileLock* my_lock = reinterpret_cast<ChromiumFileLock*>(lock);
+    Status result;
+    if (!::base::ClosePlatformFile(my_lock->file_)) {
+      result = Status::IOError("Could not close lock file.");
+    }
+    delete my_lock;
+    return result;
+  }
+
+  virtual void Schedule(void (*function)(void*), void* arg);
+
+  virtual void StartThread(void (*function)(void* arg), void* arg);
+
+  virtual std::string UserIdentifier() {
+#if defined(OS_WIN)
+    std::wstring user_sid;
+    bool ret = ::base::win::GetUserSidString(&user_sid);
+    DCHECK(ret);
+    return UTF16ToUTF8(user_sid);
+#else
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%d", int(geteuid()));
+    return buf;
+#endif
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {
+    if (test_directory_.empty()) {
+      if (!::file_util::CreateNewTempDirectory("leveldb-", &test_directory_)) {
+        return Status::IOError("Could not create temp directory.");
+      }
+    }
+    *path = FilePathToString(test_directory_);
+    return Status::OK();
+  }
+
+  virtual void Logv(WritableFile* info_log, const char* format, va_list ap) {
+    // TODO(jorlow): We may want to just use Chromium's built in logging.
+
+    uint64_t thread_id = 0;
+    // Copied from base/logging.cc.
+#if defined(OS_WIN)
+    thread_id = GetCurrentThreadId();
+#elif defined(OS_MACOSX)
+    thread_id = mach_thread_self();
+#elif defined(OS_LINUX)
+    thread_id = syscall(__NR_gettid);
+#elif defined(OS_FREEBSD) || defined(OS_NACL)
+    // TODO(BSD): find a better thread ID
+    pthread_t tid = pthread_self();
+    memcpy(&thread_id, &tid, min(sizeof(thread_id), sizeof(tid)));
+#endif
+
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      ::base::Time::Exploded t;
+      ::base::Time::Now().LocalExplode(&t);
+      p += snprintf(p, limit - p,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                    t.year,
+                    t.month,
+                    t.day_of_month,
+                    t.hour,
+                    t.minute,
+                    t.second,
+                    static_cast<int>(t.millisecond) * 1000,
+                    static_cast<unsigned long long>(thread_id));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;  // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      info_log->Append(Slice(base, p - base));
+      info_log->Flush();
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+
+  virtual int AppendLocalTimeToBuffer(char* buffer, size_t size) {
+    ::base::Time::Exploded t;
+    ::base::Time::Now().LocalExplode(&t);
+    return snprintf(buffer, size,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d",
+                    t.year,
+                    t.month,
+                    t.day_of_month,
+                    t.hour,
+                    t.minute,
+                    t.second,
+                    static_cast<int>(t.millisecond) * 1000);
+  }
+
+  virtual uint64_t NowMicros() {
+    return ::base::TimeTicks::HighResNow().ToInternalValue();
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    // Round up to the next millisecond.
+    ::base::PlatformThread::Sleep((micros + 999) / 1000);
+  }
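+  // Taken together, Schedule() funnels work onto one lazily-started shared
+  // background thread, while StartThread() spawns a dedicated thread per
+  // call.  A caller-side sketch, assuming a user-defined function:
+  //
+  //   static void CompactionWork(void* arg) { /* runs on the BG thread */ }
+  //   env->Schedule(&CompactionWork, state);   // returns immediately
+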
+
+ private:
+  // BGThread() is the body of the background thread
+  void BGThread();
+  static void BGThreadWrapper(void* arg) {
+    reinterpret_cast<ChromiumEnv*>(arg)->BGThread();
+  }
+
+  FilePath test_directory_;
+
+  size_t page_size_;
+  ::base::Lock mu_;
+  ::base::ConditionVariable bgsignal_;
+  bool started_bgthread_;
+
+  // Entry per Schedule() call
+  struct BGItem { void* arg; void (*function)(void*); };
+  typedef std::deque<BGItem> BGQueue;
+  BGQueue queue_;
+};
+
+ChromiumEnv::ChromiumEnv()
+    : page_size_(::base::SysInfo::VMAllocationGranularity()),
+      bgsignal_(&mu_),
+      started_bgthread_(false) {
+#if defined(OS_MACOSX)
+  ::base::EnableTerminationOnHeapCorruption();
+  ::base::EnableTerminationOnOutOfMemory();
+#endif  // OS_MACOSX
+}
+
+class Thread : public ::base::PlatformThread::Delegate {
+ public:
+  Thread(void (*function)(void* arg), void* arg)
+      : function_(function), arg_(arg) {
+    ::base::PlatformThreadHandle handle;
+    bool success = ::base::PlatformThread::Create(0, this, &handle);
+    DCHECK(success);
+  }
+  virtual ~Thread() {}
+  virtual void ThreadMain() {
+    (*function_)(arg_);
+    delete this;
+  }
+
+ private:
+  void (*function_)(void* arg);
+  void* arg_;
+};
+
+void ChromiumEnv::Schedule(void (*function)(void*), void* arg) {
+  mu_.Acquire();
+
+  // Start background thread if necessary
+  if (!started_bgthread_) {
+    started_bgthread_ = true;
+    StartThread(&ChromiumEnv::BGThreadWrapper, this);
+  }
+
+  // If the queue is currently empty, the background thread may currently be
+  // waiting.
+  if (queue_.empty()) {
+    bgsignal_.Signal();
+  }
+
+  // Add to queue
+  queue_.push_back(BGItem());
+  queue_.back().function = function;
+  queue_.back().arg = arg;
+
+  mu_.Release();
+}
+
+void ChromiumEnv::BGThread() {
+  while (true) {
+    // Wait until there is an item that is ready to run
+    mu_.Acquire();
+    while (queue_.empty()) {
+      bgsignal_.Wait();
+    }
+
+    void (*function)(void*) = queue_.front().function;
+    void* arg = queue_.front().arg;
+    queue_.pop_front();
+
+    mu_.Release();
+    (*function)(arg);
+  }
+}
+
+void ChromiumEnv::StartThread(void (*function)(void* arg), void* arg) {
+  new Thread(function, arg);  // Will self-delete.
+}
+
+// TODO(jorlow): This won't co-exist with Chrome.  Need to find a better way.
+::base::AtExitManager exit_manager;
+
+::base::LazyInstance<ChromiumEnv> default_env(::base::LINKER_INITIALIZED);
+
+}
+
+Env* Env::Default() {
+  return default_env.Pointer();
+}
+
+}
diff --git a/util/env_posix.cc b/util/env_posix.cc
new file mode 100644
index 0000000..b662f9c
--- /dev/null
+++ b/util/env_posix.cc
@@ -0,0 +1,609 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(LEVELDB_PLATFORM_ANDROID) +#include +#endif +#include "include/env.h" +#include "include/slice.h" +#include "port/port.h" +#include "util/logging.h" + +namespace leveldb { + +namespace { + +class PosixSequentialFile: public SequentialFile { + private: + std::string filename_; + FILE* file_; + + public: + PosixSequentialFile(const std::string& fname, FILE* f) + : filename_(fname), file_(f) { } + virtual ~PosixSequentialFile() { fclose(file_); } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = fread_unlocked(scratch, 1, n, file_); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + } else { + // A partial read with an error: return a non-ok status + s = Status::IOError(filename_, strerror(errno)); + } + } + return s; + } +}; + +class PosixRandomAccessFile: public RandomAccessFile { + private: + std::string filename_; + uint64_t size_; + int fd_; + + public: + PosixRandomAccessFile(const std::string& fname, uint64_t size, int fd) + : filename_(fname), size_(size), fd_(fd) { } + virtual ~PosixRandomAccessFile() { close(fd_); } + + virtual uint64_t Size() const { return size_; } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + ssize_t r = pread(fd_, scratch, n, static_cast(offset)); + *result = Slice(scratch, (r < 0) ? 0 : r); + if (r < 0) { + // An error: return a non-ok status + s = Status::IOError(filename_, strerror(errno)); + } + return s; + } +}; + +// We preallocate up to an extra megabyte and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class PosixMmapFile : public WritableFile { + private: + std::string filename_; + int fd_; + size_t page_size_; + size_t map_size_; // How much extra memory to map at a time + char* base_; // The mapped region + char* limit_; // Limit of the mapped region + char* dst_; // Where to write next (in range [base_,limit_]) + char* last_sync_; // Where have we synced up to + uint64_t file_offset_; // Offset of base_ in file + + // Have we done an munmap of unsynced data? 
+  bool pending_sync_;
+
+  // Roundup x to a multiple of y
+  static size_t Roundup(size_t x, size_t y) {
+    return ((x + y - 1) / y) * y;
+  }
+
+  size_t TruncateToPageBoundary(size_t s) {
+    s -= (s & (page_size_ - 1));
+    assert((s % page_size_) == 0);
+    return s;
+  }
+
+  void UnmapCurrentRegion() {
+    if (base_ != NULL) {
+      if (last_sync_ < limit_) {
+        // Defer syncing this data until next Sync() call, if any
+        pending_sync_ = true;
+      }
+      munmap(base_, limit_ - base_);
+      file_offset_ += limit_ - base_;
+      base_ = NULL;
+      limit_ = NULL;
+      last_sync_ = NULL;
+      dst_ = NULL;
+
+      // Increase the amount we map the next time, but capped at 1MB
+      if (map_size_ < (1<<20)) {
+        map_size_ *= 2;
+      }
+    }
+  }
+
+  bool MapNewRegion() {
+    assert(base_ == NULL);
+    if (ftruncate(fd_, file_offset_ + map_size_) < 0) {
+      return false;
+    }
+    void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
+                     fd_, file_offset_);
+    if (ptr == MAP_FAILED) {
+      return false;
+    }
+    base_ = reinterpret_cast<char*>(ptr);
+    limit_ = base_ + map_size_;
+    dst_ = base_;
+    last_sync_ = base_;
+    return true;
+  }
+
+ public:
+  PosixMmapFile(const std::string& fname, int fd, size_t page_size)
+      : filename_(fname),
+        fd_(fd),
+        page_size_(page_size),
+        map_size_(Roundup(65536, page_size)),
+        base_(NULL),
+        limit_(NULL),
+        dst_(NULL),
+        last_sync_(NULL),
+        file_offset_(0),
+        pending_sync_(false) {
+    assert((page_size & (page_size - 1)) == 0);
+  }
+
+  ~PosixMmapFile() {
+    if (fd_ >= 0) {
+      PosixMmapFile::Close();
+    }
+  }
+
+  virtual Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t left = data.size();
+    while (left > 0) {
+      assert(base_ <= dst_);
+      assert(dst_ <= limit_);
+      size_t avail = limit_ - dst_;
+      if (avail == 0) {
+        UnmapCurrentRegion();
+        MapNewRegion();
+      }
+
+      size_t n = (left <= avail) ? left : avail;
+      memcpy(dst_, src, n);
+      dst_ += n;
+      src += n;
+      left -= n;
+    }
+    return Status::OK();
+  }
+
+  virtual Status Close() {
+    Status s;
+    size_t unused = limit_ - dst_;
+    UnmapCurrentRegion();
+    if (unused > 0) {
+      // Trim the extra space at the end of the file
+      if (ftruncate(fd_, file_offset_ - unused) < 0) {
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+
+    if (close(fd_) < 0) {
+      if (s.ok()) {
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+
+    fd_ = -1;
+    base_ = NULL;
+    limit_ = NULL;
+    return s;
+  }
+
+  virtual Status Flush() {
+    return Status::OK();
+  }
+
+  virtual Status Sync() {
+    Status s;
+
+    if (pending_sync_) {
+      // Some unmapped data was not synced
+      pending_sync_ = false;
+      if (fdatasync(fd_) < 0) {
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+
+    if (dst_ > last_sync_) {
+      // Find the beginnings of the pages that contain the first and last
+      // bytes to be synced.
+      size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
+      size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
+      last_sync_ = dst_;
+      if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+
+    return s;
+  }
+};
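The growth policy in UnmapCurrentRegion()/MapNewRegion() above is easiest to see with concrete numbers. A small self-contained check, assuming 4KB pages for illustration:

#include <assert.h>
#include <stddef.h>

static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }

int main() {
  // Initial mapping: Roundup(65536, 4096) == 65536, i.e. 64KB.
  assert(Roundup(65536, 4096) == 65536);
  // A misaligned request rounds up to the next page multiple.
  assert(Roundup(65537, 4096) == 69632);
  // Each remap then doubles: 64KB -> 128KB -> 256KB -> 512KB -> 1MB,
  // after which map_size_ stays capped at 1MB.
  return 0;
}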
+
+static int LockOrUnlock(int fd, bool lock) {
+  errno = 0;
+  struct flock f;
+  memset(&f, 0, sizeof(f));
+  f.l_type = (lock ? F_WRLCK : F_UNLCK);
+  f.l_whence = SEEK_SET;
+  f.l_start = 0;
+  f.l_len = 0;        // Lock/unlock entire file
+  return fcntl(fd, F_SETLK, &f);
+}
+
+class PosixFileLock : public FileLock {
+ public:
+  int fd_;
+};
+
+class PosixEnv : public Env {
+ public:
+  PosixEnv();
+  virtual ~PosixEnv() {
+    fprintf(stderr, "Destroying Env::Default()\n");
+    exit(1);
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   SequentialFile** result) {
+    FILE* f = fopen(fname.c_str(), "r");
+    if (f == NULL) {
+      *result = NULL;
+      return Status::IOError(fname, strerror(errno));
+    } else {
+      *result = new PosixSequentialFile(fname, f);
+      return Status::OK();
+    }
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     RandomAccessFile** result) {
+    int fd = open(fname.c_str(), O_RDONLY);
+    if (fd < 0) {
+      *result = NULL;
+      return Status::IOError(fname, strerror(errno));
+    }
+    struct stat sbuf;
+    if (fstat(fd, &sbuf) != 0) {
+      *result = NULL;
+      Status s = Status::IOError(fname, strerror(errno));
+      close(fd);
+      return s;
+    }
+    *result = new PosixRandomAccessFile(fname, sbuf.st_size, fd);
+    return Status::OK();
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result) {
+    Status s;
+    const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+    if (fd < 0) {
+      *result = NULL;
+      s = Status::IOError(fname, strerror(errno));
+    } else {
+      *result = new PosixMmapFile(fname, fd, page_size_);
+    }
+    return s;
+  }
+
+  virtual bool FileExists(const std::string& fname) {
+    return access(fname.c_str(), F_OK) == 0;
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    result->clear();
+    DIR* d = opendir(dir.c_str());
+    if (d == NULL) {
+      return Status::IOError(dir, strerror(errno));
+    }
+    struct dirent* entry;
+    while ((entry = readdir(d)) != NULL) {
+      result->push_back(entry->d_name);
+    }
+    closedir(d);
+    return Status::OK();
+  }
+
+  virtual Status DeleteFile(const std::string& fname) {
+    Status result;
+    if (unlink(fname.c_str()) != 0) {
+      result = Status::IOError(fname, strerror(errno));
+    }
+    return result;
+  };
+
+  virtual Status CreateDir(const std::string& name) {
+    Status result;
+    if (mkdir(name.c_str(), 0755) != 0) {
+      result = Status::IOError(name, strerror(errno));
+    }
+    return result;
+  };
+
+  virtual Status DeleteDir(const std::string& name) {
+    Status result;
+    if (rmdir(name.c_str()) != 0) {
+      result = Status::IOError(name, strerror(errno));
+    }
+    return result;
+  };
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+    Status s;
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      *size = 0;
+      s = Status::IOError(fname, strerror(errno));
+    } else {
+      *size = sbuf.st_size;
+    }
+    return s;
+  }
+
+  virtual Status RenameFile(const std::string& src, const std::string& target) {
+    Status result;
+    if (rename(src.c_str(), target.c_str()) != 0) {
+      result = Status::IOError(src, strerror(errno));
+    }
+    return result;
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = NULL;
+    Status result;
+    int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+    if (fd < 0) {
+      result = Status::IOError(fname, strerror(errno));
+    } else if (LockOrUnlock(fd, true) == -1) {
+      result = Status::IOError("lock " + fname, strerror(errno));
+      close(fd);
+    } else {
+      PosixFileLock* my_lock = new PosixFileLock;
+      my_lock->fd_ = fd;
+      *lock = my_lock;
+    }
+    return result;
+  }
+
+  virtual Status UnlockFile(FileLock* lock) {
+    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
+    Status result;
+    if (LockOrUnlock(my_lock->fd_, false) == -1) {
+      result = Status::IOError(strerror(errno));
+    }
+    close(my_lock->fd_);
+    delete my_lock;
+    return result;
+  }
+
+  virtual void Schedule(void (*function)(void*), void* arg);
+
+  virtual void StartThread(void (*function)(void* arg), void* arg);
+
+  virtual Status GetTestDirectory(std::string* result) {
+    const char* env = getenv("TEST_TMPDIR");
+    if (env && env[0] != '\0') {
+      *result = env;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid()));
+      *result = buf;
+    }
+    // Directory may already exist
+    CreateDir(*result);
+    return Status::OK();
+  }
+
+  virtual void Logv(WritableFile* info_log, const char* format, va_list ap) {
+    pthread_t tid = pthread_self();
+    uint64_t thread_id = 0;
+    memcpy(&thread_id, &tid, min(sizeof(thread_id), sizeof(tid)));
+
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      struct timeval now_tv;
+      gettimeofday(&now_tv, NULL);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      localtime_r(&seconds, &t);
+      p += snprintf(p, limit - p,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                    t.tm_year + 1900,
+                    t.tm_mon + 1,
+                    t.tm_mday,
+                    t.tm_hour,
+                    t.tm_min,
+                    t.tm_sec,
+                    static_cast<int>(now_tv.tv_usec),
+                    static_cast<long long unsigned int>(thread_id));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;       // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      info_log->Append(Slice(base, p - base));
+      info_log->Flush();
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+
+  virtual uint64_t NowMicros() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    usleep(micros);
+  }
+
+ private:
+  void PthreadCall(const char* label, int result) {
+    if (result != 0) {
+      fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+      exit(1);
+    }
+  }
+
+  // BGThread() is the body of the background thread
+  void BGThread();
+  static void* BGThreadWrapper(void* arg) {
+    reinterpret_cast<PosixEnv*>(arg)->BGThread();
+    return NULL;
+  }
+
+  size_t page_size_;
+  pthread_mutex_t mu_;
+  pthread_cond_t bgsignal_;
+  pthread_t bgthread_;
+  bool started_bgthread_;
+
+  // Entry per Schedule() call
+  struct BGItem { void* arg; void (*function)(void*); };
+  typedef std::deque<BGItem> BGQueue;
+  BGQueue queue_;
+};
+
+PosixEnv::PosixEnv() : page_size_(getpagesize()),
+                       started_bgthread_(false) {
+  PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL));
+  PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL));
+}
+
+void PosixEnv::Schedule(void (*function)(void*), void* arg) {
+  PthreadCall("lock", pthread_mutex_lock(&mu_));
+
+  // Start background thread if necessary
+  if (!started_bgthread_) {
+    started_bgthread_ = true;
+    PthreadCall(
+        "create thread",
+        pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this));
+  }
+
+  // If the queue is currently empty, the background thread may currently be
+  // waiting.
+  if (queue_.empty()) {
+    PthreadCall("signal", pthread_cond_signal(&bgsignal_));
+  }
+
+  // Add to priority queue
+  queue_.push_back(BGItem());
+  queue_.back().function = function;
+  queue_.back().arg = arg;
+
+  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+void PosixEnv::BGThread() {
+  while (true) {
+    // Wait until there is an item that is ready to run
+    PthreadCall("lock", pthread_mutex_lock(&mu_));
+    while (queue_.empty()) {
+      PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
+    }
+
+    void (*function)(void*) = queue_.front().function;
+    void* arg = queue_.front().arg;
+    queue_.pop_front();
+
+    PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    (*function)(arg);
+  }
+}
+
+namespace {
+struct StartThreadState {
+  void (*user_function)(void*);
+  void* arg;
+};
+}
+static void* StartThreadWrapper(void* arg) {
+  StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
+  state->user_function(state->arg);
+  delete state;
+  return NULL;
+}
+
+void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
+  pthread_t t;
+  StartThreadState* state = new StartThreadState;
+  state->user_function = function;
+  state->arg = arg;
+  PthreadCall("start thread",
+              pthread_create(&t, NULL, &StartThreadWrapper, state));
+}
+
+}
+
+static pthread_once_t once = PTHREAD_ONCE_INIT;
+static Env* default_env;
+static void InitDefaultEnv() { default_env = new PosixEnv; }
+
+Env* Env::Default() {
+  pthread_once(&once, InitDefaultEnv);
+  return default_env;
+}
+
+}
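Env::Default() hands out a process-wide singleton, initialized exactly once via pthread_once; callers never delete it (note the aborting destructor above). A hedged usage sketch of the public interface, mirroring what the test below does (Demo and BackgroundWork are illustrative names, not part of the checkin):

#include "include/env.h"

static void BackgroundWork(void* arg) {
  bool* done = reinterpret_cast<bool*>(arg);
  *done = true;  // runs once on the shared background thread
}

void Demo() {
  leveldb::Env* env = leveldb::Env::Default();
  bool done = false;
  env->Schedule(&BackgroundWork, &done);   // queued FIFO with other items
  env->SleepForMicroseconds(100 * 1000);   // crude settle, as in env_test.cc
}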
diff --git a/util/env_test.cc b/util/env_test.cc
new file mode 100644
index 0000000..4d17564
--- /dev/null
+++ b/util/env_test.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/env.h"
+
+#include "port/port.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static const int kDelayMicros = 100000;
+
+class EnvPosixTest {
+ private:
+  port::Mutex mu_;
+  std::string events_;
+
+ public:
+  Env* env_;
+  EnvPosixTest() : env_(Env::Default()) { }
+};
+
+static void SetBool(void* ptr) {
+  *(reinterpret_cast<bool*>(ptr)) = true;
+}
+
+TEST(EnvPosixTest, RunImmediately) {
+  bool called = false;
+  env_->Schedule(&SetBool, &called);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(called);
+}
+
+TEST(EnvPosixTest, RunMany) {
+  int last_id = 0;
+
+  struct CB {
+    int* last_id_ptr;   // Pointer to shared slot
+    int id;             // Order# for the execution of this callback
+
+    CB(int* p, int i) : last_id_ptr(p), id(i) { }
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      ASSERT_EQ(cb->id-1, *cb->last_id_ptr);
+      *cb->last_id_ptr = cb->id;
+    }
+  };
+
+  // Schedule in different order than start time
+  CB cb1(&last_id, 1);
+  CB cb2(&last_id, 2);
+  CB cb3(&last_id, 3);
+  CB cb4(&last_id, 4);
+  env_->Schedule(&CB::Run, &cb1);
+  env_->Schedule(&CB::Run, &cb2);
+  env_->Schedule(&CB::Run, &cb3);
+  env_->Schedule(&CB::Run, &cb4);
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(4, last_id);
+}
+
+struct State {
+  port::Mutex mu;
+  int val;
+  int num_running;
+};
+
+static void ThreadBody(void* arg) {
+  State* s = reinterpret_cast<State*>(arg);
+  s->mu.Lock();
+  s->val += 1;
+  s->num_running -= 1;
+  s->mu.Unlock();
+}
+
+TEST(EnvPosixTest, StartThread) {
+  State state;
+  state.val = 0;
+  state.num_running = 3;
+  for (int i = 0; i < 3; i++) {
+    env_->StartThread(&ThreadBody, &state);
+  }
+  while (true) {
+    state.mu.Lock();
+    int num = state.num_running;
+    state.mu.Unlock();
+    if (num == 0) {
+      break;
+    }
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+  }
+  ASSERT_EQ(state.val, 3);
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/util/hash.cc b/util/hash.cc
new file mode 100644
index 0000000..d19afd1
--- /dev/null
+++ b/util/hash.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string.h>
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace leveldb {
+
+uint32_t Hash(const char* data, size_t n, uint32_t seed) {
+  // Similar to murmur hash
+  const uint32_t m = 0xc6a4a793;
+  const uint32_t r = 24;
+  const char* limit = data + n;
+  uint32_t h = seed ^ (n * m);
+
+  // Pick up four bytes at a time
+  while (data + 4 <= limit) {
+    uint32_t w = DecodeFixed32(data);
+    data += 4;
+    h += w;
+    h *= m;
+    h ^= (h >> 16);
+  }
+
+  // Pick up remaining bytes
+  switch (limit - data) {
+    case 3:
+      h += data[2] << 16;
+      // fall through
+    case 2:
+      h += data[1] << 8;
+      // fall through
+    case 1:
+      h += data[0];
+      h *= m;
+      h ^= (h >> r);
+      break;
+  }
+  return h;
+}
+
+}
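A hash like the one above is typically used to pick a cache shard or hash-table bucket for a key. A small hedged sketch (the seed constant and function name are illustrative, not from this checkin; any fixed seed works):

#include "util/hash.h"

// Map a key to one of num_buckets slots.
uint32_t BucketFor(const char* key, size_t len, uint32_t num_buckets) {
  return leveldb::Hash(key, len, 0x9a8b7c6d) % num_buckets;
}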
diff --git a/util/hash.h b/util/hash.h
new file mode 100644
index 0000000..8889d56
--- /dev/null
+++ b/util/hash.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Simple hash function used for internal data structures
+
+#ifndef STORAGE_LEVELDB_UTIL_HASH_H_
+#define STORAGE_LEVELDB_UTIL_HASH_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace leveldb {
+
+extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
+
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_HASH_H_
diff --git a/util/histogram.cc b/util/histogram.cc
new file mode 100644
index 0000000..c5178ef
--- /dev/null
+++ b/util/histogram.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <math.h>
+#include <stdio.h>
+#include "port/port.h"
+#include "util/histogram.h"
+
+namespace leveldb {
+
+const double Histogram::kBucketLimit[kNumBuckets] = {
+  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45,
+  50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450,
+  500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000,
+  3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000,
+  16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000,
+  70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000,
+  250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000,
+  900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000,
+  3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000,
+  9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000,
+  25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000,
+  70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000,
+  180000000, 200000000, 250000000, 300000000, 350000000, 400000000,
+  450000000, 500000000, 600000000, 700000000, 800000000, 900000000,
+  1000000000, 1200000000, 1400000000, 1600000000, 1800000000, 2000000000,
+  2500000000.0, 3000000000.0, 3500000000.0, 4000000000.0, 4500000000.0,
+  5000000000.0, 6000000000.0, 7000000000.0, 8000000000.0, 9000000000.0,
+  1e200,
+};
+
+void Histogram::Clear() {
+  min_ = kBucketLimit[kNumBuckets-1];
+  max_ = 0;
+  num_ = 0;
+  sum_ = 0;
+  sum_squares_ = 0;
+  for (int i = 0; i < kNumBuckets; i++) {
+    buckets_[i] = 0;
+  }
+}
+
+void Histogram::Add(double value) {
+  // Linear search is fast enough for our usage in db_bench
+  int b = 0;
+  while (b < kNumBuckets - 1 && kBucketLimit[b] <= value) {
+    b++;
+  }
+  buckets_[b] += 1.0;
+  if (min_ > value) min_ = value;
+  if (max_ < value) max_ = value;
+  num_++;
+  sum_ += value;
+  sum_squares_ += (value * value);
+}
+
+double Histogram::Median() const {
+  return Percentile(50.0);
+}
+
+double Histogram::Percentile(double p) const {
+  double threshold = num_ * (p / 100.0);
+  double sum = 0;
+  for (int b = 0; b < kNumBuckets; b++) {
+    sum += buckets_[b];
+    if (sum >= threshold) {
+      // Scale linearly within this bucket
+      double left_point = (b == 0) ? 0 : kBucketLimit[b-1];
+      double right_point = kBucketLimit[b];
+      double left_sum = sum - buckets_[b];
+      double right_sum = sum;
+      double pos = (threshold - left_sum) / (right_sum - left_sum);
+      double r = left_point + (right_point - left_point) * pos;
+      if (r < min_) r = min_;
+      if (r > max_) r = max_;
+      return r;
+    }
+  }
+  return max_;
+}
+
+double Histogram::Average() const {
+  if (num_ == 0.0) return 0;
+  return sum_ / num_;
+}
+
+double Histogram::StandardDeviation() const {
+  if (num_ == 0.0) return 0;
+  double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_);
+  return sqrt(variance);
+}
+
+std::string Histogram::ToString() const {
+  std::string r;
+  char buf[200];
+  snprintf(buf, sizeof(buf),
+           "Count: %.0f  Average: %.4f  StdDev: %.2f\n",
+           num_, Average(), StandardDeviation());
+  r.append(buf);
+  snprintf(buf, sizeof(buf),
+           "Min: %.4f  Median: %.4f  Max: %.4f\n",
+           (num_ == 0.0 ? 0.0 : min_), Median(), max_);
+  r.append(buf);
+  r.append("------------------------------------------------------\n");
+  const double mult = 100.0 / num_;
+  double sum = 0;
+  for (int b = 0; b < kNumBuckets; b++) {
+    if (buckets_[b] <= 0.0) continue;
+    sum += buckets_[b];
+    snprintf(buf, sizeof(buf),
+             "[ %7.0f, %7.0f ) %7.0f %7.3f%% %7.3f%% ",
+             ((b == 0) ? 0.0 : kBucketLimit[b-1]),  // left
+             kBucketLimit[b],                       // right
+             buckets_[b],                           // count
+             mult * buckets_[b],                    // percentage
+             mult * sum);                           // cumulative percentage
+    r.append(buf);
+
+    // Add hash marks based on percentage; 20 marks for 100%.
+    int marks = static_cast<int>(20*(buckets_[b] / num_) + 0.5);
+    r.append(marks, '#');
+    r.push_back('\n');
+  }
+  return r;
+}
+
+}
diff --git a/util/histogram.h b/util/histogram.h
new file mode 100644
index 0000000..f72f122
--- /dev/null
+++ b/util/histogram.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
+#define STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
+
+#include <string>
+
+namespace leveldb {
+
+class Histogram {
+ public:
+  Histogram() { }
+  ~Histogram() { }
+
+  void Clear();
+  void Add(double value);
+
+  std::string ToString() const;
+
+ private:
+  double min_;
+  double max_;
+  double num_;
+  double sum_;
+  double sum_squares_;
+
+  enum { kNumBuckets = 154 };
+  static const double kBucketLimit[kNumBuckets];
+  double buckets_[kNumBuckets];
+
+  double Median() const;
+  double Percentile(double p) const;
+  double Average() const;
+  double StandardDeviation() const;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
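The linear interpolation in Percentile() is easiest to follow with concrete numbers; an illustrative trace, not part of the checkin:

// After four calls to Add(10), all samples land in the bucket [10, 12)
// (kBucketLimit[9] = 10 is <= 10, kBucketLimit[10] = 12 is not).
// Percentile(50): threshold = 4 * 0.5 = 2; the scan stops at that bucket with
// left_sum = 0 and right_sum = 4, so pos = (2 - 0) / (4 - 0) = 0.5 and
// r = 10 + (12 - 10) * 0.5 = 11, which the min/max clamp then pulls down to
// max_ = 10 -- the exact answer for this degenerate distribution.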
diff --git a/util/logging.cc b/util/logging.cc
new file mode 100644
index 0000000..6b7c410
--- /dev/null
+++ b/util/logging.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/logging.h"
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "include/env.h"
+#include "include/slice.h"
+
+namespace leveldb {
+
+void AppendNumberTo(std::string* str, uint64_t num) {
+  char buf[30];
+  snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num);
+  str->append(buf);
+}
+
+void AppendEscapedStringTo(std::string* str, const Slice& value) {
+  for (int i = 0; i < value.size(); i++) {
+    char c = value[i];
+    if (c >= ' ' && c <= '~') {
+      str->push_back(c);
+    } else {
+      char buf[10];
+      snprintf(buf, sizeof(buf), "\\x%02x",
+               static_cast<unsigned int>(c) & 0xff);
+      str->append(buf);
+    }
+  }
+}
+
+std::string NumberToString(uint64_t num) {
+  std::string r;
+  AppendNumberTo(&r, num);
+  return r;
+}
+
+std::string EscapeString(const Slice& value) {
+  std::string r;
+  AppendEscapedStringTo(&r, value);
+  return r;
+}
+
+bool ConsumeChar(Slice* in, char c) {
+  if (!in->empty() && (*in)[0] == c) {
+    in->remove_prefix(1);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool ConsumeDecimalNumber(Slice* in, uint64_t* val) {
+  uint64_t v = 0;
+  int digits = 0;
+  while (!in->empty()) {
+    char c = (*in)[0];
+    if (c >= '0' && c <= '9') {
+      ++digits;
+      const int delta = (c - '0');
+      static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0);
+      if (v > kMaxUint64/10 ||
+          (v == kMaxUint64/10 && delta > kMaxUint64%10)) {
+        // Overflow
+        return false;
+      }
+      v = (v * 10) + delta;
+      in->remove_prefix(1);
+    } else {
+      break;
+    }
+  }
+  *val = v;
+  return (digits > 0);
+}
+
+}
diff --git a/util/logging.h b/util/logging.h
new file mode 100644
index 0000000..1cd0a4b
--- /dev/null
+++ b/util/logging.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Must not be included from any .h files to avoid polluting the namespace
+// with macros.
+
+#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_
+#define STORAGE_LEVELDB_UTIL_LOGGING_H_
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string>
+#include "port/port.h"
+
+namespace leveldb {
+
+class Slice;
+class WritableFile;
+
+// Append a human-readable printout of "num" to *str
+extern void AppendNumberTo(std::string* str, uint64_t num);
+
+// Append a human-readable printout of "value" to *str.
+// Escapes any non-printable characters found in "value".
+extern void AppendEscapedStringTo(std::string* str, const Slice& value);
+
+// Return a human-readable printout of "num"
+extern std::string NumberToString(uint64_t num);
+
+// Return a human-readable version of "value".
+// Escapes any non-printable characters found in "value".
+extern std::string EscapeString(const Slice& value);
+
+// If *in starts with "c", advances *in past the first character and
+// returns true. Otherwise, returns false.
+extern bool ConsumeChar(Slice* in, char c);
+
+// Parse a human-readable number from "*in" into *val. On success,
+// advances "*in" past the consumed number and sets "*val" to the
+// numeric value. Otherwise, returns false and leaves *in in an
+// unspecified state.
+extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val);
+
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_LOGGING_H_
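ConsumeChar() and ConsumeDecimalNumber() compose into tiny cursor-style parsers; the overflow guard rejects anything past 2^64-1 instead of silently wrapping. A hedged usage sketch (ParseDemo is an illustrative name):

#include <assert.h>
#include "include/slice.h"
#include "util/logging.h"

void ParseDemo() {
  leveldb::Slice in("123:456");
  uint64_t v = 0;
  assert(leveldb::ConsumeDecimalNumber(&in, &v) && v == 123);  // in -> ":456"
  assert(leveldb::ConsumeChar(&in, ':'));                      // in -> "456"
  assert(leveldb::ConsumeDecimalNumber(&in, &v) && v == 456);  // in -> ""
  // "18446744073709551616" (2^64) trips the kMaxUint64 check and returns false.
}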
diff --git a/util/mutexlock.h b/util/mutexlock.h
new file mode 100644
index 0000000..05fe279
--- /dev/null
+++ b/util/mutexlock.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
+#define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
+
+#include "port/port.h"
+
+namespace leveldb {
+
+// Helper class that locks a mutex on construction and unlocks the mutex when
+// the destructor of the MutexLock object is invoked.
+//
+// Typical usage:
+//
+//   void MyClass::MyMethod() {
+//     MutexLock l(&mu_);       // mu_ is an instance variable
+//     ... some complex code, possibly with multiple return paths ...
+//   }
+
+class MutexLock {
+ public:
+  explicit MutexLock(port::Mutex *mu) : mu_(mu) {
+    this->mu_->Lock();
+  }
+  ~MutexLock() { this->mu_->Unlock(); }
+
+ private:
+  port::Mutex *const mu_;
+  // No copying allowed
+  MutexLock(const MutexLock&);
+  void operator=(const MutexLock&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
diff --git a/util/options.cc b/util/options.cc
new file mode 100644
index 0000000..b792bb1
--- /dev/null
+++ b/util/options.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/options.h"
+
+#include "include/comparator.h"
+#include "include/env.h"
+
+namespace leveldb {
+
+Options::Options()
+    : comparator(BytewiseComparator()),
+      create_if_missing(false),
+      error_if_exists(false),
+      paranoid_checks(false),
+      env(Env::Default()),
+      info_log(NULL),
+      write_buffer_size(1<<20),
+      max_open_files(1000),
+      large_value_threshold(65536),
+      block_cache(NULL),
+      block_size(8192),
+      block_restart_interval(16),
+      compression(kLightweightCompression) {
+}
+
+}
diff --git a/util/random.h b/util/random.h
new file mode 100644
index 0000000..2d458e8
--- /dev/null
+++ b/util/random.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_
+#define STORAGE_LEVELDB_UTIL_RANDOM_H_
+
+#include <stdint.h>
+
+namespace leveldb {
+
+// A very simple random number generator. Not especially good at
+// generating truly random bits, but good enough for our needs in this
+// package.
+class Random {
+ private:
+  uint32_t seed_;
+ public:
+  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }
+  uint32_t Next() {
+    static const uint32_t M = 2147483647L;   // 2^31-1
+    static const uint64_t A = 16807;         // bits 14, 8, 7, 5, 2, 1, 0
+    // We are computing
+    //       seed_ = (seed_ * A) % M,    where M = 2^31-1
+    //
+    // seed_ must not be zero or M, or else all subsequent computed values
+    // will be zero or M respectively. For all other values, seed_ will end
+    // up cycling through every number in [1,M-1]
+    uint64_t product = seed_ * A;
+
+    // Compute (product % M) using the fact that ((x << 31) % M) == x.
+    seed_ = (product >> 31) + (product & M);
+    // The first reduction may overflow by 1 bit, so we may need to
+    // repeat. mod == M is not possible; using > allows the faster
+    // sign-bit-based test.
+    if (seed_ > M) {
+      seed_ -= M;
+    }
+    return seed_;
+  }
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(int n) { return Next() % n; }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(int n) { return (Next() % n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits. The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) {
+    return Uniform(1 << Uniform(max_log + 1));
+  }
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_RANDOM_H_
diff --git a/util/status.cc b/util/status.cc
new file mode 100644
index 0000000..2ed799d
--- /dev/null
+++ b/util/status.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "port/port.h"
+#include "include/status.h"
+
+namespace leveldb {
+
+Status::Status(Code code, const Slice& msg, const Slice& msg2) {
+  assert(code != kOk);
+  state_ = new State(make_pair(code, std::string(msg.data(), msg.size())));
+  if (!msg2.empty()) {
+    state_->second.append(": ");
+    state_->second.append(msg2.data(), msg2.size());
+  }
+}
+
+std::string Status::ToString() const {
+  if (state_ == NULL) {
+    return "OK";
+  } else {
+    char tmp[30];
+    const char* type;
+    switch (state_->first) {
+      case kOk:
+        type = "OK";
+        break;
+      case kNotFound:
+        type = "NotFound";
+        break;
+      case kCorruption:
+        type = "Corruption: ";
+        break;
+      case kNotSupported:
+        type = "Not implemented: ";
+        break;
+      case kInvalidArgument:
+        type = "Invalid argument: ";
+        break;
+      case kIOError:
+        type = "IO error: ";
+        break;
+      default:
+        snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
+                 static_cast<int>(state_->first));
+        type = tmp;
+        break;
+    }
+    std::string result(type);
+    if (!state_->second.empty()) {
+      result.append(state_->second);
+    }
+    return result;
+  }
+}
+
+}
diff --git a/util/testharness.cc b/util/testharness.cc
new file mode 100644
index 0000000..b686ac3
--- /dev/null
+++ b/util/testharness.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testharness.h"
+
+#include <string>
+#include <vector>
+
+namespace leveldb {
+namespace test {
+
+namespace {
+struct Test {
+  const char* base;
+  const char* name;
+  void (*func)();
+};
+std::vector<Test>* tests;
+}
+
+bool RegisterTest(const char* base, const char* name, void (*func)()) {
+  if (tests == NULL) {
+    tests = new std::vector<Test>;
+  }
+  Test t;
+  t.base = base;
+  t.name = name;
+  t.func = func;
+  tests->push_back(t);
+  return true;
+}
+
+int RunAllTests() {
+  int num = 0;
+  if (tests != NULL) {
+    for (int i = 0; i < tests->size(); i++) {
+      const Test& t = (*tests)[i];
+      fprintf(stderr, "==== Test %s.%s\n", t.base, t.name);
+      (*t.func)();
+      ++num;
+    }
+  }
+  fprintf(stderr, "==== PASSED %d tests\n", num);
+  return 0;
+}
+
+std::string TmpDir() {
+  std::string dir;
+  Status s = Env::Default()->GetTestDirectory(&dir);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  return dir;
+}
+
+int RandomSeed() {
+  const char* env = getenv("TEST_RANDOM_SEED");
+  int result = (env != NULL ? atoi(env) : 301);
+  if (result <= 0) {
+    result = 301;
+  }
+  return result;
+}
+
+}
+}
diff --git a/util/testharness.h b/util/testharness.h
new file mode 100644
index 0000000..93309dc
--- /dev/null
+++ b/util/testharness.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
+#define STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sstream>
+#include "include/env.h"
+#include "include/slice.h"
+#include "util/random.h"
+
+namespace leveldb {
+namespace test {
+
+// Run all tests registered by the TEST() macro.
+// Returns 0 if all tests pass.
+// Dies or returns a non-zero value if some test fails.
+extern int RunAllTests();
+
+// Return the directory to use for temporary storage.
+extern std::string TmpDir();
+
+// Return a randomization seed for this run. Typically returns the
+// same number on repeated invocations of this binary, but automated
+// runs may be able to vary the seed.
+extern int RandomSeed();
+
+// An instance of Tester is allocated to hold temporary state during
+// the execution of an assertion.
+class Tester {
+ private:
+  bool ok_;
+  const char* fname_;
+  int line_;
+  std::stringstream ss_;
+
+ public:
+  Tester(const char* f, int l)
+      : ok_(true), fname_(f), line_(l) {
+  }
+
+  ~Tester() {
+    if (!ok_) {
+      fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str());
+      exit(1);
+    }
+  }
+
+  Tester& Is(bool b, const char* msg) {
+    if (!b) {
+      ss_ << " Assertion failure " << msg;
+      ok_ = false;
+    }
+    return *this;
+  }
+
+  Tester& IsOk(const Status& s) {
+    if (!s.ok()) {
+      ss_ << " " << s.ToString();
+      ok_ = false;
+    }
+    return *this;
+  }
+
+#define BINARY_OP(name,op)                            \
+  template <class X, class Y>                         \
+  Tester& name(const X& x, const Y& y) {              \
+    if (! (x op y)) {                                 \
+      ss_ << " failed: " << x << (" " #op " ") << y;  \
+      ok_ = false;                                    \
+    }                                                 \
+    return *this;                                     \
+  }
+
+  BINARY_OP(IsEq, ==)
+  BINARY_OP(IsNe, !=)
+  BINARY_OP(IsGe, >=)
+  BINARY_OP(IsGt, >)
+  BINARY_OP(IsLe, <=)
+  BINARY_OP(IsLt, <)
+#undef BINARY_OP
+
+  // Attach the specified value to the error message if an error has occurred
+  template <class V>
+  Tester& operator<<(const V& value) {
+    if (!ok_) {
+      ss_ << " " << value;
+    }
+    return *this;
+  }
+};
+
+#define ASSERT_TRUE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is((c), #c)
+#define ASSERT_OK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsOk((s))
+#define ASSERT_EQ(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsEq((a),(b))
+#define ASSERT_NE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsNe((a),(b))
+#define ASSERT_GE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGe((a),(b))
+#define ASSERT_GT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGt((a),(b))
+#define ASSERT_LE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLe((a),(b))
+#define ASSERT_LT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLt((a),(b))
+
+#define TCONCAT(a,b) TCONCAT1(a,b)
+#define TCONCAT1(a,b) a##b
+
+#define TEST(base,name)                                                 \
+class TCONCAT(_Test_,name) : public base {                              \
+ public:                                                                \
+  void _Run();                                                          \
+  static void _RunIt() {                                                \
+    TCONCAT(_Test_,name) t;                                             \
+    t._Run();                                                           \
+  }                                                                     \
+};                                                                      \
+bool TCONCAT(_Test_ignored_,name) =                                     \
+  ::leveldb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \
+void TCONCAT(_Test_,name)::_Run()
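The TEST macro uses a classic static-registration trick: the initializer of the _Test_ignored_ bool calls RegisterTest() before main() starts, so merely linking a test file populates the registry that RunAllTests() walks. An illustrative expansion (DemoTest is not part of the checkin):

class DemoTest { };   // fixture; TEST(DemoTest, ...) subclasses it

TEST(DemoTest, Addition) {
  int sum = 2 + 2;
  ASSERT_EQ(4, sum) << "unexpected sum";  // operator<< appends detail on failure
}
// Expands to class _Test_Addition : public DemoTest { ... }; plus a global
// bool _Test_ignored_Addition whose initializer registers _RunIt with the
// harness during static initialization.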
+// Register the specified test. Typically not used directly, but
+// invoked via the macro expansion of TEST.
+extern bool RegisterTest(const char* base, const char* name, void (*func)());
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
diff --git a/util/testutil.cc b/util/testutil.cc
new file mode 100644
index 0000000..8d6cf3c
--- /dev/null
+++ b/util/testutil.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testutil.h"
+
+#include "util/random.h"
+
+namespace leveldb {
+namespace test {
+
+Slice RandomString(Random* rnd, int len, std::string* dst) {
+  dst->resize(len);
+  for (int i = 0; i < len; i++) {
+    (*dst)[i] = static_cast<char>(' ' + rnd->Uniform(95));   // ' ' .. '~'
+  }
+  return Slice(*dst);
+}
+
+std::string RandomKey(Random* rnd, int len) {
+  // Make sure to generate a wide variety of characters so we
+  // test the boundary conditions for short-key optimizations.
+  static const char kTestChars[] = {
+    '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff'
+  };
+  std::string result;
+  for (int i = 0; i < len; i++) {
+    result += kTestChars[rnd->Uniform(sizeof(kTestChars))];
+  }
+  return result;
+}
+
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst) {
+  int raw = static_cast<int>(len * compressed_fraction);
+  if (raw < 1) raw = 1;
+  std::string raw_data;
+  RandomString(rnd, raw, &raw_data);
+
+  // Duplicate the random data until we have filled "len" bytes
+  dst->clear();
+  while (dst->size() < len) {
+    dst->append(raw_data);
+  }
+  dst->resize(len);
+  return Slice(*dst);
+}
+
+}
+}
diff --git a/util/testutil.h b/util/testutil.h
new file mode 100644
index 0000000..0e8a177
--- /dev/null
+++ b/util/testutil.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_TESTUTIL_H_
+#define STORAGE_LEVELDB_UTIL_TESTUTIL_H_
+
+#include "include/env.h"
+#include "include/slice.h"
+#include "util/random.h"
+
+namespace leveldb {
+namespace test {
+
+// Store in *dst a random string of length "len" and return a Slice that
+// references the generated data.
+extern Slice RandomString(Random* rnd, int len, std::string* dst);
+
+// Return a random key with the specified length that may contain interesting
+// characters (e.g. \x00, \xff, etc.).
+extern std::string RandomKey(Random* rnd, int len);
+
+// Store in *dst a string of length "len" that will compress to
+// "N*compressed_fraction" bytes and return a Slice that references
+// the generated data.
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst);
+
+// A wrapper that allows injection of errors.
+class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + + ErrorEnv() : EnvWrapper(Env::Default()), + writable_file_error_(false), + num_writable_file_errors_(0) { } + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) { + if (writable_file_error_) { + ++num_writable_file_errors_; + *result = NULL; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result); + } +}; + +} +} + +#endif // STORAGE_LEVELDB_UTIL_TESTUTIL_H_ -- cgit v1.2.3 From 22cc5453bada623b7e525e71d9c4268dabb16613 Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Fri, 18 Mar 2011 22:47:28 +0000 Subject: Directory paths were off a bit. git-svn-id: http://leveldb.googlecode.com/svn/trunk@3 62dab493-f737-651d-591e-8d6aee1b9529 --- leveldb.gyp | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/leveldb.gyp b/leveldb.gyp index 81dd523..bd43042 100644 --- a/leveldb.gyp +++ b/leveldb.gyp @@ -11,9 +11,7 @@ 'LEVELDB_PLATFORM_CHROMIUM=1', ], 'include_dirs': [ - # MOE:begin_strip - '../..', - # MOE:end_strip_and_replace '.', + '.', ], 'conditions': [ ['OS == "win"', { @@ -35,9 +33,7 @@ 'dependencies': [ # The base libary is a lightweight abstraction layer for things like # threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/ - # MOE:begin_strip - '../../../../base/base.gyp:base', - # MOE:end_strip_and_replace '../../base/base.gyp:base', + '../../base/base.gyp:base', ], 'conditions': [ ['use_snappy', { @@ -138,16 +134,12 @@ 'target_name': 'leveldb_testutil', 'type': '<(library)', 'dependencies': [ - # MOE:begin_strip - '../../../../base/base.gyp:base', - # MOE:end_strip_and_replace '../../base/base.gyp:base', + '../../base/base.gyp:base', 'leveldb', ], 'export_dependent_settings': [ # The tests use include directories from these projects. - # MOE:begin_strip - '../../../../base/base.gyp:base', - # MOE:end_strip_and_replace '../../base/base.gyp:base', + '../../base/base.gyp:base', 'leveldb', ], 'sources': [ -- cgit v1.2.3 From 828515533f6cfe28ab1ae39d1cd8b915052f919f Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Fri, 18 Mar 2011 23:03:49 +0000 Subject: Fix typo in Makefile. git-svn-id: http://leveldb.googlecode.com/svn/trunk@4 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a60b4de..8fbcddf 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ LIBOBJECTS = \ ./db/db_impl.o \ ./db/db_iter.o \ ./db/filename.o \ - ./db/format.o \ + ./db/dbformat.o \ ./db/log_reader.o \ ./db/log_writer.o \ ./db/memtable.o \ -- cgit v1.2.3 -- cgit v1.2.3 -- cgit v1.2.3 From c50fe91cf36ed8e0281b9d3d338dfd80508d5a03 Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Mon, 21 Mar 2011 18:13:39 +0000 Subject: Oops, another file that didn't upload correctly. git-svn-id: http://leveldb.googlecode.com/svn/trunk@7 62dab493-f737-651d-591e-8d6aee1b9529 --- port/win/stdint.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 port/win/stdint.h diff --git a/port/win/stdint.h b/port/win/stdint.h new file mode 100644 index 0000000..39edd0d --- /dev/null +++ b/port/win/stdint.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +// MSVC didn't ship with this file until the 2010 version. + +#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_ +#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_ + +#if !defined(_MSC_VER) +#error This file should only be included when compiling with MSVC. +#endif + +// Define C99 equivalent types. +typedef signed char int8_t; +typedef signed short int16_t; +typedef signed int int32_t; +typedef signed long long int64_t; +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +#endif // STORAGE_LEVELDB_PORT_WIN_STDINT_H_ -- cgit v1.2.3 From 1a1b4538c3a33cadb3096e755416664af209270d Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Mon, 21 Mar 2011 19:09:55 +0000 Subject: Changes to get Snappy working git-svn-id: http://leveldb.googlecode.com/svn/trunk@8 62dab493-f737-651d-591e-8d6aee1b9529 --- leveldb.gyp | 4 ++-- port/port_chromium.cc | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/leveldb.gyp b/leveldb.gyp index bd43042..934f2d0 100644 --- a/leveldb.gyp +++ b/leveldb.gyp @@ -4,7 +4,7 @@ { 'variables': { - 'use_snappy%': 0, + 'use_snappy%': 1, }, 'target_defaults': { 'defines': [ @@ -38,7 +38,7 @@ 'conditions': [ ['use_snappy', { 'dependencies': [ - '../../../../third_party/snappy/snappy.gyp:snappy', + '../../third_party/snappy/snappy.gyp:snappy', ], }], ], diff --git a/port/port_chromium.cc b/port/port_chromium.cc index c022ec4..e25c1b7 100644 --- a/port/port_chromium.cc +++ b/port/port_chromium.cc @@ -8,7 +8,7 @@ #if defined(USE_SNAPPY) # include "third_party/snappy/src/snappy.h" -# include "third_party/snappy/src/snappy-stubs.h" +# include "snappy-stubs-public.h" #endif namespace leveldb { @@ -55,8 +55,7 @@ void Lightweight_Compress(const char* input, size_t input_length, #if defined(USE_SNAPPY) output->resize(snappy::MaxCompressedLength(input_length)); size_t outlen; - snappy::RawCompress(snappy::StringPiece(input, input_length), - &(*output)[0], &outlen); + snappy::RawCompress(input, input_length, &(*output)[0], &outlen); output->resize(outlen); #else output->assign(input, input_length); @@ -66,13 +65,12 @@ void Lightweight_Compress(const char* input, size_t input_length, bool Lightweight_Uncompress(const char* input_data, size_t input_length, std::string* output) { #if defined(USE_SNAPPY) - snappy::StringPiece input(input_data, input_length); size_t ulength; - if (!snappy::GetUncompressedLength(input, &ulength)) { + if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) { return false; } output->resize(ulength); - return snappy::RawUncompress(input, &(*output)[0]); + return snappy::RawUncompress(input_data, input_length, &(*output)[0]); #else output->assign(input_data, input_length); return true; -- cgit v1.2.3 From 84744ee8e3e568ca5b24eabf31be706d69d80c4d Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Mon, 21 Mar 2011 19:40:57 +0000 Subject: Sync in bug fixes git-svn-id: http://leveldb.googlecode.com/svn/trunk@9 62dab493-f737-651d-591e-8d6aee1b9529 --- db/corruption_test.cc | 33 +++++++++++++++------------- db/db_bench.cc | 60 +++++++++++++++++++++++++++++++++++---------------- db/log_reader.cc | 14 ++++++++---- db/log_test.cc | 2 +- table/merger.cc | 56 ++++++++++++++++++++++++++++++++++++++++++++++- table/table_test.cc | 28 ++++++++++++++++++++++++ util/env_posix.cc | 2 +- 7 files changed, 155 insertions(+), 40 deletions(-) diff --git a/db/corruption_test.cc b/db/corruption_test.cc index a59ab0e..1f4f26c 100644 --- a/db/corruption_test.cc +++ 
b/db/corruption_test.cc @@ -13,6 +13,7 @@ #include "include/write_batch.h" #include "db/db_impl.h" #include "db/filename.h" +#include "db/log_format.h" #include "db/version_set.h" #include "util/logging.h" #include "util/testharness.h" @@ -128,17 +129,17 @@ class CorruptionTest { std::string fname = candidates[rnd_.Uniform(candidates.size())]; struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - const char* msg = strerror(errno); - ASSERT_TRUE(false) << fname << ": " << msg; - } - - if (offset < 0) { - // Relative to end of file; make it absolute - if (-offset > sbuf.st_size) { - offset = 0; - } else { - offset = sbuf.st_size + offset; + if (stat(fname.c_str(), &sbuf) != 0) { + const char* msg = strerror(errno); + ASSERT_TRUE(false) << fname << ": " << msg; + } + + if (offset < 0) { + // Relative to end of file; make it absolute + if (-offset > sbuf.st_size) { + offset = 0; + } else { + offset = sbuf.st_size + offset; } } if (offset > sbuf.st_size) { @@ -183,12 +184,14 @@ class CorruptionTest { }; TEST(CorruptionTest, Recovery) { - Build(10); - Check(10, 10); + Build(100); + Check(100, 100); Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record - Corrupt(kLogFile, 2*kValueSize, 1); // Somewhere in second log record? + Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block Reopen(); - Check(8, 8); + + // The 64 records in the first two log blocks are completely lost. + Check(36, 36); } TEST(CorruptionTest, RecoverWriteError) { diff --git a/db/db_bench.cc b/db/db_bench.cc index 4ccdd5a..db8deea 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -35,9 +35,11 @@ static const char* FLAGS_benchmarks = "writerandom," "sync,tenth,tenth,writerandom,nosync,normal," "readseq," + "readreverse," "readrandom," "compact," "readseq," + "readreverse," "readrandom," "writebig"; @@ -167,7 +169,7 @@ class Benchmark { message_.append(rate); } - fprintf(stdout, "%-12s : %10.3f micros/op;%s%s\n", + fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", name.ToString().c_str(), (finish - start_) * 1e6 / done_, (message_.empty() ? "" : " "), @@ -179,7 +181,11 @@ class Benchmark { } public: - enum Order { SEQUENTIAL, RANDOM }; + enum Order { + SEQUENTIAL, + REVERSE, // Currently only supported for reads + RANDOM + }; Benchmark() : cache_(NewLRUCache(200<<20)), db_(NULL), @@ -239,6 +245,8 @@ class Benchmark { Write(RANDOM, num_ / 1000, 100 * 1000); } else if (name == Slice("readseq")) { Read(SEQUENTIAL); + } else if (name == Slice("readreverse")) { + Read(REVERSE); } else if (name == Slice("readrandom")) { Read(RANDOM); } else if (name == Slice("compact")) { @@ -284,23 +292,39 @@ class Benchmark { void Read(Order order) { ReadOptions options; - if (order == SEQUENTIAL) { - Iterator* iter = db_->NewIterator(options); - int i = 0; - for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; + switch (order) { + case SEQUENTIAL: { + Iterator* iter = db_->NewIterator(options); + int i = 0; + for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + break; } - delete iter; - } else { - std::string value; - for (int i = 0; i < num_; i++) { - char key[100]; - const int k = (order == SEQUENTIAL) ? 
i : (rand_.Next() % FLAGS_num); - snprintf(key, sizeof(key), "%012d", k); - db_->Get(options, key, &value); - FinishedSingleOp(); + case REVERSE: { + Iterator* iter = db_->NewIterator(options); + int i = 0; + for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + break; + } + case RANDOM: { + std::string value; + for (int i = 0; i < num_; i++) { + char key[100]; + const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num); + snprintf(key, sizeof(key), "%012d", k); + db_->Get(options, key, &value); + FinishedSingleOp(); + } + break; } } } diff --git a/db/log_reader.cc b/db/log_reader.cc index 243bd2c..39a6d2b 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -148,16 +148,22 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { // Check crc if (checksum_) { if (type == kZeroType && length == 0) { - // Skip zero length record - buffer_.remove_prefix(kHeaderSize + length); + // Skip zero length record without reporting any drops since + // such records are produced by the mmap based writing code in + // env_posix.cc that preallocates file regions. + buffer_.clear(); return kBadRecord; } uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); if (actual_crc != expected_crc) { - ReportDrop(length, "checksum mismatch"); - buffer_.remove_prefix(kHeaderSize + length); + // Drop the rest of the buffer since "length" itself may have + // been corrupted and if we trust it, we could find some + // fragment of a real log record that just happens to look + // like a valid log record. + ReportDrop(buffer_.size(), "checksum mismatch"); + buffer_.clear(); return kBadRecord; } } diff --git a/db/log_test.cc b/db/log_test.cc index 8c1915d..5fa20aa 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -286,7 +286,7 @@ TEST(LogTest, ChecksumMismatch) { Write("foo"); IncrementByte(0, 10); ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ(10, DroppedBytes()); ASSERT_EQ("OK", MatchError("checksum mismatch")); } diff --git a/table/merger.cc b/table/merger.cc index 74c1aaa..afa8b77 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -17,7 +17,8 @@ class MergingIterator : public Iterator { : comparator_(comparator), children_(new IteratorWrapper[n]), n_(n), - current_(NULL) { + current_(NULL), + direction_(kForward) { for (int i = 0; i < n; i++) { children_[i].Set(children[i]); } @@ -36,6 +37,7 @@ class MergingIterator : public Iterator { children_[i].SeekToFirst(); } FindSmallest(); + direction_ = kForward; } virtual void SeekToLast() { @@ -43,6 +45,7 @@ class MergingIterator : public Iterator { children_[i].SeekToLast(); } FindLargest(); + direction_ = kReverse; } virtual void Seek(const Slice& target) { @@ -50,16 +53,60 @@ class MergingIterator : public Iterator { children_[i].Seek(target); } FindSmallest(); + direction_ = kForward; } virtual void Next() { assert(Valid()); + + // Ensure that all children are positioned after key(). + // If we are moving in the forward direction, it is already + // true for all of the non-current_ children since current_ is + // the smallest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. 
+ if (direction_ != kForward) { + for (int i = 0; i < n_; i++) { + IteratorWrapper* child = &children_[i]; + if (child != current_) { + child->Seek(key()); + if (child->Valid() && + comparator_->Compare(key(), child->key()) == 0) { + child->Next(); + } + } + } + direction_ = kForward; + } + current_->Next(); FindSmallest(); } virtual void Prev() { assert(Valid()); + + // Ensure that all children are positioned before key(). + // If we are moving in the reverse direction, it is already + // true for all of the non-current_ children since current_ is + // the largest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. + if (direction_ != kReverse) { + for (int i = 0; i < n_; i++) { + IteratorWrapper* child = &children_[i]; + if (child != current_) { + child->Seek(key()); + if (child->Valid()) { + // Child is at first entry >= key(). Step back one to be < key() + child->Prev(); + } else { + // Child has no entries >= key(). Position at last entry. + child->SeekToLast(); + } + } + } + direction_ = kReverse; + } + current_->Prev(); FindLargest(); } @@ -96,6 +143,13 @@ class MergingIterator : public Iterator { IteratorWrapper* children_; int n_; IteratorWrapper* current_; + + // Which direction is the iterator moving? + enum Direction { + kForward, + kReverse + }; + Direction direction_; }; void MergingIterator::FindSmallest() { diff --git a/table/table_test.cc b/table/table_test.cc index f4bd7c7..d997454 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -169,6 +169,8 @@ class Constructor { virtual const KVMap& data() { return data_; } + virtual DB* db() const { return NULL; } // Overridden in DBConstructor + private: KVMap data_; }; @@ -381,6 +383,8 @@ class DBConstructor: public Constructor { return db_->NewIterator(ReadOptions()); } + virtual DB* db() const { return db_; } + private: void NewDB() { std::string name = test::TmpDir() + "/table_testdb"; @@ -392,6 +396,7 @@ class DBConstructor: public Constructor { options.create_if_missing = true; options.error_if_exists = true; + options.write_buffer_size = 10000; // Something small to force merging status = DB::Open(options, name, &db_); ASSERT_TRUE(status.ok()) << status.ToString(); } @@ -640,6 +645,9 @@ class Harness { } } + // Returns NULL if not running against a DB + DB* db() const { return constructor_->db(); } + private: Options options_; Constructor* constructor_; @@ -704,6 +712,26 @@ TEST(Harness, Randomized) { } } +TEST(Harness, RandomizedLongDB) { + Random rnd(test::RandomSeed()); + TestArgs args = { DB_TEST, false, 16 }; + Init(args); + int num_entries = 100000; + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + + // We must have created enough data to force merging + uint64_t l0_files, l1_files; + ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level0", &l0_files)); + ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level1", &l1_files)); + ASSERT_GT(l0_files + l1_files, 0); + +} + class MemTableTest { }; TEST(MemTableTest, Simple) { diff --git a/util/env_posix.cc b/util/env_posix.cc index b662f9c..f5174d3 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -422,7 +422,7 @@ class PosixEnv : public Env { virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { pthread_t tid = pthread_self(); uint64_t thread_id = 0; - memcpy(&thread_id, &tid, min(sizeof(thread_id), sizeof(tid))); + memcpy(&thread_id, &tid, 
std::min(sizeof(thread_id), sizeof(tid)));
 
     // We try twice: the first time with a fixed-size stack allocated buffer,
     // and the second time with a much larger dynamically allocated buffer.
-- 
cgit v1.2.3

From 87c528b30a91689078728c5c7c03772c7ce0201b Mon Sep 17 00:00:00 2001
From: "jorlow@chromium.org"
Date: Mon, 21 Mar 2011 21:06:49 +0000
Subject: more upstream changes

git-svn-id: http://leveldb.googlecode.com/svn/trunk@10 62dab493-f737-651d-591e-8d6aee1b9529
---
 port/port_chromium.cc | 1 -
 util/env_chromium.cc  | 6 ++----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/port/port_chromium.cc b/port/port_chromium.cc
index e25c1b7..4026aa3 100644
--- a/port/port_chromium.cc
+++ b/port/port_chromium.cc
@@ -8,7 +8,6 @@
 
 #if defined(USE_SNAPPY)
 # include "third_party/snappy/src/snappy.h"
-# include "snappy-stubs-public.h"
 #endif
 
 namespace leveldb {
diff --git a/util/env_chromium.cc b/util/env_chromium.cc
index e39ac71..49666f6 100644
--- a/util/env_chromium.cc
+++ b/util/env_chromium.cc
@@ -594,10 +594,8 @@ void ChromiumEnv::StartThread(void (*function)(void* arg), void* arg) {
   new Thread(function, arg);  // Will self-delete.
 }
 
-// TODO(jorlow): This won't co-exist with Chrome. Need to find a better way.
-::base::AtExitManager exit_manager;
-
-::base::LazyInstance<ChromiumEnv> default_env(::base::LINKER_INITIALIZED);
+::base::LazyInstance<ChromiumEnv, ::base::LeakyLazyInstanceTraits<ChromiumEnv> >
+    default_env(::base::LINKER_INITIALIZED);
 
 }
-- 
cgit v1.2.3

From 8e30874b0cc959ac51e32e2f42387eadc5375b3c Mon Sep 17 00:00:00 2001
From: "gabor@google.com"
Date: Mon, 21 Mar 2011 23:10:11 +0000
Subject: Removing unneeded build files

git-svn-id: http://leveldb.googlecode.com/svn/trunk@11 62dab493-f737-651d-591e-8d6aee1b9529
---
 Android.mk     | 64 ----------------------------------------------------------
 Application.mk |  6 ------
 2 files changed, 70 deletions(-)
 delete mode 100644 Android.mk
 delete mode 100644 Application.mk

diff --git a/Android.mk b/Android.mk
deleted file mode 100644
index fa4a3de..0000000
--- a/Android.mk
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-# INSTRUCTIONS
-# After you've downloaded and installed the Android NDK from:
-# http://developer.android.com/sdk/ndk/index.html
-# 1. In the same directory as this file, Android.mk, type:
-#    $ ln -s leveldb ../jni
-#    (The Android NDK will only build native projects in
-#     subdirectories named "jni".)
-# 2. $ cd ..
-# 3.
Execute ndk-build: -# $ $(ANDROID_NDK_DIR)/ndk-build - -LOCAL_PATH := $(call my-dir) - -include $(CLEAR_VARS) -LOCAL_MODULE := leveldb -# Build flags: -# - LEVELDB_PLATFORM_ANDROID to use the correct port header: port_android.h -LOCAL_CFLAGS := -DLEVELDB_PLATFORM_ANDROID -std=gnu++0x -LOCAL_C_INCLUDES := $(LOCAL_PATH)/../../ -LOCAL_CPP_EXTENSION := .cc - -LOCAL_SRC_FILES := ./db/builder.cc \ -./db/db_bench.cc \ -./db/db_impl.cc \ -./db/db_iter.cc \ -./db/filename.cc \ -./db/dbformat.cc \ -./db/log_reader.cc \ -./db/log_writer.cc \ -./db/memtable.cc \ -./db/repair.cc \ -./db/table_cache.cc \ -./db/version_edit.cc \ -./db/version_set.cc \ -./db/write_batch.cc \ -./port/port_android.cc \ -./table/block.cc \ -./table/block_builder.cc \ -./table/format.cc \ -./table/iterator.cc \ -./table/merger.cc \ -./table/table.cc \ -./table/table_builder.cc \ -./table/two_level_iterator.cc \ -./util/arena.cc \ -./util/cache.cc \ -./util/coding.cc \ -./util/comparator.cc \ -./util/crc32c.cc \ -./util/env.cc \ -./util/env_posix.cc \ -./util/hash.cc \ -./util/histogram.cc \ -./util/logging.cc \ -./util/options.cc \ -./util/status.cc \ -./util/testharness.cc \ -./util/testutil.cc - -include $(BUILD_SHARED_LIBRARY) diff --git a/Application.mk b/Application.mk deleted file mode 100644 index 9360a38..0000000 --- a/Application.mk +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) 2011 The LevelDB Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. See the AUTHORS file for names of contributors. - -APP_ABI := armeabi-v7a -APP_STL := gnustl_static -- cgit v1.2.3 From b887f640bae906abfb77fdf418be63350b4c5e1f Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Tue, 22 Mar 2011 18:32:49 +0000 Subject: More changes from upstream. 
git-svn-id: http://leveldb.googlecode.com/svn/trunk@12 62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_bench.cc | 264 ++++++++++++++++++++++++++++++++++++------------------ db/db_impl.cc | 20 ++++- db/db_impl.h | 4 + db/db_test.cc | 74 ++++++++++++++- db/version_set.cc | 122 +++++++++++++++++++------ db/version_set.h | 25 ++++++ doc/impl.html | 10 ++- include/db.h | 3 + leveldb.gyp | 2 +- 9 files changed, 405 insertions(+), 119 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index db8deea..72e0699 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -17,11 +17,14 @@ // Comma-separated list of operations to run in the specified order // Actual benchmarks: -// writeseq -- write N values in sequential key order -// writerandom -- write N values in random key order -// writebig -- write N/1000 100K valuesin random order -// readseq -- read N values sequentially -// readrandom -- read N values in random order +// fillseq -- write N values in sequential key order in async mode +// fillrandom -- write N values in random key order in async mode +// overwrite -- overwrite N values in random key order in async mode +// fillsync -- write N/100 values in random key order in sync mode +// fill100K -- write N/1000 100K values in random order in async mode +// readseq -- read N values sequentially +// readreverse -- read N values in reverse order +// readrandom -- read N values in random order // Meta operations: // compact -- Compact the entire DB // heapprofile -- Dump a heap profile (if supported by this port) @@ -30,10 +33,10 @@ // tenth -- divide N by 10 (i.e., following benchmarks are smaller) // normal -- reset N back to its normal value (1000000) static const char* FLAGS_benchmarks = - "writeseq," - "writeseq," - "writerandom," - "sync,tenth,tenth,writerandom,nosync,normal," + "fillseq," + "fillrandom," + "overwrite," + "fillsync," "readseq," "readreverse," "readrandom," @@ -41,7 +44,7 @@ static const char* FLAGS_benchmarks = "readseq," "readreverse," "readrandom," - "writebig"; + "fill100K"; // Number of key/values to place in database static int FLAGS_num = 1000000; @@ -51,7 +54,7 @@ static int FLAGS_value_size = 100; // Arrange to generate values that shrink to this fraction of // their original size after compression -static double FLAGS_compression_ratio = 0.25; +static double FLAGS_compression_ratio = 0.5; // Print histogram of operation timings static bool FLAGS_histogram = false; @@ -93,6 +96,19 @@ class RandomGenerator { return Slice(data_.data() + pos_ - len, len); } }; + +static Slice TrimSpace(Slice s) { + int start = 0; + while (start < s.size() && isspace(s[start])) { + start++; + } + int limit = s.size(); + while (limit > start && isspace(s[limit-1])) { + limit--; + } + return Slice(s.data() + start, limit - start); +} + } class Benchmark { @@ -100,7 +116,6 @@ class Benchmark { Cache* cache_; DB* db_; int num_; - bool sync_; int heap_counter_; double start_; double last_op_finish_; @@ -114,6 +129,70 @@ class Benchmark { int done_; int next_report_; // When to report next + void PrintHeader() { + const int kKeySize = 16; + PrintEnvironment(); + fprintf(stdout, "Keys: %d bytes each\n", kKeySize); + fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", + FLAGS_value_size, + static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); + fprintf(stdout, "Entries: %d\n", num_); + fprintf(stdout, "RawSize: %.1f MB (estimated)\n", + (((kKeySize + FLAGS_value_size) * num_) / 1048576.0)); + fprintf(stdout, "FileSize: %.1f MB (estimated)\n", + (((kKeySize + 
FLAGS_value_size * FLAGS_compression_ratio) * num_) + / 1048576.0)); + PrintWarnings(); + fprintf(stdout, "------------------------------------------------\n"); + } + + void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" + ); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + } + + void PrintEnvironment() { + fprintf(stderr, "LevelDB: version %d.%d\n", + kMajorVersion, kMinorVersion); + +#if defined(__linux) + time_t now = time(NULL); + fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline + + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != NULL) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != NULL) { + const char* sep = strchr(line, ':'); + if (sep == NULL) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#endif + } + void Start() { start_ = Env::Default()->NowMicros() * 1e-6; bytes_ = 0; @@ -164,9 +243,10 @@ class Benchmark { snprintf(rate, sizeof(rate), "%5.1f MB/s", (bytes_ / 1048576.0) / (finish - start_)); if (!message_.empty()) { - message_.push_back(' '); + message_ = std::string(rate) + " " + message_; + } else { + message_ = rate; } - message_.append(rate); } fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", @@ -183,14 +263,16 @@ class Benchmark { public: enum Order { SEQUENTIAL, - REVERSE, // Currently only supported for reads RANDOM }; + enum DBState { + FRESH, + EXISTING + }; Benchmark() : cache_(NewLRUCache(200<<20)), db_(NULL), num_(FLAGS_num), - sync_(false), heap_counter_(0), bytes_(0), rand_(301) { @@ -210,19 +292,8 @@ class Benchmark { } void Run() { - Options options; - options.create_if_missing = true; - options.max_open_files = 10000; - options.block_cache = cache_; - options.write_buffer_size = FLAGS_write_buffer_size; - - Start(); - Status s = DB::Open(options, "/tmp/dbbench", &db_); - Stop("open"); - if (!s.ok()) { - fprintf(stderr, "open error: %s\n", s.ToString().c_str()); - exit(1); - } + PrintHeader(); + Open(); const char* benchmarks = FLAGS_benchmarks; while (benchmarks != NULL) { @@ -237,30 +308,30 @@ class Benchmark { } Start(); - if (name == Slice("writeseq")) { - Write(SEQUENTIAL, num_, FLAGS_value_size); - } else if (name == Slice("writerandom")) { - Write(RANDOM, num_, FLAGS_value_size); - } else if (name == Slice("writebig")) { - Write(RANDOM, num_ / 1000, 100 * 1000); + + WriteOptions write_options; + write_options.sync = false; + if (name == Slice("fillseq")) { + Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size); + } else if (name == Slice("fillrandom")) { + Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size); + } else if (name == Slice("overwrite")) { + Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size); + } else if (name == Slice("fillsync")) { + write_options.sync = true; + Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size); + } else if (name == Slice("fill100K")) { + Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000); } else if (name == 
Slice("readseq")) { - Read(SEQUENTIAL); + ReadSequential(); } else if (name == Slice("readreverse")) { - Read(REVERSE); + ReadReverse(); } else if (name == Slice("readrandom")) { - Read(RANDOM); + ReadRandom(); } else if (name == Slice("compact")) { Compact(); } else if (name == Slice("heapprofile")) { HeapProfile(); - } else if (name == Slice("sync")) { - sync_ = true; - } else if (name == Slice("nosync")) { - sync_ = false; - } else if (name == Slice("tenth")) { - num_ = num_ / 10; - } else if (name == Slice("normal")) { - num_ = FLAGS_num; } else { fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); } @@ -268,16 +339,44 @@ class Benchmark { } } - void Write(Order order, int num_entries, int value_size) { + private: + void Open() { + assert(db_ == NULL); + Options options; + options.create_if_missing = true; + options.max_open_files = 10000; + options.block_cache = cache_; + options.write_buffer_size = FLAGS_write_buffer_size; + Status s = DB::Open(options, "/tmp/dbbench", &db_); + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + void Write(const WriteOptions& options, Order order, DBState state, + int num_entries, int value_size) { + if (state == FRESH) { + delete db_; + db_ = NULL; + DestroyDB("/tmp/dbbench", Options()); + Open(); + Start(); // Do not count time taken to destroy/open + } + + if (num_entries != num_) { + char msg[100]; + snprintf(msg, sizeof(msg), "(%d ops)", num_entries); + message_ = msg; + } + WriteBatch batch; Status s; std::string val; - WriteOptions options; - options.sync = sync_; for (int i = 0; i < num_entries; i++) { const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num); char key[100]; - snprintf(key, sizeof(key), "%012d", k); + snprintf(key, sizeof(key), "%016d", k); batch.Clear(); batch.Put(key, gen_.Generate(value_size)); s = db_->Write(options, &batch); @@ -290,42 +389,37 @@ class Benchmark { } } - void Read(Order order) { + void ReadSequential() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadReverse() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadRandom() { ReadOptions options; - switch (order) { - case SEQUENTIAL: { - Iterator* iter = db_->NewIterator(options); - int i = 0; - for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - break; - } - case REVERSE: { - Iterator* iter = db_->NewIterator(options); - int i = 0; - for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - break; - } - case RANDOM: { - std::string value; - for (int i = 0; i < num_; i++) { - char key[100]; - const int k = (order == SEQUENTIAL) ? 
i : (rand_.Next() % FLAGS_num); - snprintf(key, sizeof(key), "%012d", k); - db_->Get(options, key, &value); - FinishedSingleOp(); - } - break; - } + std::string value; + for (int i = 0; i < num_; i++) { + char key[100]; + const int k = rand_.Next() % FLAGS_num; + snprintf(key, sizeof(key), "%016d", k); + db_->Get(options, key, &value); + FinishedSingleOp(); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index 5008af0..4d66044 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -532,8 +532,9 @@ void DBImpl::BackgroundCompaction() { } Status status; - if (c->num_input_files(0) == 1 && c->num_input_files(1) == 0) { + if (c->IsTrivialMove()) { // Move file to next level + assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); c->edit()->DeleteFile(c->level(), f->number); c->edit()->AddFile(c->level() + 1, f->number, f->file_size, @@ -718,8 +719,18 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { bool has_current_user_key = false; SequenceNumber last_sequence_for_key = kMaxSequenceNumber; for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { - // Handle key/value, add to state, etc. Slice key = input->key(); + InternalKey tmp_internal_key; + tmp_internal_key.DecodeFrom(key); + if (compact->compaction->ShouldStopBefore(tmp_internal_key) && + compact->builder != NULL) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + + // Handle key/value, add to state, etc. bool drop = false; if (!ParseInternalKey(key, &ikey)) { // Do not hide error keys @@ -855,6 +866,11 @@ Iterator* DBImpl::TEST_NewInternalIterator() { return NewInternalIterator(ReadOptions(), &ignored); } +int64 DBImpl::TEST_MaxNextLevelOverlappingBytes() { + MutexLock l(&mutex_); + return versions_->MaxNextLevelOverlappingBytes(); +} + Status DBImpl::Get(const ReadOptions& options, const Slice& key, std::string* value) { diff --git a/db/db_impl.h b/db/db_impl.h index fc3d3f2..980d512 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -55,6 +55,10 @@ class DBImpl : public DB { // The returned iterator should be deleted when no longer needed. Iterator* TEST_NewInternalIterator(); + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. 
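+  // (Used by tests: DBTest.SparseMerge asserts that this value stays at
+  // or below 20MB after compactions.)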
+ int64 TEST_MaxNextLevelOverlappingBytes(); + private: friend class DB; diff --git a/db/db_test.cc b/db/db_test.cc index 895a5e1..f8accf6 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -72,15 +72,19 @@ class DBTest { } Status Put(const std::string& k, const std::string& v) { + WriteOptions options; + options.sync = false; WriteBatch batch; batch.Put(k, v); - return db_->Write(WriteOptions(), &batch); + return db_->Write(options, &batch); } Status Delete(const std::string& k) { + WriteOptions options; + options.sync = false; WriteBatch batch; batch.Delete(k); - return db_->Write(WriteOptions(), &batch); + return db_->Write(options, &batch); } std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { @@ -176,6 +180,35 @@ class DBTest { fprintf(stderr, "Found %d live large value files\n", (int)live.size()); return live; } + + void Compact(const Slice& start, const Slice& limit) { + dbfull()->TEST_CompactMemTable(); + int max_level_with_files = 1; + for (int level = 1; level < config::kNumLevels; level++) { + uint64_t v; + char name[100]; + snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); + if (dbfull()->GetProperty(name, &v) && v > 0) { + max_level_with_files = level; + } + } + for (int level = 0; level < max_level_with_files; level++) { + dbfull()->TEST_CompactRange(level, "", "~"); + } + } + + void DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %lld\n", + static_cast( + dbfull()->TEST_MaxNextLevelOverlappingBytes())); + for (int level = 0; level < config::kNumLevels; level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } + } }; TEST(DBTest, Empty) { @@ -315,6 +348,43 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { } } +TEST(DBTest, SparseMerge) { + Options options; + options.compression = kNoCompression; + Reopen(&options); + + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. + const std::string value(1000, 'x'); + Put("A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + Put("C", "vc"); + Compact("", "z"); + + // Make sparse update + Put("A", "va2"); + Put("B100", "bvalue2"); + Put("C", "vc2"); + dbfull()->TEST_CompactMemTable(); + + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); +} + static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); if (!result) { diff --git a/db/version_set.cc b/db/version_set.cc index 2435fa2..46333f4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -20,6 +20,10 @@ namespace leveldb { +// Maximum number of overlaps in grandparent (i.e., level+2) before we +// stop building a single file in a level->level+1 compaction. 
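+// Without such a bound, a single compaction output could overlap a large
+// amount of grandparent data, making a later compaction of that output
+// very expensive.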
+static const int kMaxGrandParentFiles = 10; + static double MaxBytesForLevel(int level) { if (level == 0) { return 4 * 1048576.0; @@ -509,7 +513,7 @@ Status VersionSet::Finalize(Version* v) { double best_score = -1; Status s; - for (int level = 0; s.ok() && level < config::kNumLevels; level++) { + for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) { s = SortLevel(v, level); // Compute the ratio of current size to size limit. @@ -751,6 +755,25 @@ void VersionSet::AddLiveFiles(std::set* live) { } } +int64 VersionSet::MaxNextLevelOverlappingBytes() { + int64 result = 0; + std::vector overlaps; + for (int level = 0; level < config::kNumLevels - 1; level++) { + for (int i = 0; i < current_->files_[level].size(); i++) { + const FileMetaData* f = current_->files_[level][i]; + GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); + int64 sum = 0; + for (int j = 0; j < overlaps.size(); j++) { + sum += overlaps[j]->file_size; + } + if (sum > result) { + result = sum; + } + } + } + return result; +} + // Store in "*inputs" all files in "level" that overlap [begin,end] void VersionSet::GetOverlappingInputs( int level, @@ -797,6 +820,18 @@ void VersionSet::GetRange(const std::vector& inputs, } } +// Stores the minimal range that covers all entries in inputs1 and inputs2 +// in *smallest, *largest. +// REQUIRES: inputs is not empty +void VersionSet::GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest) { + std::vector all = inputs1; + all.insert(all.end(), inputs2.begin(), inputs2.end()); + GetRange(all, smallest, largest); +} + Iterator* VersionSet::MakeInputIterator(Compaction* c) { ReadOptions options; options.verify_checksums = options_->paranoid_checks; @@ -836,6 +871,7 @@ Compaction* VersionSet::PickCompaction() { } const int level = current_->compaction_level_; assert(level >= 0); + assert(level+1 < config::kNumLevels); Compaction* c = new Compaction(level); c->input_version_ = current_; @@ -855,31 +891,36 @@ Compaction* VersionSet::PickCompaction() { c->inputs_[0].push_back(current_->files_[level][0]); } - // Find the range we are compacting - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Files in level 0 may overlap each other, so pick up all overlapping ones if (level == 0) { + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); // Note that the next call will discard the file we placed in // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); assert(!c->inputs_[0].empty()); - GetRange(c->inputs_[0], &smallest, &largest); } + SetupOtherInputs(c); + + return c; +} + +void VersionSet::SetupOtherInputs(Compaction* c) { + const int level = c->level(); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + // See if we can grow the number of inputs in "level" without // changing the number of "level+1" files we pick up. 
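+  // An expanded set of level inputs is adopted only when it leaves the
+  // set of level+1 inputs unchanged; otherwise the original inputs are
+  // kept.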
if (!c->inputs_[1].empty()) { - // Get entire range covered by compaction - std::vector all = c->inputs_[0]; - all.insert(all.end(), c->inputs_[1].begin(), c->inputs_[1].end()); - InternalKey all_start, all_limit; - GetRange(all, &all_start, &all_limit); - std::vector expanded0; GetOverlappingInputs(level, all_start, all_limit, &expanded0); if (expanded0.size() > c->inputs_[0].size()) { @@ -899,10 +940,17 @@ Compaction* VersionSet::PickCompaction() { largest = new_limit; c->inputs_[0] = expanded0; c->inputs_[1] = expanded1; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); } } } + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < config::kNumLevels) { + GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); + } + if (false) { Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", level, @@ -916,8 +964,6 @@ Compaction* VersionSet::PickCompaction() { // key range next time. compact_pointer_[level] = largest.Encode().ToString(); c->edit_.SetCompactPointer(level, largest); - - return c; } Compaction* VersionSet::CompactRange( @@ -934,25 +980,16 @@ Compaction* VersionSet::CompactRange( c->input_version_ = current_; c->input_version_->Ref(); c->inputs_[0] = inputs; - - // Find the range we are compacting - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - - GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); - if (false) { - Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", - level, - EscapeString(smallest.Encode()).c_str(), - EscapeString(largest.Encode()).c_str()); - } + SetupOtherInputs(c); return c; } Compaction::Compaction(int level) : level_(level), max_output_file_size_(MaxFileSizeForLevel(level)), - input_version_(NULL) { + input_version_(NULL), + grandparent_index_(0), + output_start_(-1) { for (int i = 0; i < config::kNumLevels; i++) { level_ptrs_[i] = 0; } @@ -964,6 +1001,15 @@ Compaction::~Compaction() { } } +bool Compaction::IsTrivialMove() const { + // Avoid a move if there are lots of overlapping grandparent files. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + return (num_input_files(0) == 1 + && num_input_files(1) == 0 + && grandparents_.size() <= kMaxGrandParentFiles); +} + void Compaction::AddInputDeletions(VersionEdit* edit) { for (int which = 0; which < 2; which++) { for (int i = 0; i < inputs_[which].size(); i++) { @@ -993,6 +1039,28 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) { return true; } +bool Compaction::ShouldStopBefore(const InternalKey& key) { + // Scan to find earliest grandparent file that contains key. + const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { + grandparent_index_++; + } + + // First call? 
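+  // Remember the grandparent index at which the current output file
+  // started.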
+ if (output_start_ < 0) { + output_start_ = grandparent_index_; + } + + if (grandparent_index_ - output_start_ + 1 > kMaxGrandParentFiles) { + // Too many overlaps for current output; start new output + output_start_ = grandparent_index_; + return true; + } else { + return false; + } +} + void Compaction::ReleaseInputs() { if (input_version_ != NULL) { input_version_->Unref(); diff --git a/db/version_set.h b/db/version_set.h index b8eee3d..0ea763d 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -139,6 +139,10 @@ class VersionSet { const InternalKey& begin, const InternalKey& end); + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64 MaxNextLevelOverlappingBytes(); + // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. Iterator* MakeInputIterator(Compaction* c); @@ -195,6 +199,13 @@ class VersionSet { InternalKey* smallest, InternalKey* largest); + void GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest); + + void SetupOtherInputs(Compaction* c); + Env* const env_; const std::string dbname_; const Options* const options_; @@ -250,6 +261,10 @@ class Compaction { // Maximum size of files to build during this compaction. uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + // Add all inputs to this compaction as delete operations to *edit. void AddInputDeletions(VersionEdit* edit); @@ -258,6 +273,10 @@ class Compaction { // in levels greater than "level+1". bool IsBaseLevelForKey(const Slice& user_key); + // Returns true iff we should stop building the current output + // before processing "key". + bool ShouldStopBefore(const InternalKey& key); + // Release the input version for the compaction, once the compaction // is successful. void ReleaseInputs(); @@ -276,6 +295,12 @@ class Compaction { // Each compaction reads inputs from "level_" and "level_+1" std::vector inputs_[2]; // The two sets of inputs + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + int grandparent_index_; // Index in grandparent_starts_ + int output_start_; // Index in grandparent_starts_ where output started + // State for implementing IsBaseLevelForKey // level_ptrs_ holds indices into input_version_->levels_: our state diff --git a/doc/impl.html b/doc/impl.html index 2f2c809..b190d2c 100644 --- a/doc/impl.html +++ b/doc/impl.html @@ -123,8 +123,14 @@ one level-0 file in case some of these files overlap each other. A compaction merges the contents of the picked files to produce a sequence of level-(L+1) files. We switch to producing a new level-(L+1) file after the current output file has reached the target -file size (2MB). The old files are discarded and the new files are -added to the serving state. +file size (2MB). We also switch to a new output file when the key +range of the current output file has grown enough to overlap more then +ten level-(L+2) files. This last rule ensures that a later compaction +of a level-(L+1) file will not pick up too much data from level-(L+2). + +
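+<p>
+(For scale: since level-(L+2) files are themselves at most the target
+file size, this bounds the data a single output file can overlap in
+level-(L+2) at roughly 10 x 2MB = 20MB.)
+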
+<p>
+The old files are discarded and the new files are added to the serving
+state.
+
+<p>
Compactions for a particular level rotate through the key space. In diff --git a/include/db.h b/include/db.h index c4d152d..75be1ca 100644 --- a/include/db.h +++ b/include/db.h @@ -12,6 +12,9 @@ namespace leveldb { +static const int kMajorVersion = 1; +static const int kMinorVersion = 0; + struct Options; struct ReadOptions; struct WriteOptions; diff --git a/leveldb.gyp b/leveldb.gyp index 934f2d0..eb809f3 100644 --- a/leveldb.gyp +++ b/leveldb.gyp @@ -4,7 +4,7 @@ { 'variables': { - 'use_snappy%': 1, + 'use_snappy%': 0, }, 'target_defaults': { 'defines': [ -- cgit v1.2.3 From dbbc21b601732980df7a6e877e75a31e9ec1e42b Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Tue, 22 Mar 2011 19:07:54 +0000 Subject: Make GetTestDirectory threadsafe within Chromium and make it work on Windows. git-svn-id: http://leveldb.googlecode.com/svn/trunk@13 62dab493-f737-651d-591e-8d6aee1b9529 --- util/env_chromium.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/util/env_chromium.cc b/util/env_chromium.cc index 49666f6..fb700ae 100644 --- a/util/env_chromium.cc +++ b/util/env_chromium.cc @@ -60,6 +60,9 @@ namespace { class Thread; +static const ::FilePath::CharType kLevelDBTestDirectoryPrefix[] + = FILE_PATH_LITERAL("leveldb-test-"); + ::FilePath CreateFilePath(const std::string& file_path) { #if defined(OS_WIN) return FilePath(UTF8ToUTF16(file_path)); @@ -391,12 +394,16 @@ class ChromiumEnv : public Env { } virtual Status GetTestDirectory(std::string* path) { + mu_.Acquire(); if (test_directory_.empty()) { - if (!::file_util::CreateNewTempDirectory("leveldb-", &test_directory_)) { + if (!::file_util::CreateNewTempDirectory(kLevelDBTestDirectoryPrefix, + &test_directory_)) { + mu_.Release(); return Status::IOError("Could not create temp directory."); } } *path = FilePathToString(test_directory_); + mu_.Release(); return Status::OK(); } -- cgit v1.2.3 From 07f3bcfb9764be2a339cc02cf0a0d6edb151defb Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Tue, 22 Mar 2011 23:24:02 +0000 Subject: Pull from upstream. 
git-svn-id: http://leveldb.googlecode.com/svn/trunk@14 62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_impl.cc | 10 +++++----- db/db_impl.h | 2 +- db/db_iter.cc | 6 +++--- db/db_test.cc | 17 +++++++++++------ db/dbformat.cc | 2 +- db/dbformat_test.cc | 2 +- db/filename_test.cc | 2 +- db/version_edit_test.cc | 2 +- db/version_set.cc | 51 ++++++++++++++++++++++++++++--------------------- db/version_set.h | 8 +++++--- include/options.h | 10 +++++----- port/port_android.h | 21 ++++++-------------- port/port_chromium.cc | 14 +++++++------- port/port_chromium.h | 8 ++++---- port/port_example.h | 13 +++++++------ port/port_posix.h | 25 ++++++++---------------- table/format.cc | 4 ++-- table/table_builder.cc | 13 ++++++++----- table/table_test.cc | 17 +++++++++++------ util/env_chromium.cc | 2 +- util/options.cc | 2 +- 21 files changed, 118 insertions(+), 113 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 4d66044..12c02b3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -866,7 +866,7 @@ Iterator* DBImpl::TEST_NewInternalIterator() { return NewInternalIterator(ReadOptions(), &ignored); } -int64 DBImpl::TEST_MaxNextLevelOverlappingBytes() { +int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { MutexLock l(&mutex_); return versions_->MaxNextLevelOverlappingBytes(); } @@ -989,11 +989,11 @@ void DBImpl::MaybeCompressLargeValue( std::string* scratch, LargeValueRef* ref) { switch (options_.compression) { - case kLightweightCompression: { - port::Lightweight_Compress(raw_value.data(), raw_value.size(), scratch); - if (scratch->size() < (raw_value.size() / 8) * 7) { + case kSnappyCompression: { + if (port::Snappy_Compress(raw_value.data(), raw_value.size(), scratch) && + (scratch->size() < (raw_value.size() / 8) * 7)) { *file_bytes = *scratch; - *ref = LargeValueRef::Make(raw_value, kLightweightCompression); + *ref = LargeValueRef::Make(raw_value, kSnappyCompression); return; } diff --git a/db/db_impl.h b/db/db_impl.h index 980d512..6e98e3c 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -57,7 +57,7 @@ class DBImpl : public DB { // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. 
- int64 TEST_MaxNextLevelOverlappingBytes(); + int64_t TEST_MaxNextLevelOverlappingBytes(); private: friend class DB; diff --git a/db/db_iter.cc b/db/db_iter.cc index c23de22..165d7d4 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -374,10 +374,10 @@ void DBIter::ReadIndirectValue() const { } break; } - case kLightweightCompression: { + case kSnappyCompression: { std::string uncompressed; - if (port::Lightweight_Uncompress(result.data(), result.size(), - &uncompressed) && + if (port::Snappy_Uncompress(result.data(), result.size(), + &uncompressed) && uncompressed.size() == large_ref.ValueSize()) { swap(uncompressed, large_->value); } else { diff --git a/db/db_test.cc b/db/db_test.cc index f8accf6..888c560 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -674,6 +674,12 @@ TEST(DBTest, LargeValues1) { ASSERT_TRUE(LargeValuesOK(this, expected)); } +static bool SnappyCompressionSupported() { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(in.data(), in.size(), &out); +} + TEST(DBTest, LargeValues2) { Options options; options.large_value_threshold = 10000; @@ -694,12 +700,11 @@ TEST(DBTest, LargeValues2) { ASSERT_OK(Put("big2", big2)); ASSERT_EQ(big2, Get("big2")); -#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM) - // TODO(sanjay) Reenable after compression support is added - expected.insert(LargeValueRef::Make(big2, kNoCompression)); -#else - expected.insert(LargeValueRef::Make(big2, kLightweightCompression)); -#endif + if (SnappyCompressionSupported()) { + expected.insert(LargeValueRef::Make(big2, kSnappyCompression)); + } else { + expected.insert(LargeValueRef::Make(big2, kNoCompression)); + } ASSERT_TRUE(LargeValuesOK(this, expected)); ASSERT_OK(dbfull()->TEST_CompactMemTable()); diff --git a/db/dbformat.cc b/db/dbformat.cc index f09a729..2664eb4 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -140,7 +140,7 @@ bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) { ConsumeChar(&in, '-') && ConsumeDecimalNumber(&in, &ctype) && in.empty() && - (ctype <= kLightweightCompression)) { + (ctype <= kSnappyCompression)) { EncodeFixed64(&h->data[20], value_size); h->data[28] = static_cast(ctype); return true; diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index 5dfa101..702cbb4 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -117,7 +117,7 @@ TEST(FormatTest, SHA1) { LargeValueRef::Make("hello", kNoCompression))); ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, lwcompr LargeValueRefToFilenameString( - LargeValueRef::Make("hello", kLightweightCompression))); + LargeValueRef::Make("hello", kSnappyCompression))); } } diff --git a/db/filename_test.cc b/db/filename_test.cc index 08a54eb..4d2a91e 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -136,7 +136,7 @@ TEST(FileNameTest, Construction) { ASSERT_EQ(999, number); ASSERT_EQ(kTempFile, type); - for (int i = 0; i <= kLightweightCompression; i++) { + for (int i = 0; i <= kSnappyCompression; i++) { CompressionType ctype = static_cast(i); std::string value = "abcdef"; LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype); diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 50913cd..6906ec3 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -31,7 +31,7 @@ TEST(VersionEditTest, EncodeDecode) { edit.DeleteFile(4, kBig + 700 + i); edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression), kBig + 800 + i, "foobar"); - 
edit.AddLargeValueRef(LargeValueRef::Make("big2", kLightweightCompression), + edit.AddLargeValueRef(LargeValueRef::Make("big2", kSnappyCompression), kBig + 801 + i, "baz"); edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); } diff --git a/db/version_set.cc b/db/version_set.cc index 46333f4..caf0b2d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -20,9 +20,11 @@ namespace leveldb { -// Maximum number of overlaps in grandparent (i.e., level+2) before we +static const int kTargetFileSize = 2 * 1048576; + +// Maximum bytes of overlaps in grandparent (i.e., level+2) before we // stop building a single file in a level->level+1 compaction. -static const int kMaxGrandParentFiles = 10; +static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize; static double MaxBytesForLevel(int level) { if (level == 0) { @@ -38,7 +40,7 @@ static double MaxBytesForLevel(int level) { } static uint64_t MaxFileSizeForLevel(int level) { - return 2 << 20; // We could vary per level to reduce number of files? + return kTargetFileSize; // We could vary per level to reduce number of files? } namespace { @@ -755,17 +757,22 @@ void VersionSet::AddLiveFiles(std::set* live) { } } -int64 VersionSet::MaxNextLevelOverlappingBytes() { - int64 result = 0; +static int64_t TotalFileSize(const std::vector& files) { + int64_t sum = 0; + for (int i = 0; i < files.size(); i++) { + sum += files[i]->file_size; + } + return sum; +} + +int64_t VersionSet::MaxNextLevelOverlappingBytes() { + int64_t result = 0; std::vector overlaps; for (int level = 0; level < config::kNumLevels - 1; level++) { for (int i = 0; i < current_->files_[level].size(); i++) { const FileMetaData* f = current_->files_[level][i]; GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); - int64 sum = 0; - for (int j = 0; j < overlaps.size(); j++) { - sum += overlaps[j]->file_size; - } + const int64_t sum = TotalFileSize(overlaps); if (sum > result) { result = sum; } @@ -989,7 +996,8 @@ Compaction::Compaction(int level) max_output_file_size_(MaxFileSizeForLevel(level)), input_version_(NULL), grandparent_index_(0), - output_start_(-1) { + seen_key_(false), + overlapped_bytes_(0) { for (int i = 0; i < config::kNumLevels; i++) { level_ptrs_[i] = 0; } @@ -1002,12 +1010,12 @@ Compaction::~Compaction() { } bool Compaction::IsTrivialMove() const { - // Avoid a move if there are lots of overlapping grandparent files. + // Avoid a move if there is lots of overlapping grandparent data. // Otherwise, the move could create a parent file that will require // a very expensive merge later on. - return (num_input_files(0) == 1 - && num_input_files(1) == 0 - && grandparents_.size() <= kMaxGrandParentFiles); + return (num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes); } void Compaction::AddInputDeletions(VersionEdit* edit) { @@ -1044,17 +1052,16 @@ bool Compaction::ShouldStopBefore(const InternalKey& key) { const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; while (grandparent_index_ < grandparents_.size() && icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } grandparent_index_++; } + seen_key_ = true; - // First call? 
- if (output_start_ < 0) { - output_start_ = grandparent_index_; - } - - if (grandparent_index_ - output_start_ + 1 > kMaxGrandParentFiles) { - // Too many overlaps for current output; start new output - output_start_ = grandparent_index_; + if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; return true; } else { return false; diff --git a/db/version_set.h b/db/version_set.h index 0ea763d..a4199be 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -141,7 +141,7 @@ class VersionSet { // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. - int64 MaxNextLevelOverlappingBytes(); + int64_t MaxNextLevelOverlappingBytes(); // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. @@ -298,8 +298,10 @@ class Compaction { // State used to check for number of of overlapping grandparent files // (parent == level_ + 1, grandparent == level_ + 2) std::vector grandparents_; - int grandparent_index_; // Index in grandparent_starts_ - int output_start_; // Index in grandparent_starts_ where output started + int grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + int64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files // State for implementing IsBaseLevelForKey diff --git a/include/options.h b/include/options.h index 1105570..0b65624 100644 --- a/include/options.h +++ b/include/options.h @@ -22,8 +22,8 @@ class WritableFile; enum CompressionType { // NOTE: do not change the values of existing entries, as these are // part of the persistent format on disk. - kNoCompression = 0x0, - kLightweightCompression = 0x1, + kNoCompression = 0x0, + kSnappyCompression = 0x1, }; // Options to control the behavior of a database (passed to DB::Open) @@ -122,16 +122,16 @@ struct Options { // Compress blocks using the specified compression algorithm. This // parameter can be changed dynamically. // - // Default: kLightweightCompression, which gives lightweight but fast + // Default: kSnappyCompression, which gives lightweight but fast // compression. // - // Typical speeds of kLightweightCompression on an Intel(R) Core(TM)2 2.4GHz: + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: // ~200-500MB/s compression // ~400-800MB/s decompression // Note that these speeds are significantly faster than most // persistent storage speeds, and therefore it is typically never // worth switching to kNoCompression. Even if the input data is - // incompressible, the kLightweightCompression implementation will + // incompressible, the kSnappyCompression implementation will // efficiently detect that and will switch to uncompressed mode. CompressionType compression; diff --git a/port/port_android.h b/port/port_android.h index 2770a0c..ca0362d 100644 --- a/port/port_android.h +++ b/port/port_android.h @@ -82,29 +82,20 @@ class AtomicPointer { } }; -/** - * TODO(gabor): Implement actual compress - * This is a hack - it just copies input to output. - * No actual compression occurs. - */ -inline void Lightweight_Compress( +// TODO(gabor): Implement actual compress +inline bool Snappy_Compress( const char* input, size_t input_length, std::string* output) { - output->copy((char*)input,0,input_length); + return false; } -/** - * TODO(gabor): Implement actual compress - * This is a hack - it just copies input to output. 
- * No actual uncompression occurs. - */ -inline bool Lightweight_Uncompress( +// TODO(gabor): Implement actual uncompress +inline bool Snappy_Uncompress( const char* input_data, size_t input_length, std::string* output) { - output->copy((char*)input_data,0,input_length); - return (bool)1; + return false; } inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { diff --git a/port/port_chromium.cc b/port/port_chromium.cc index 4026aa3..2ab49b9 100644 --- a/port/port_chromium.cc +++ b/port/port_chromium.cc @@ -49,20 +49,21 @@ void CondVar::SignalAll() { cv_.Broadcast(); } -void Lightweight_Compress(const char* input, size_t input_length, - std::string* output) { +bool Snappy_Compress(const char* input, size_t input_length, + std::string* output) { #if defined(USE_SNAPPY) output->resize(snappy::MaxCompressedLength(input_length)); size_t outlen; snappy::RawCompress(input, input_length, &(*output)[0], &outlen); output->resize(outlen); + return true; #else - output->assign(input, input_length); + return false; #endif } -bool Lightweight_Uncompress(const char* input_data, size_t input_length, - std::string* output) { +bool Snappy_Uncompress(const char* input_data, size_t input_length, + std::string* output) { #if defined(USE_SNAPPY) size_t ulength; if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) { @@ -71,8 +72,7 @@ bool Lightweight_Uncompress(const char* input_data, size_t input_length, output->resize(ulength); return snappy::RawUncompress(input_data, input_length, &(*output)[0]); #else - output->assign(input_data, input_length); - return true; + return false; #endif } diff --git a/port/port_chromium.h b/port/port_chromium.h index b33bdde..e349f8f 100644 --- a/port/port_chromium.h +++ b/port/port_chromium.h @@ -89,10 +89,10 @@ inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { reinterpret_cast(hash_array)); } -void Lightweight_Compress(const char* input, size_t input_length, - std::string* output); -bool Lightweight_Uncompress(const char* input_data, size_t input_length, - std::string* output); +bool Snappy_Compress(const char* input, size_t input_length, + std::string* output); +bool Snappy_Uncompress(const char* input_data, size_t input_length, + std::string* output); inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { return false; diff --git a/port/port_example.h b/port/port_example.h index ee25a01..cf72617 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -96,15 +96,16 @@ extern void SHA1_Hash(const char* data, size_t len, char* hash_array); // ------------------ Compression ------------------- -// Store the lightweight compression of "input[0,input_length-1]" in *output. -extern void Lightweight_Compress(const char* input, size_t input_length, - std::string* output); +// Store the snappy compression of "input[0,input_length-1]" in *output. +// Returns false if snappy is not supported by this port. +extern bool Snappy_Compress(const char* input, size_t input_length, + std::string* output); -// Attempt to lightweight uncompress input[0,input_length-1] into *output. +// Attempt to snappy uncompress input[0,input_length-1] into *output. // Returns true if successful, false if the input is invalid lightweight // compressed data. 
-extern bool Lightweight_Uncompress(const char* input_data, size_t input_length, - std::string* output); +extern bool Snappy_Uncompress(const char* input_data, size_t input_length, + std::string* output); // ------------------ Miscellaneous ------------------- diff --git a/port/port_posix.h b/port/port_posix.h index e7bc5b8..7adbc01 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -77,25 +77,16 @@ inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { SHA1_Hash_Portable(data, len, hash_array); } -/** - * TODO(gabor): Implement actual compress - * This is a hack - it just copies input to output. - * No actual compression occurs. - */ -inline void Lightweight_Compress(const char* input, size_t input_length, - std::string* output) { - output->assign(input, input_length); +// TODO(gabor): Implement actual compress +inline bool Snappy_Compress(const char* input, size_t input_length, + std::string* output) { + return false; } -/** - * TODO(gabor): Implement actual uncompress - * This is a hack - it just copies input to output. - * No actual uncompression occurs. - */ -inline bool Lightweight_Uncompress(const char* input_data, size_t input_length, - std::string* output) { - output->assign(input_data, input_length); - return true; +// TODO(gabor): Implement actual uncompress +inline bool Snappy_Uncompress(const char* input_data, size_t input_length, + std::string* output) { + return false; } inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { diff --git a/table/format.cc b/table/format.cc index d292dad..191a9bd 100644 --- a/table/format.cc +++ b/table/format.cc @@ -106,9 +106,9 @@ Status ReadBlock(RandomAccessFile* file, // Ok break; - case kLightweightCompression: { + case kSnappyCompression: { std::string decompressed; - if (!port::Lightweight_Uncompress(data, n, &decompressed)) { + if (!port::Snappy_Uncompress(data, n, &decompressed)) { delete[] buf; s = Status::Corruption("corrupted compressed block contents"); return s; diff --git a/table/table_builder.cc b/table/table_builder.cc index 38ad392..3bc1b88 100644 --- a/table/table_builder.cc +++ b/table/table_builder.cc @@ -138,11 +138,14 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { block_contents = raw; break; - case kLightweightCompression: { - port::Lightweight_Compress(raw.data(), raw.size(), &r->compressed_output); - block_contents = r->compressed_output; - if (block_contents.size() >= raw.size() - (raw.size() / 8u)) { - // Compressed less than 12.5%, so just store uncompressed form + case kSnappyCompression: { + std::string* compressed = &r->compressed_output; + if (port::Snappy_Compress(raw.data(), raw.size(), compressed) && + compressed->size() < raw.size() - (raw.size() / 8u)) { + block_contents = *compressed; + } else { + // Snappy not supported, or compressed less than 12.5%, so just + // store uncompressed form block_contents = raw; type = kNoCompression; } diff --git a/table/table_test.cc b/table/table_test.cc index d997454..eabb257 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -800,12 +800,17 @@ TEST(TableTest, ApproximateOffsetOfPlain) { } +static bool SnappyCompressionSupported() { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(in.data(), in.size(), &out); +} + TEST(TableTest, ApproximateOffsetOfCompressed) { -#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM) - // Compression not supported yet, so skip this test. 
- // TODO(sanjay) Reenable after compression support is added - return; -#endif + if (!SnappyCompressionSupported()) { + fprintf(stderr, "skipping compression tests\n"); + return; + } Random rnd(301); TableConstructor c(BytewiseComparator()); @@ -818,7 +823,7 @@ TEST(TableTest, ApproximateOffsetOfCompressed) { KVMap kvmap; Options options; options.block_size = 1024; - options.compression = kLightweightCompression; + options.compression = kSnappyCompression; c.Finish(options, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); diff --git a/util/env_chromium.cc b/util/env_chromium.cc index fb700ae..d79406f 100644 --- a/util/env_chromium.cc +++ b/util/env_chromium.cc @@ -326,7 +326,7 @@ class ChromiumEnv : public Env { virtual Status GetFileSize(const std::string& fname, uint64_t* size) { Status s; - int64 signed_size; + int64_t signed_size; if (!::file_util::GetFileSize(CreateFilePath(fname), &signed_size)) { *size = 0; s = Status::IOError(fname, "Could not determine file size."); diff --git a/util/options.cc b/util/options.cc index b792bb1..9fb7e0a 100644 --- a/util/options.cc +++ b/util/options.cc @@ -22,7 +22,7 @@ Options::Options() block_cache(NULL), block_size(8192), block_restart_interval(16), - compression(kLightweightCompression) { + compression(kSnappyCompression) { } -- cgit v1.2.3 From 225a272621a5517a1a8dec8956457a121786d7c4 Mon Sep 17 00:00:00 2001 From: "jorlow@chromium.org" Date: Fri, 25 Mar 2011 20:27:43 +0000 Subject: Upstream changes git-svn-id: http://leveldb.googlecode.com/svn/trunk@15 62dab493-f737-651d-591e-8d6aee1b9529 --- TODO | 1 - db/db_bench.cc | 55 ++++++- db/db_iter.cc | 418 +++++++++++++++++++++++++--------------------------- db/db_test.cc | 184 +++++++++++++++++++++++ doc/index.html | 3 +- port/sha1_test.cc | 16 -- util/crc32c_test.cc | 14 -- 7 files changed, 436 insertions(+), 255 deletions(-) diff --git a/TODO b/TODO index 7d60b5a..e17dfdb 100644 --- a/TODO +++ b/TODO @@ -8,7 +8,6 @@ Maybe afterwards ss - Stats -- Speed up backwards scan (avoid three passes over data) db - Maybe implement DB::BulkDeleteForRange(start_key, end_key) diff --git a/db/db_bench.cc b/db/db_bench.cc index 72e0699..7026ca1 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -11,6 +11,8 @@ #include "include/db.h" #include "include/env.h" #include "include/write_batch.h" +#include "port/port.h" +#include "util/crc32c.h" #include "util/histogram.h" #include "util/random.h" #include "util/testutil.h" @@ -25,6 +27,8 @@ // readseq -- read N values sequentially // readreverse -- read N values in reverse order // readrandom -- read N values in random order +// crc32c -- repeated crc32c of 4K of data +// sha1 -- repeated SHA1 computation over 4K of data // Meta operations: // compact -- Compact the entire DB // heapprofile -- Dump a heap profile (if supported by this port) @@ -34,17 +38,21 @@ // normal -- reset N back to its normal value (1000000) static const char* FLAGS_benchmarks = "fillseq," + "fillsync," "fillrandom," "overwrite," - "fillsync," + "readrandom," + "readrandom," // Extra run to allow previous compactions to quiesce "readseq," "readreverse," - "readrandom," "compact," + "readrandom," "readseq," "readreverse," - "readrandom," - "fill100K"; + "fill100K," + "crc32c," + "sha1" + ; // Number of key/values to place in database static int FLAGS_num = 1000000; @@ -330,6 +338,10 @@ class Benchmark { ReadRandom(); } else if (name == Slice("compact")) { Compact(); + } else if (name == Slice("crc32c")) { + Crc32c(4096, "(4K per op)"); + } else if (name == 
Slice("sha1")) { + SHA1(4096, "(4K per op)"); } else if (name == Slice("heapprofile")) { HeapProfile(); } else { @@ -340,6 +352,41 @@ class Benchmark { } private: + void Crc32c(int size, const char* label) { + // Checksum about 500MB of data total + string data(size, 'x'); + int64_t bytes = 0; + uint32_t crc = 0; + while (bytes < 500 * 1048576) { + crc = crc32c::Value(data.data(), size); + FinishedSingleOp(); + bytes += size; + } + // Print so result is not dead + fprintf(stderr, "... crc=0x%x\r", static_cast(crc)); + + bytes_ = bytes; + message_ = label; + } + + void SHA1(int size, const char* label) { + // SHA1 about 100MB of data total + string data(size, 'x'); + int64_t bytes = 0; + char sha1[20]; + while (bytes < 100 * 1048576) { + port::SHA1_Hash(data.data(), size, sha1); + FinishedSingleOp(); + bytes += size; + } + + // Print so result is not dead + fprintf(stderr, "... sha1=%02x...\r", static_cast(sha1[0])); + + bytes_ = bytes; + message_ = label; + } + void Open() { assert(db_ == NULL); Options options; diff --git a/db/db_iter.cc b/db/db_iter.cc index 165d7d4..6726b51 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -36,6 +36,16 @@ namespace { // numbers, deletion markers, overwrites, etc. class DBIter: public Iterator { public: + // Which direction is the iterator currently moving? + // (1) When moving forward, the internal iterator is positioned at + // the exact entry that yields this->key(), this->value() + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). + enum Direction { + kForward, + kReverse + }; + DBIter(const std::string* dbname, Env* env, const Comparator* cmp, Iterator* iter, SequenceNumber s) : dbname_(dbname), @@ -44,6 +54,7 @@ class DBIter: public Iterator { iter_(iter), sequence_(s), large_(NULL), + direction_(kForward), valid_(false) { } virtual ~DBIter() { @@ -53,48 +64,21 @@ class DBIter: public Iterator { virtual bool Valid() const { return valid_; } virtual Slice key() const { assert(valid_); - return key_; + return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_; } virtual Slice value() const { assert(valid_); + Slice raw_value = (direction_ == kForward) ? 
iter_->value() : saved_value_; if (large_ == NULL) { - return value_; + return raw_value; } else { MutexLock l(&large_->mutex); if (!large_->produced) { - ReadIndirectValue(); + ReadIndirectValue(raw_value); } return large_->value; } } - - virtual void Next() { - assert(valid_); - // iter_ is already positioned past DBIter::key() - FindNextUserEntry(); - } - - virtual void Prev() { - assert(valid_); - bool ignored; - ScanUntilBeforeCurrentKey(&ignored); - FindPrevUserEntry(); - } - - virtual void Seek(const Slice& target) { - ParsedInternalKey ikey(target, sequence_, kValueTypeForSeek); - std::string tmp; - AppendInternalKey(&tmp, ikey); - iter_->Seek(tmp); - FindNextUserEntry(); - } - virtual void SeekToFirst() { - iter_->SeekToFirst(); - FindNextUserEntry(); - } - - virtual void SeekToLast(); - virtual Status status() const { if (status_.ok()) { if (large_ != NULL && !large_->status.ok()) return large_->status; @@ -104,23 +88,13 @@ class DBIter: public Iterator { } } - private: - void FindNextUserEntry(); - void FindPrevUserEntry(); - void SaveKey(const Slice& k) { key_.assign(k.data(), k.size()); } - void SaveValue(const Slice& v) { - if (value_.capacity() > v.size() + 1048576) { - std::string empty; - swap(empty, value_); - } - value_.assign(v.data(), v.size()); - } - bool ParseKey(ParsedInternalKey* key); - void SkipPast(const Slice& k); - void ScanUntilBeforeCurrentKey(bool* found_live); - - void ReadIndirectValue() const; + virtual void Next(); + virtual void Prev(); + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + private: struct Large { port::Mutex mutex; std::string value; @@ -128,19 +102,42 @@ class DBIter: public Iterator { Status status; }; + void FindNextUserEntry(bool skipping, std::string* skip); + void FindPrevUserEntry(); + bool ParseKey(ParsedInternalKey* key); + void ReadIndirectValue(Slice ref) const; + + inline void SaveKey(const Slice& k, std::string* dst) { + dst->assign(k.data(), k.size()); + } + + inline void ForgetLargeValue() { + if (large_ != NULL) { + delete large_; + large_ = NULL; + } + } + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + const std::string* const dbname_; Env* const env_; - const Comparator* const user_comparator_; - - // iter_ is positioned just past current entry for DBIter if valid_ Iterator* const iter_; - SequenceNumber const sequence_; + Status status_; - std::string key_; // Always a user key - std::string value_; - Large* large_; // Non-NULL if value is an indirect reference + std::string saved_key_; // == current key when direction_==kReverse + std::string saved_value_; // == current raw value when direction_==kReverse + Large* large_; // Non-NULL if value is an indirect reference + Direction direction_; bool valid_; // No copying allowed @@ -157,204 +154,189 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { } } -void DBIter::FindNextUserEntry() { - if (large_ != NULL) { - if (status_.ok() && !large_->status.ok()) { - status_ = large_->status; - } - delete large_; - large_ = NULL; - } - while (iter_->Valid()) { - ParsedInternalKey ikey; - if (!ParseKey(&ikey)) { - // Skip past corrupted entry - iter_->Next(); - continue; - } - if (ikey.sequence > sequence_) { - // Ignore entries newer than the snapshot +void DBIter::Next() { + assert(valid_); + ForgetLargeValue(); + + if (direction_ == kReverse) { // Switch directions? 
+ direction_ = kForward; + // iter_ is pointing just before the entries for this->key(), + // so advance into the range of entries for this->key() and then + // use the normal skipping code below. + if (!iter_->Valid()) { + iter_->SeekToFirst(); + } else { iter_->Next(); - continue; } - - switch (ikey.type) { - case kTypeDeletion: - SaveKey(ikey.user_key); // Make local copy for use by SkipPast() - iter_->Next(); - SkipPast(key_); - // Do not return deleted entries. Instead keep looping. - break; - - case kTypeValue: - SaveKey(ikey.user_key); - SaveValue(iter_->value()); - iter_->Next(); - SkipPast(key_); - // Yield the value we just found. - valid_ = true; - return; - - case kTypeLargeValueRef: - SaveKey(ikey.user_key); - // Save the large value ref as value_, and read it lazily on a call - // to value() - SaveValue(iter_->value()); - large_ = new Large; - large_->produced = false; - iter_->Next(); - SkipPast(key_); - // Yield the value we just found. - valid_ = true; - return; + if (!iter_->Valid()) { + valid_ = false; + saved_key_.clear(); + return; } } - valid_ = false; - key_.clear(); - value_.clear(); - assert(large_ == NULL); + + // Temporarily use saved_key_ as storage for key to skip. + std::string* skip = &saved_key_; + SaveKey(ExtractUserKey(iter_->key()), skip); + FindNextUserEntry(true, skip); } -void DBIter::SkipPast(const Slice& k) { - while (iter_->Valid()) { +void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { + // Loop until we hit an acceptable entry to yield + assert(iter_->Valid()); + assert(direction_ == kForward); + assert(large_ == NULL); + do { ParsedInternalKey ikey; - // Note that if we cannot parse an internal key, we keep looping - // so that if we have a run like the following: - // => value100 - // - // => value50 - // we will skip over the corrupted entry as well as value50. - if (ParseKey(&ikey) && user_comparator_->Compare(ikey.user_key, k) != 0) { - break; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. + SaveKey(ikey.user_key, skip); + skipping = true; + break; + case kTypeValue: + case kTypeLargeValueRef: + if (skipping && + user_comparator_->Compare(ikey.user_key, *skip) <= 0) { + // Entry hidden + } else { + valid_ = true; + saved_key_.clear(); + if (ikey.type == kTypeLargeValueRef) { + large_ = new Large; + large_->produced = false; + } + return; + } + break; + } } iter_->Next(); - } + } while (iter_->Valid()); + saved_key_.clear(); + valid_ = false; } -void DBIter::SeekToLast() { - // Position iter_ at the last uncorrupted user key and then - // let FindPrevUserEntry() do the heavy lifting to find - // a user key that is live. - iter_->SeekToLast(); - ParsedInternalKey current; - while (iter_->Valid() && !ParseKey(¤t)) { - iter_->Prev(); - } - if (iter_->Valid()) { - SaveKey(current.user_key); - } - FindPrevUserEntry(); -} +void DBIter::Prev() { + assert(valid_); + ForgetLargeValue(); -// Let X be the user key at which iter_ is currently positioned. -// Adjust DBIter to point at the last entry with a key <= X that -// has a live value. -void DBIter::FindPrevUserEntry() { - // Consider the following example: - // - // A@540 - // A@400 - // - // B@300 - // B@200 - // B@100 <- iter_ - // - // C@301 - // C@201 - // - // The comments marked "(first iteration)" below relate what happens - // for the preceding example in the first iteration of the while loop - // below. 
There may be more than one iteration either if there are - // no live values for B, or if there is a corruption. - while (iter_->Valid()) { - std::string saved = key_; - bool found_live; - ScanUntilBeforeCurrentKey(&found_live); - // (first iteration) iter_ at A@400 - if (found_live) { - // Step forward into range of entries with user key >= saved + if (direction_ == kForward) { // Switch directions? + // iter_ is pointing at the current entry. Scan backwards until + // the key changes so we can use the normal reverse scanning code. + assert(iter_->Valid()); // Otherwise valid_ would have been false + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); + while (true) { + iter_->Prev(); if (!iter_->Valid()) { - iter_->SeekToFirst(); - } else { - iter_->Next(); - } - // (first iteration) iter_ at B@300 - - FindNextUserEntry(); // Sets key_ to the key of the next value it found - if (valid_ && user_comparator_->Compare(key_, saved) == 0) { - // (first iteration) iter_ at C@301 + valid_ = false; + saved_key_.clear(); + ClearSavedValue(); return; } - - // FindNextUserEntry() could not find any entries under the - // user key "saved". This is probably a corruption since - // ScanUntilBefore(saved) found a live value. So we skip - // backwards to an earlier key and ignore the corrupted - // entries for "saved". - // - // (first iteration) iter_ at C@301 and saved == "B" - key_ = saved; - bool ignored; - ScanUntilBeforeCurrentKey(&ignored); - // (first iteration) iter_ at A@400 + if (user_comparator_->Compare(ExtractUserKey(iter_->key()), + saved_key_) < 0) { + break; + } } + direction_ = kReverse; } - valid_ = false; - key_.clear(); - value_.clear(); + + FindPrevUserEntry(); } -void DBIter::ScanUntilBeforeCurrentKey(bool* found_live) { - *found_live = false; - if (!iter_->Valid()) { - iter_->SeekToLast(); - } +void DBIter::FindPrevUserEntry() { + assert(direction_ == kReverse); + assert(large_ == NULL); - while (iter_->Valid()) { - ParsedInternalKey current; - if (!ParseKey(¤t)) { + ValueType value_type = kTypeDeletion; + if (iter_->Valid()) { + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if ((value_type != kTypeDeletion) && + user_comparator_->Compare(ikey.user_key, saved_key_) < 0) { + // We encountered a non-deleted value in entries for previous keys, + break; + } + value_type = ikey.type; + if (value_type == kTypeDeletion) { + ClearSavedValue(); + } else { + Slice raw_value = iter_->value(); + if (saved_value_.capacity() > raw_value.size() + 1048576) { + std::string empty; + swap(empty, saved_value_); + } + saved_value_.assign(raw_value.data(), raw_value.size()); + } + } iter_->Prev(); - continue; - } + } while (iter_->Valid()); + } - if (current.sequence > sequence_) { - // Ignore entries that are serialized after this read - iter_->Prev(); - continue; + if (value_type == kTypeDeletion) { + // End + valid_ = false; + saved_key_.clear(); + ClearSavedValue(); + direction_ = kForward; + } else { + valid_ = true; + if (value_type == kTypeLargeValueRef) { + large_ = new Large; + large_->produced = false; } + } +} - const int cmp = user_comparator_->Compare(current.user_key, key_); - if (cmp < 0) { - SaveKey(current.user_key); - return; - } else if (cmp == 0) { - switch (current.type) { - case kTypeDeletion: - *found_live = false; - break; - - case kTypeValue: - case kTypeLargeValueRef: - *found_live = true; - break; - } - } else { // cmp > 0 - *found_live = false; - } +void DBIter::Seek(const Slice& 
target) { + direction_ = kForward; + ForgetLargeValue(); + ClearSavedValue(); + saved_key_.clear(); + AppendInternalKey( + &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); + iter_->Seek(saved_key_); + if (iter_->Valid()) { + FindNextUserEntry(false, &saved_key_ /* temporary storage */); + } else { + valid_ = false; + } +} - iter_->Prev(); +void DBIter::SeekToFirst() { + direction_ = kForward; + ForgetLargeValue(); + ClearSavedValue(); + iter_->SeekToFirst(); + if (iter_->Valid()) { + FindNextUserEntry(false, &saved_key_ /* temporary storage */); + } else { + valid_ = false; } } -void DBIter::ReadIndirectValue() const { +void DBIter::SeekToLast() { + direction_ = kReverse; + ForgetLargeValue(); + ClearSavedValue(); + iter_->SeekToLast(); + FindPrevUserEntry(); +} + +void DBIter::ReadIndirectValue(Slice ref) const { assert(!large_->produced); large_->produced = true; LargeValueRef large_ref; - if (value_.size() != LargeValueRef::ByteSize()) { + if (ref.size() != LargeValueRef::ByteSize()) { large_->status = Status::Corruption("malformed large value reference"); return; } - memcpy(large_ref.data, value_.data(), LargeValueRef::ByteSize()); + memcpy(large_ref.data, ref.data(), LargeValueRef::ByteSize()); std::string fname = LargeValueFileName(*dbname_, large_ref); RandomAccessFile* file; Status s = env_->NewRandomAccessFile(fname, &file); diff --git a/db/db_test.cc b/db/db_test.cc index 888c560..0414176 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -209,6 +209,16 @@ class DBTest { } } } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } }; TEST(DBTest, Empty) { @@ -234,6 +244,180 @@ TEST(DBTest, PutDeleteGet) { ASSERT_EQ("NOT_FOUND", Get("foo")); } +TEST(DBTest, IterEmpty) { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterSingle) { + ASSERT_OK(Put("a", "va")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterMulti) { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", "vb")); + ASSERT_OK(Put("c", "vc")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + 
ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + iter->Next(); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Make sure iter stays at snapshot + ASSERT_OK(Put("a", "va2")); + ASSERT_OK(Put("a2", "va3")); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Put("c", "vc2")); + ASSERT_OK(Delete("b")); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterSmallAndLargeMix) { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", std::string(100000, 'b'))); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Put("d", std::string(100000, 'd'))); + ASSERT_OK(Put("e", std::string(100000, 'e'))); + + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + TEST(DBTest, Recover) { ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("baz", "v5")); diff --git a/doc/index.html b/doc/index.html index 53471d2..e0baf2e 100644 --- a/doc/index.html +++ b/doc/index.html @@ -132,8 +132,7 @@ range [start,limit): } You can also process entries in reverse order. (Caveat: reverse -iteration is currently a factor of two or three slower than forward -iteration.) +iteration may be somewhat slower than forward iteration.)

   for (it->SeekToLast(); it->Valid(); it->Prev()) {
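
The db_iter.cc rewrite above is what justifies the relaxed caveat: Prev() now scans backwards directly, tracking a kForward/kReverse direction, instead of re-scanning forward through each key's entries. A minimal client-side sketch of driving both scan directions through the public API (the database path is made up, and the include path follows the leveldb/ layout adopted later in this series):

    #include <cassert>
    #include <cstdio>
    #include "leveldb/db.h"

    int main() {
      leveldb::DB* db = NULL;
      leveldb::Options options;
      options.create_if_missing = true;
      leveldb::Status s = leveldb::DB::Open(options, "/tmp/iterdemo", &db);
      assert(s.ok());

      leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
      // Forward scan: DBIter stays in kForward, reading straight off iter_.
      for (it->SeekToFirst(); it->Valid(); it->Next()) {
        printf("%s -> %s\n", it->key().ToString().c_str(),
               it->value().ToString().c_str());
      }
      // Reverse scan: DBIter flips to kReverse and serves key()/value()
      // out of saved_key_/saved_value_.
      for (it->SeekToLast(); it->Valid(); it->Prev()) {
        printf("%s\n", it->key().ToString().c_str());
      }
      delete it;  // always delete iterators before the DB that owns them
      delete db;
      return 0;
    }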
diff --git a/port/sha1_test.cc b/port/sha1_test.cc
index 46bbeba..b182e67 100644
--- a/port/sha1_test.cc
+++ b/port/sha1_test.cc
@@ -31,22 +31,6 @@ TEST(SHA1, Simple) {
             TestSHA1(x.data(), x.size()));
 }
 
-TEST(SHA1, Benchmark) {
-  std::string data(1048576 * 100, 'x');
-  double start = Env::Default()->NowMicros() * 1e-6;
-  static const int kIters = 10;
-  uint32_t sha1 = 0;
-  for (int i = 0; i < kIters; i++) {
-    char hash_val[20];
-    SHA1_Hash(data.data(), data.size(), hash_val);
-    sha1 |= hash_val[0];
-  }
-  double finish = Env::Default()->NowMicros() * 1e-6;
-  double mb = (static_cast<double>(data.size()) * kIters) / 1048576.0;
-  fprintf(stderr, "SHA1 %0.0f MB: %.3f secs; %.1f MB/s, dummy=0x%02x\n",
-          mb, (finish - start), mb / (finish - start), sha1);
-}
-
 }
 }
 
diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc
index a7fc758..ba9e804 100644
--- a/util/crc32c_test.cc
+++ b/util/crc32c_test.cc
@@ -64,20 +64,6 @@ TEST(CRC, Mask) {
   ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
 }
 
-TEST(CRC, Benchmark) {
-  std::string data(1048576 * 100, 'x');
-  double start = Env::Default()->NowMicros() * 1e-6;
-  static const int kIters = 10;
-  uint32_t crc = 0;
-  for (int i = 0; i < kIters; i++) {
-    crc |= Value(data.data(), data.size());
-  }
-  double finish = Env::Default()->NowMicros() * 1e-6;
-  double mb = (static_cast<double>(data.size()) * kIters) / 1048576.0;
-  fprintf(stderr, "CRC %0.0f MB: %.3f secs; %.1f MB/s, crc=0x%08x\n",
-          mb, (finish - start), mb / (finish - start), crc);
-}
-
 }
 }
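
The microbenchmarks deleted from the two test files above were folded into db_bench as the Benchmark::Crc32c() and Benchmark::SHA1() methods added by this same series. A standalone sketch of the equivalent CRC measurement loop (hypothetical program; crc32c::Value and Env::NowMicros are the real APIs, include paths per the later leveldb/ layout):

    #include <stdint.h>
    #include <stdio.h>
    #include <string>
    #include "leveldb/env.h"
    #include "util/crc32c.h"

    int main() {
      std::string data(4096, 'x');
      const double start = leveldb::Env::Default()->NowMicros() * 1e-6;
      int64_t bytes = 0;
      uint32_t crc = 0;
      while (bytes < 500 * 1048576) {        // same 500MB total as db_bench
        crc = leveldb::crc32c::Value(data.data(), data.size());
        bytes += data.size();
      }
      const double secs = leveldb::Env::Default()->NowMicros() * 1e-6 - start;
      // Print crc so the checksum loop is not optimized away.
      fprintf(stderr, "CRC %.1f MB/s (crc=0x%08x)\n",
              (bytes / 1048576.0) / secs, crc);
      return 0;
    }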
 
-- 
cgit v1.2.3


From f85ede82f8c27a00c3120f67fbab89b2a89fe987 Mon Sep 17 00:00:00 2001
From: "jorlow@chromium.org"
 
Date: Mon, 28 Mar 2011 20:43:44 +0000
Subject: Upstream changes.

git-svn-id: http://leveldb.googlecode.com/svn/trunk@16 62dab493-f737-651d-591e-8d6aee1b9529
---
 db/builder.cc        |  4 +++-
 db/db_bench.cc       |  4 ++--
 db/db_impl.cc        |  4 +++-
 db/db_iter.cc        |  5 ++++-
 db/log_reader.cc     |  6 ++----
 db/log_writer.cc     | 11 ++++++-----
 db/repair.cc         |  2 +-
 db/table_cache.cc    |  3 ++-
 db/table_cache.h     | 15 ++++++++-------
 db/version_set.cc    | 22 ++++++++++++++--------
 doc/log_format.txt   | 15 +++++++++------
 include/env.h        |  3 ---
 include/table.h      |  6 ++++--
 table/table.cc       |  2 +-
 table/table_test.cc  |  4 ++--
 util/env_chromium.cc | 16 +++-------------
 util/env_posix.cc    | 16 +++-------------
 17 files changed, 67 insertions(+), 71 deletions(-)

diff --git a/db/builder.cc b/db/builder.cc
index f3d0fe2..d5585c3 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -74,7 +74,9 @@ Status BuildTable(const std::string& dbname,
 
     if (s.ok()) {
       // Verify that the table is usable
-      Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number);
+      Iterator* it = table_cache->NewIterator(ReadOptions(),
+                                              meta->number,
+                                              meta->file_size);
       s = it->status();
       delete it;
     }
diff --git a/db/db_bench.cc b/db/db_bench.cc
index 7026ca1..c7a662d 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -354,7 +354,7 @@ class Benchmark {
  private:
   void Crc32c(int size, const char* label) {
     // Checksum about 500MB of data total
-    string data(size, 'x');
+    std::string data(size, 'x');
     int64_t bytes = 0;
     uint32_t crc = 0;
     while (bytes < 500 * 1048576) {
@@ -371,7 +371,7 @@ class Benchmark {
 
   void SHA1(int size, const char* label) {
     // SHA1 about 100MB of data total
-    string data(size, 'x');
+    std::string data(size, 'x');
     int64_t bytes = 0;
     char sha1[20];
     while (bytes < 100 * 1048576) {
diff --git a/db/db_impl.cc b/db/db_impl.cc
index 12c02b3..f14167a 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -642,7 +642,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
 
   if (s.ok() && current_entries > 0) {
     // Verify that the table is usable
-    Iterator* iter = table_cache_->NewIterator(ReadOptions(),output_number);
+    Iterator* iter = table_cache_->NewIterator(ReadOptions(),
+                                               output_number,
+                                               current_bytes);
     s = iter->status();
     delete iter;
     if (s.ok()) {
diff --git a/db/db_iter.cc b/db/db_iter.cc
index 6726b51..beb4d74 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -340,8 +340,11 @@ void DBIter::ReadIndirectValue(Slice ref) const {
   std::string fname = LargeValueFileName(*dbname_, large_ref);
   RandomAccessFile* file;
   Status s = env_->NewRandomAccessFile(fname, &file);
+  uint64_t file_size = 0;
+  if (s.ok()) {
+    s = env_->GetFileSize(fname, &file_size);
+  }
   if (s.ok()) {
-    uint64_t file_size = file->Size();
     uint64_t value_size = large_ref.ValueSize();
     large_->value.resize(value_size);
     Slice result;
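
This hunk anticipates the removal of RandomAccessFile::Size() later in this commit: file sizes now come from the Env, checked step by step. A sketch of the resulting pattern (the wrapper function is hypothetical; NewRandomAccessFile and GetFileSize are the real Env entry points):

    #include <stdint.h>
    #include <string>
    #include "leveldb/env.h"

    leveldb::Status SizeThenOpen(leveldb::Env* env, const std::string& fname,
                                 leveldb::RandomAccessFile** file,
                                 uint64_t* size) {
      // Open first, then stat by name; either failure propagates as Status.
      leveldb::Status s = env->NewRandomAccessFile(fname, file);
      if (s.ok()) {
        s = env->GetFileSize(fname, size);
      }
      return s;
    }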
diff --git a/db/log_reader.cc b/db/log_reader.cc
index 39a6d2b..407700d 100644
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -105,7 +105,7 @@ void Reader::ReportDrop(size_t bytes, const char* reason) {
 
 unsigned int Reader::ReadPhysicalRecord(Slice* result) {
   while (true) {
-    if (buffer_.size() <= kHeaderSize) {
+    if (buffer_.size() < kHeaderSize) {
       if (!eof_) {
         // Last read was a full read, so this is a trailer to skip
         buffer_.clear();
@@ -124,12 +124,10 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
       } else if (buffer_.size() == 0) {
         // End of file
         return kEof;
-      } else if (buffer_.size() < kHeaderSize) {
+      } else {
         ReportDrop(buffer_.size(), "truncated record at end of file");
         buffer_.clear();
         return kEof;
-      } else {
-        // We have a trailing zero-length record.  Fall through and check it.
       }
     }
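
The comparison loosens from <= to < because a record header is seven bytes (4-byte checksum, 2-byte length, 1-byte type), so a zero-length record fits in exactly seven leftover bytes and only a shorter remainder is guaranteed to be block trailer. A tiny sketch of the boundary (hypothetical helper; kHeaderSize is real, from db/log_format.h):

    #include "db/log_format.h"

    // A zero-length record is exactly one header, so 7 leftover bytes can
    // still hold a record; only fewer than 7 are guaranteed trailer.
    static bool IsTrailer(int leftover_in_block) {
      return leftover_in_block < leveldb::log::kHeaderSize;  // not <=
    }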
 
diff --git a/db/log_writer.cc b/db/log_writer.cc
index 465eca2..fc33e6e 100644
--- a/db/log_writer.cc
+++ b/db/log_writer.cc
@@ -35,18 +35,19 @@ Status Writer::AddRecord(const Slice& slice) {
   do {
     const int leftover = kBlockSize - block_offset_;
     assert(leftover >= 0);
-    if (leftover <= kHeaderSize) {
+    if (leftover < kHeaderSize) {
       // Switch to a new block
       if (leftover > 0) {
-        // Fill the trailer
-        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00", leftover));
+        // Fill the trailer (literal below relies on kHeaderSize being 7)
+        assert(kHeaderSize == 7);
+        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
       }
       block_offset_ = 0;
     }
 
-    // Invariant: we never leave <= kHeaderSize bytes in a block.
+    // Invariant: we never leave < kHeaderSize bytes in a block.
     const int avail = kBlockSize - block_offset_ - kHeaderSize;
-    assert(avail > 0);
+    assert(avail >= 0);
 
     const size_t fragment_length = (left < avail) ? left : avail;
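
The writer side of the same boundary: since padding now happens only when leftover < kHeaderSize, at most six zero bytes are ever written, hence the shortened literal and the new assert. A sketch of the block-switch step in isolation (hypothetical helper; kBlockSize and kHeaderSize are from db/log_format.h):

    #include "db/log_format.h"
    #include "leveldb/env.h"
    #include "leveldb/slice.h"

    void MaybeSwitchBlock(leveldb::WritableFile* dest, int* block_offset) {
      const int leftover = leveldb::log::kBlockSize - *block_offset;
      if (leftover < leveldb::log::kHeaderSize) {
        if (leftover > 0) {
          // leftover <= 6 here, so six zero bytes always suffice.
          dest->Append(leveldb::Slice("\x00\x00\x00\x00\x00\x00", leftover));
        }
        *block_offset = 0;
      }
    }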
 
diff --git a/db/repair.cc b/db/repair.cc
index 0727914..745b31a 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -261,7 +261,7 @@ class Repairer {
     Status status = env_->GetFileSize(fname, &t->meta.file_size);
     if (status.ok()) {
       Iterator* iter = table_cache_->NewIterator(
-          ReadOptions(), t->meta.number);
+          ReadOptions(), t->meta.number, t->meta.file_size);
       bool empty = true;
       ParsedInternalKey parsed;
       t->max_sequence = 0;
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 604298d..6f750d6 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -44,6 +44,7 @@ TableCache::~TableCache() {
 
 Iterator* TableCache::NewIterator(const ReadOptions& options,
                                   uint64_t file_number,
+                                  uint64_t file_size,
                                   Table** tableptr) {
   if (tableptr != NULL) {
     *tableptr = NULL;
@@ -59,7 +60,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
     Table* table = NULL;
     Status s = env_->NewRandomAccessFile(fname, &file);
     if (s.ok()) {
-      s = Table::Open(*options_, file, &table);
+      s = Table::Open(*options_, file, file_size, &table);
     }
 
     if (!s.ok()) {
diff --git a/db/table_cache.h b/db/table_cache.h
index 6c357df..5564dfc 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -23,15 +23,16 @@ class TableCache {
   TableCache(const std::string& dbname, const Options* options, int entries);
   ~TableCache();
 
-  // Get an iterator for the specified file number and return it.  If
-  // "tableptr" is non-NULL, also sets "*tableptr" to point to the
-  // Table object underlying the returned iterator, or NULL if no
-  // Table object underlies the returned iterator.  The returned
-  // "*tableptr" object is owned by the cache and should not be
-  // deleted, and is valid for as long as the returned iterator is
-  // live.
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes).  If "tableptr" is
+  // non-NULL, also sets "*tableptr" to point to the Table object
+  // underlying the returned iterator, or NULL if no Table object underlies
+  // the returned iterator.  The returned "*tableptr" object is owned by
+  // the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
   Iterator* NewIterator(const ReadOptions& options,
                         uint64_t file_number,
+                        uint64_t file_size,
                         Table** tableptr = NULL);
 
   // Evict any entry for the specified file number
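
A hypothetical caller, to make the new contract concrete: the exact file length is supplied up front rather than discovered by the cache, and any returned *tableptr stays owned by the cache:

    #include "db/table_cache.h"

    leveldb::Iterator* OpenTableIterator(leveldb::TableCache* cache,
                                         uint64_t file_number,
                                         uint64_t file_size) {
      leveldb::Table* table = NULL;
      leveldb::Iterator* iter = cache->NewIterator(
          leveldb::ReadOptions(), file_number, file_size, &table);
      // *table (if set) is owned by the cache and must not be deleted;
      // it remains valid for as long as iter is live.
      return iter;
    }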
diff --git a/db/version_set.cc b/db/version_set.cc
index caf0b2d..b826e5b 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -75,8 +75,8 @@ Version::~Version() {
 // An internal iterator.  For a given version/level pair, yields
 // information about the files in the level.  For a given entry, key()
 // is the largest key that occurs in the file, and value() is an
-// 8-byte value containing the file number of the file, encoding using
-// EncodeFixed64.
+// 16-byte value containing the file number and file size, both
+// encoded using EncodeFixed64.
 class Version::LevelFileNumIterator : public Iterator {
  public:
   LevelFileNumIterator(const Version* version,
@@ -129,6 +129,7 @@ class Version::LevelFileNumIterator : public Iterator {
   Slice value() const {
     assert(Valid());
     EncodeFixed64(value_buf_, (*flist_)[index_]->number);
+    EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size);
     return Slice(value_buf_, sizeof(value_buf_));
   }
   virtual Status status() const { return Status::OK(); }
@@ -137,18 +138,21 @@ class Version::LevelFileNumIterator : public Iterator {
   const std::vector<FileMetaData*>* const flist_;
   int index_;
 
-  mutable char value_buf_[8];  // Used for encoding the file number for value()
+  // Backing store for value().  Holds the file number and size.
+  mutable char value_buf_[16];
 };
 
 static Iterator* GetFileIterator(void* arg,
                                  const ReadOptions& options,
                                  const Slice& file_value) {
   TableCache* cache = reinterpret_cast<TableCache*>(arg);
-  if (file_value.size() != 8) {
+  if (file_value.size() != 16) {
     return NewErrorIterator(
         Status::Corruption("FileReader invoked with unexpected value"));
   } else {
-    return cache->NewIterator(options, DecodeFixed64(file_value.data()));
+    return cache->NewIterator(options,
+                              DecodeFixed64(file_value.data()),
+                              DecodeFixed64(file_value.data() + 8));
   }
 }
 
@@ -164,7 +168,8 @@ void Version::AddIterators(const ReadOptions& options,
   // Merge all level zero files together since they may overlap
   for (int i = 0; i < files_[0].size(); i++) {
     iters->push_back(
-        vset_->table_cache_->NewIterator(options, files_[0][i]->number));
+        vset_->table_cache_->NewIterator(
+            options, files_[0][i]->number, files_[0][i]->file_size));
   }
 
   // For levels > 0, we can use a concatenating iterator that sequentially
@@ -650,7 +655,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
         // approximate offset of "ikey" within the table.
         Table* tableptr;
         Iterator* iter = table_cache_->NewIterator(
-            ReadOptions(), files[i]->number, &tableptr);
+            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
         if (tableptr != NULL) {
           result += tableptr->ApproximateOffsetOf(ikey.Encode());
         }
@@ -855,7 +860,8 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
       if (c->level() + which == 0) {
           const std::vector<FileMetaData*>& files = c->inputs_[which];
         for (int i = 0; i < files.size(); i++) {
-          list[num++] = table_cache_->NewIterator(options, files[i]->number);
+          list[num++] = table_cache_->NewIterator(
+              options, files[i]->number, files[i]->file_size);
         }
       } else {
         // Create concatenating iterator for the files from this level
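
The new value format is simply two fixed 64-bit fields. A sketch using the real EncodeFixed64/DecodeFixed64 helpers from util/coding.h (the wrapper functions themselves are hypothetical):

    #include "util/coding.h"

    void EncodeFileEntry(char buf[16], uint64_t number, uint64_t size) {
      leveldb::EncodeFixed64(buf, number);    // bytes 0..7: file number
      leveldb::EncodeFixed64(buf + 8, size);  // bytes 8..15: file size
    }

    void DecodeFileEntry(const leveldb::Slice& v,
                         uint64_t* number, uint64_t* size) {
      // GetFileIterator above rejects values that are not exactly 16 bytes.
      *number = leveldb::DecodeFixed64(v.data());
      *size = leveldb::DecodeFixed64(v.data() + 8);
    }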
diff --git a/doc/log_format.txt b/doc/log_format.txt
index 9a801d4..3a0414b 100644
--- a/doc/log_format.txt
+++ b/doc/log_format.txt
@@ -9,12 +9,15 @@ Each block consists of a sequence of records:
 	type: uint8		// One of FULL, FIRST, MIDDLE, LAST
 	data: uint8[length]
 
-A record never starts within the last seven bytes of a block.  Any
-leftover bytes here form the trailer, which must consist entirely of
-zero bytes and must be skipped by readers.  In particular, even if
-there are exactly seven bytes left in the block, and a zero-length
-user record is added (which will fit in these seven bytes), the writer
-must skip these trailer bytes and add the record to the next block.
+A record never starts within the last six bytes of a block (since it
+won't fit).  Any leftover bytes here form the trailer, which must
+consist entirely of zero bytes and must be skipped by readers.  
+
+Aside: if exactly seven bytes are left in the current block, and a new
+non-zero length record is added, the writer must emit a FIRST record
+(which contains zero bytes of user data) to fill up the trailing seven
+bytes of the block and then emit all of the user data in subsequent
+blocks.
 
 More types may be added in the future.  Some Readers may skip record
 types they do not understand, others may report that some data was
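
A worked instance of the seven-byte rule described above (a sketch, not the actual Writer code):

    // A record of n user bytes costs 7 + n bytes.  With exactly 7 bytes
    // left in a block, only n == 0 fits, so the writer emits a FIRST
    // record with zero bytes of data and continues in the next block.
    int BytesNeeded(int user_data_len) {
      return 7 + user_data_len;  // 4 checksum + 2 length + 1 type
    }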
diff --git a/include/env.h b/include/env.h
index a728f29..14ddf29 100644
--- a/include/env.h
+++ b/include/env.h
@@ -168,9 +168,6 @@ class RandomAccessFile {
   RandomAccessFile() { }
   virtual ~RandomAccessFile();
 
-  // Return the length of this file in bytes.
-  virtual uint64_t Size() const = 0;
-
   // Read up to "n" bytes from the file starting at "offset".
   // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
   // to the data that was read (including if fewer than "n" bytes were
diff --git a/include/table.h b/include/table.h
index 96b2196..c2a4cf9 100644
--- a/include/table.h
+++ b/include/table.h
@@ -20,8 +20,9 @@ struct ReadOptions;
 // immutable and persistent.
 class Table {
  public:
-  // Attempt to open the table that is stored in "file", and read the
-  // metadata entries necessary to allow retrieving data from the table.
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
   //
   // If successful, returns ok and sets "*table" to the newly opened
   // table.  The client should delete "*table" when no longer needed.
@@ -33,6 +34,7 @@ class Table {
   // *file must remain live while this Table is in use.
   static Status Open(const Options& options,
                      RandomAccessFile* file,
+                     uint64_t file_size,
                      Table** table);
 
   ~Table();
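
Putting this commit's pieces together, a hypothetical open path under the new signature, with Env::GetFileSize standing in for the deleted RandomAccessFile::Size():

    #include <stdint.h>
    #include <string>
    #include "leveldb/env.h"
    #include "leveldb/table.h"

    leveldb::Status OpenTable(const leveldb::Options& options,
                              leveldb::Env* env, const std::string& fname,
                              leveldb::Table** table) {
      leveldb::RandomAccessFile* file = NULL;
      uint64_t size = 0;
      leveldb::Status s = env->NewRandomAccessFile(fname, &file);
      if (s.ok()) s = env->GetFileSize(fname, &size);
      if (s.ok()) s = leveldb::Table::Open(options, file, size, table);
      if (!s.ok()) delete file;  // on success *file must outlive the Table
      return s;
    }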
diff --git a/table/table.cc b/table/table.cc
index dffc217..bd0fbb5 100644
--- a/table/table.cc
+++ b/table/table.cc
@@ -29,9 +29,9 @@ struct Table::Rep {
 
 Status Table::Open(const Options& options,
                    RandomAccessFile* file,
+                   uint64_t size,
                    Table** table) {
   *table = NULL;
-  const uint64_t size = file->Size();
   if (size < Footer::kEncodedLength) {
     return Status::InvalidArgument("file is too short to be an sstable");
   }
diff --git a/table/table_test.cc b/table/table_test.cc
index eabb257..d67c58b 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -110,7 +110,7 @@ class StringSource: public RandomAccessFile {
 
   virtual ~StringSource() { }
 
-  virtual uint64_t Size() const { return contents_.size(); }
+  uint64_t Size() const { return contents_.size(); }
 
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                        char* scratch) const {
@@ -246,7 +246,7 @@ class TableConstructor: public Constructor {
     source_ = new StringSource(sink.contents());
     Options table_options;
     table_options.comparator = options.comparator;
-    return Table::Open(table_options, source_, &table_);
+    return Table::Open(table_options, source_, sink.contents().size(), &table_);
   }
   virtual size_t NumBytes() const { return source_->Size(); }
 
diff --git a/util/env_chromium.cc b/util/env_chromium.cc
index d79406f..834ec2d 100644
--- a/util/env_chromium.cc
+++ b/util/env_chromium.cc
@@ -144,17 +144,13 @@ class ChromiumSequentialFile: public SequentialFile {
 class ChromiumRandomAccessFile: public RandomAccessFile {
  private:
   std::string filename_;
-  uint64_t size_;
   ::base::PlatformFile file_;
 
  public:
-  ChromiumRandomAccessFile(const std::string& fname, uint64_t size,
-                           ::base::PlatformFile file)
-      : filename_(fname), size_(size), file_(file) { }
+  ChromiumRandomAccessFile(const std::string& fname, ::base::PlatformFile file)
+      : filename_(fname), file_(file) { }
   virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); }
 
-  virtual uint64_t Size() const { return size_; }
-
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const {
     Status s;
@@ -256,13 +252,7 @@ class ChromiumEnv : public Env {
       *result = NULL;
       return Status::IOError(fname, PlatformFileErrorString(error_code));
     }
-    ::base::PlatformFileInfo info;
-    if (!::base::GetPlatformFileInfo(file, &info)) {
-      *result = NULL;
-      ::base::ClosePlatformFile(file);
-      return Status::IOError(fname, PlatformFileErrorString(error_code));
-    }
-    *result = new ChromiumRandomAccessFile(fname, info.size, file);
+    *result = new ChromiumRandomAccessFile(fname, file);
     return Status::OK();
   }
 
diff --git a/util/env_posix.cc b/util/env_posix.cc
index f5174d3..5c58449 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -57,16 +57,13 @@ class PosixSequentialFile: public SequentialFile {
 class PosixRandomAccessFile: public RandomAccessFile {
  private:
   std::string filename_;
-  uint64_t size_;
   int fd_;
 
  public:
-  PosixRandomAccessFile(const std::string& fname, uint64_t size, int fd)
-      : filename_(fname), size_(size), fd_(fd) { }
+  PosixRandomAccessFile(const std::string& fname, int fd)
+      : filename_(fname), fd_(fd) { }
   virtual ~PosixRandomAccessFile() { close(fd_); }
 
-  virtual uint64_t Size() const { return size_; }
-
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const {
     Status s;
@@ -286,14 +283,7 @@ class PosixEnv : public Env {
       *result = NULL;
       return Status::IOError(fname, strerror(errno));
     }
-    struct stat sbuf;
-    if (fstat(fd, &sbuf) != 0) {
-      *result = NULL;
-      Status s = Status::IOError(fname, strerror(errno));
-      close(fd);
-      return s;
-    }
-    *result = new PosixRandomAccessFile(fname, sbuf.st_size, fd);
+    *result = new PosixRandomAccessFile(fname, fd);
     return Status::OK();
   }
 
-- 
cgit v1.2.3


From c2dc30f8224d3f84b917abb6d9657ca2c7c9b2a5 Mon Sep 17 00:00:00 2001
From: "jorlow@chromium.org"
 
Date: Tue, 29 Mar 2011 22:41:11 +0000
Subject: Upstream change.

git-svn-id: http://leveldb.googlecode.com/svn/trunk@17 62dab493-f737-651d-591e-8d6aee1b9529
---
 util/env_chromium.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/env_chromium.cc b/util/env_chromium.cc
index 834ec2d..3cbf190 100644
--- a/util/env_chromium.cc
+++ b/util/env_chromium.cc
@@ -9,10 +9,10 @@
 #include "base/file_path.h"
 #include "base/file_util.h"
 #include "base/lazy_instance.h"
+#include "base/memory/ref_counted.h"
 #include "base/message_loop.h"
 #include "base/platform_file.h"
 #include "base/process_util.h"
-#include "base/ref_counted.h"
 #include "base/synchronization/lock.h"
 #include "base/sys_info.h"
 #include "base/task.h"
-- 
cgit v1.2.3


From fbd97aa4c5325eace57d24b89845b9581bac9324 Mon Sep 17 00:00:00 2001
From: "jorlow@chromium.org"
 
Date: Wed, 30 Mar 2011 18:35:40 +0000
Subject: Move include files into a leveldb subdir.

git-svn-id: http://leveldb.googlecode.com/svn/trunk@18 62dab493-f737-651d-591e-8d6aee1b9529
---
 Makefile                    |  2 +-
 db/builder.cc               |  6 +++---
 db/builder.h                |  2 +-
 db/corruption_test.cc       |  8 ++++----
 db/db_bench.cc              |  8 ++++----
 db/db_impl.cc               | 10 +++++-----
 db/db_impl.h                |  4 ++--
 db/db_iter.cc               |  4 ++--
 db/db_iter.h                |  2 +-
 db/db_test.cc               |  6 +++---
 db/dbformat.h               |  8 ++++----
 db/filename.cc              |  2 +-
 db/filename.h               |  4 ++--
 db/log_reader.cc            |  2 +-
 db/log_reader.h             |  4 ++--
 db/log_test.cc              |  2 +-
 db/log_writer.cc            |  2 +-
 db/log_writer.h             |  4 ++--
 db/memtable.cc              |  6 +++---
 db/memtable.h               |  2 +-
 db/repair.cc                |  6 +++---
 db/skiplist_test.cc         |  2 +-
 db/snapshot.h               |  2 +-
 db/table_cache.cc           |  4 ++--
 db/table_cache.h            |  4 ++--
 db/version_set.cc           |  4 ++--
 db/write_batch.cc           |  4 ++--
 db/write_batch_internal.h   |  2 +-
 db/write_batch_test.cc      |  4 ++--
 leveldb.gyp                 | 28 +++++++++++++++++-----------
 table/block.cc              |  2 +-
 table/block.h               |  2 +-
 table/block_builder.cc      |  4 ++--
 table/block_builder.h       |  2 +-
 table/format.cc             |  2 +-
 table/format.h              |  6 +++---
 table/iterator.cc           |  2 +-
 table/merger.cc             |  4 ++--
 table/table.cc              |  6 +++---
 table/table_builder.cc      |  6 +++---
 table/table_test.cc         | 10 +++++-----
 table/two_level_iterator.cc |  2 +-
 table/two_level_iterator.h  |  2 +-
 util/cache.cc               |  2 +-
 util/cache_test.cc          |  2 +-
 util/coding.h               |  2 +-
 util/comparator.cc          |  4 ++--
 util/env.cc                 |  2 +-
 util/env_chromium.cc        |  4 ++--
 util/env_posix.cc           |  4 ++--
 util/env_test.cc            |  2 +-
 util/logging.cc             |  4 ++--
 util/options.cc             |  6 +++---
 util/status.cc              |  2 +-
 util/testharness.h          |  4 ++--
 util/testutil.h             |  4 ++--
 56 files changed, 123 insertions(+), 117 deletions(-)

diff --git a/Makefile b/Makefile
index 8fbcddf..7569701 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ CC = g++
 #OPT = -O2 -DNDEBUG
 OPT = -g2
 
-CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. -std=c++0x $(OPT)
+CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. -I./include -std=c++0x $(OPT)
 
 LDFLAGS=-lpthread
 
diff --git a/db/builder.cc b/db/builder.cc
index d5585c3..6c8e6b8 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -8,9 +8,9 @@
 #include "db/dbformat.h"
 #include "db/table_cache.h"
 #include "db/version_edit.h"
-#include "include/db.h"
-#include "include/env.h"
-#include "include/iterator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
 
 namespace leveldb {
 
diff --git a/db/builder.h b/db/builder.h
index 2d8afdf..4efcb04 100644
--- a/db/builder.h
+++ b/db/builder.h
@@ -5,7 +5,7 @@
 #ifndef STORAGE_LEVELDB_DB_BUILDER_H_
 #define STORAGE_LEVELDB_DB_BUILDER_H_
 
-#include "include/status.h"
+#include "leveldb/status.h"
 
 namespace leveldb {
 
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
index 1f4f26c..de9408c 100644
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -2,15 +2,15 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/db.h"
+#include "leveldb/db.h"
 
 #include <errno.h>
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
-#include "include/env.h"
-#include "include/table.h"
-#include "include/write_batch.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "leveldb/write_batch.h"
 #include "db/db_impl.h"
 #include "db/filename.h"
 #include "db/log_format.h"
diff --git a/db/db_bench.cc b/db/db_bench.cc
index c7a662d..411493c 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -7,10 +7,10 @@
 #include <stdlib.h>
 #include "db/db_impl.h"
 #include "db/version_set.h"
-#include "include/cache.h"
-#include "include/db.h"
-#include "include/env.h"
-#include "include/write_batch.h"
+#include "leveldb/cache.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/write_batch.h"
 #include "port/port.h"
 #include "util/crc32c.h"
 #include "util/histogram.h"
diff --git a/db/db_impl.cc b/db/db_impl.cc
index f14167a..cf5471b 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -20,11 +20,11 @@
 #include "db/table_cache.h"
 #include "db/version_set.h"
 #include "db/write_batch_internal.h"
-#include "include/db.h"
-#include "include/env.h"
-#include "include/status.h"
-#include "include/table.h"
-#include "include/table_builder.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/status.h"
+#include "leveldb/table.h"
+#include "leveldb/table_builder.h"
 #include "port/port.h"
 #include "table/block.h"
 #include "table/merger.h"
diff --git a/db/db_impl.h b/db/db_impl.h
index 6e98e3c..49ac37b 100644
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -9,8 +9,8 @@
 #include "db/dbformat.h"
 #include "db/log_writer.h"
 #include "db/snapshot.h"
-#include "include/db.h"
-#include "include/env.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
 #include "port/port.h"
 
 namespace leveldb {
diff --git a/db/db_iter.cc b/db/db_iter.cc
index beb4d74..31c2a38 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -6,8 +6,8 @@
 
 #include "db/filename.h"
 #include "db/dbformat.h"
-#include "include/env.h"
-#include "include/iterator.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
 #include "port/port.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
diff --git a/db/db_iter.h b/db/db_iter.h
index a0be50e..195f3d3 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -6,7 +6,7 @@
 #define STORAGE_LEVELDB_DB_DB_ITER_H_
 
 #include <stdint.h>
-#include "include/db.h"
+#include "leveldb/db.h"
 #include "db/dbformat.h"
 
 namespace leveldb {
diff --git a/db/db_test.cc b/db/db_test.cc
index 0414176..f68e759 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -2,14 +2,14 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/db.h"
+#include "leveldb/db.h"
 
 #include "db/db_impl.h"
 #include "db/filename.h"
 #include "db/version_set.h"
 #include "db/write_batch_internal.h"
-#include "include/env.h"
-#include "include/table.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
 #include "util/logging.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
diff --git a/db/dbformat.h b/db/dbformat.h
index e784457..6f34cd1 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -6,10 +6,10 @@
 #define STORAGE_LEVELDB_DB_FORMAT_H_
 
 #include <stdio.h>
-#include "include/comparator.h"
-#include "include/db.h"
-#include "include/slice.h"
-#include "include/table_builder.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/slice.h"
+#include "leveldb/table_builder.h"
 #include "util/coding.h"
 #include "util/logging.h"
 
diff --git a/db/filename.cc b/db/filename.cc
index 55e6d28..d21918c 100644
--- a/db/filename.cc
+++ b/db/filename.cc
@@ -6,7 +6,7 @@
 #include <stdio.h>
 #include "db/filename.h"
 #include "db/dbformat.h"
-#include "include/env.h"
+#include "leveldb/env.h"
 #include "util/logging.h"
 
 namespace leveldb {
diff --git a/db/filename.h b/db/filename.h
index 3fd2ea4..81ab2fc 100644
--- a/db/filename.h
+++ b/db/filename.h
@@ -9,8 +9,8 @@
 
 #include <stdint.h>
 #include <string>
-#include "include/slice.h"
-#include "include/status.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
 #include "port/port.h"
 
 namespace leveldb {
diff --git a/db/log_reader.cc b/db/log_reader.cc
index 407700d..75e1d28 100644
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -5,7 +5,7 @@
 #include "db/log_reader.h"
 
 #include <stdio.h>
-#include "include/env.h"
+#include "leveldb/env.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
 
diff --git a/db/log_reader.h b/db/log_reader.h
index 515d2af..baf1475 100644
--- a/db/log_reader.h
+++ b/db/log_reader.h
@@ -6,8 +6,8 @@
 #define STORAGE_LEVELDB_DB_LOG_READER_H_
 
 #include "db/log_format.h"
-#include "include/slice.h"
-#include "include/status.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
 
 namespace leveldb {
 
diff --git a/db/log_test.cc b/db/log_test.cc
index 5fa20aa..025a5ff 100644
--- a/db/log_test.cc
+++ b/db/log_test.cc
@@ -4,7 +4,7 @@
 
 #include "db/log_reader.h"
 #include "db/log_writer.h"
-#include "include/env.h"
+#include "leveldb/env.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
 #include "util/random.h"
diff --git a/db/log_writer.cc b/db/log_writer.cc
index fc33e6e..18ca37a 100644
--- a/db/log_writer.cc
+++ b/db/log_writer.cc
@@ -5,7 +5,7 @@
 #include "db/log_writer.h"
 
 #include <stdint.h>
-#include "include/env.h"
+#include "leveldb/env.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
 
diff --git a/db/log_writer.h b/db/log_writer.h
index 13c64ba..d3cf27d 100644
--- a/db/log_writer.h
+++ b/db/log_writer.h
@@ -7,8 +7,8 @@
 
 #include <stdint.h>
 #include "db/log_format.h"
-#include "include/slice.h"
-#include "include/status.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
 
 namespace leveldb {
 
diff --git a/db/memtable.cc b/db/memtable.cc
index 349cfcc..a3b618a 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -4,9 +4,9 @@
 
 #include "db/memtable.h"
 #include "db/dbformat.h"
-#include "include/comparator.h"
-#include "include/env.h"
-#include "include/iterator.h"
+#include "leveldb/comparator.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
 #include "util/coding.h"
 
 namespace leveldb {
diff --git a/db/memtable.h b/db/memtable.h
index fa95e15..45b3342 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -6,7 +6,7 @@
 #define STORAGE_LEVELDB_DB_MEMTABLE_H_
 
 #include <string>
-#include "include/db.h"
+#include "leveldb/db.h"
 #include "db/dbformat.h"
 #include "db/skiplist.h"
 #include "util/arena.h"
diff --git a/db/repair.cc b/db/repair.cc
index 745b31a..014e00e 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -36,9 +36,9 @@
 #include "db/table_cache.h"
 #include "db/version_edit.h"
 #include "db/write_batch_internal.h"
-#include "include/comparator.h"
-#include "include/db.h"
-#include "include/env.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
 
 namespace leveldb {
 
diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc
index 0cfc893..5f9ec0d 100644
--- a/db/skiplist_test.cc
+++ b/db/skiplist_test.cc
@@ -4,7 +4,7 @@
 
 #include "db/skiplist.h"
 #include <set>
-#include "include/env.h"
+#include "leveldb/env.h"
 #include "util/arena.h"
 #include "util/hash.h"
 #include "util/random.h"
diff --git a/db/snapshot.h b/db/snapshot.h
index 6a07f80..9a90756 100644
--- a/db/snapshot.h
+++ b/db/snapshot.h
@@ -5,7 +5,7 @@
 #ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
 #define STORAGE_LEVELDB_DB_SNAPSHOT_H_
 
-#include "include/db.h"
+#include "leveldb/db.h"
 
 namespace leveldb {
 
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 6f750d6..325d707 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -5,8 +5,8 @@
 #include "db/table_cache.h"
 
 #include "db/filename.h"
-#include "include/env.h"
-#include "include/table.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
 #include "util/coding.h"
 
 namespace leveldb {
diff --git a/db/table_cache.h b/db/table_cache.h
index 5564dfc..5376194 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -10,8 +10,8 @@
 #include <string>
 #include <stdint.h>
 #include "db/dbformat.h"
-#include "include/cache.h"
-#include "include/table.h"
+#include "leveldb/cache.h"
+#include "leveldb/table.h"
 #include "port/port.h"
 
 namespace leveldb {
diff --git a/db/version_set.cc b/db/version_set.cc
index b826e5b..dc9b418 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -11,8 +11,8 @@
 #include "db/log_writer.h"
 #include "db/memtable.h"
 #include "db/table_cache.h"
-#include "include/env.h"
-#include "include/table_builder.h"
+#include "leveldb/env.h"
+#include "leveldb/table_builder.h"
 #include "table/merger.h"
 #include "table/two_level_iterator.h"
 #include "util/coding.h"
diff --git a/db/write_batch.cc b/db/write_batch.cc
index b6c4979..e84e548 100644
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -14,9 +14,9 @@
 //    len: varint32
 //    data: uint8[len]
 
-#include "include/write_batch.h"
+#include "leveldb/write_batch.h"
 
-#include "include/db.h"
+#include "leveldb/db.h"
 #include "db/dbformat.h"
 #include "db/memtable.h"
 #include "db/write_batch_internal.h"
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
index df750c7..ea28e2d 100644
--- a/db/write_batch_internal.h
+++ b/db/write_batch_internal.h
@@ -5,7 +5,7 @@
 #ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
 #define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
 
-#include "include/write_batch.h"
+#include "leveldb/write_batch.h"
 
 namespace leveldb {
 
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
index 4963579..deb8411 100644
--- a/db/write_batch_test.cc
+++ b/db/write_batch_test.cc
@@ -2,11 +2,11 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/db.h"
+#include "leveldb/db.h"
 
 #include "db/memtable.h"
 #include "db/write_batch_internal.h"
-#include "include/env.h"
+#include "leveldb/env.h"
 #include "util/logging.h"
 #include "util/testharness.h"
 
diff --git a/leveldb.gyp b/leveldb.gyp
index eb809f3..d10ac33 100644
--- a/leveldb.gyp
+++ b/leveldb.gyp
@@ -12,6 +12,7 @@
     ],
     'include_dirs': [
       '.',
+      'include/',
     ],
     'conditions': [
       ['OS == "win"', {
@@ -42,6 +43,11 @@
           ],
         }],
       ],
+      'direct_dependent_settings': {
+        'include_dirs': [
+          'include/',
+        ],
+      },
       'sources': [
         # Include and then exclude so that all files show up in IDEs, even if
         # they don't build.
@@ -73,17 +79,17 @@
         'db/version_set.h',
         'db/write_batch.cc',
         'db/write_batch_internal.h',
-        'include/cache.h',
-        'include/comparator.h',
-        'include/db.h',
-        'include/env.h',
-        'include/iterator.h',
-        'include/options.h',
-        'include/slice.h',
-        'include/status.h',
-        'include/table.h',
-        'include/table_builder.h',
-        'include/write_batch.h',
+        'include/leveldb/cache.h',
+        'include/leveldb/comparator.h',
+        'include/leveldb/db.h',
+        'include/leveldb/env.h',
+        'include/leveldb/iterator.h',
+        'include/leveldb/options.h',
+        'include/leveldb/slice.h',
+        'include/leveldb/status.h',
+        'include/leveldb/table.h',
+        'include/leveldb/table_builder.h',
+        'include/leveldb/write_batch.h',
         'port/port.h',
         'port/port_chromium.cc',
         'port/port_chromium.h',
diff --git a/table/block.cc b/table/block.cc
index 351eb48..0525d2d 100644
--- a/table/block.cc
+++ b/table/block.cc
@@ -8,7 +8,7 @@
 
 #include <vector>
 #include <algorithm>
-#include "include/comparator.h"
+#include "leveldb/comparator.h"
 #include "util/coding.h"
 #include "util/logging.h"
 
diff --git a/table/block.h b/table/block.h
index 9372001..cdf0598 100644
--- a/table/block.h
+++ b/table/block.h
@@ -7,7 +7,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
-#include "include/iterator.h"
+#include "leveldb/iterator.h"
 
 namespace leveldb {
 
diff --git a/table/block_builder.cc b/table/block_builder.cc
index 2c33492..ae18b36 100644
--- a/table/block_builder.cc
+++ b/table/block_builder.cc
@@ -30,8 +30,8 @@
 
 #include <algorithm>
 #include <assert.h>
-#include "include/comparator.h"
-#include "include/table_builder.h"
+#include "leveldb/comparator.h"
+#include "leveldb/table_builder.h"
 #include "util/coding.h"
 
 namespace leveldb {
diff --git a/table/block_builder.h b/table/block_builder.h
index beab168..bf92a0f 100644
--- a/table/block_builder.h
+++ b/table/block_builder.h
@@ -8,7 +8,7 @@
 #include <vector>
 
 #include <stdint.h>
-#include "include/slice.h"
+#include "leveldb/slice.h"
 
 namespace leveldb {
 
diff --git a/table/format.cc b/table/format.cc
index 191a9bd..8c6b0f3 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -4,7 +4,7 @@
 
 #include "table/format.h"
 
-#include "include/env.h"
+#include "leveldb/env.h"
 #include "port/port.h"
 #include "table/block.h"
 #include "util/coding.h"
diff --git a/table/format.h b/table/format.h
index 03e3ee2..a6ab964 100644
--- a/table/format.h
+++ b/table/format.h
@@ -7,9 +7,9 @@
 
 #include <string>
 #include <stdint.h>
-#include "include/slice.h"
-#include "include/status.h"
-#include "include/table_builder.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+#include "leveldb/table_builder.h"
 
 namespace leveldb {
 
diff --git a/table/iterator.cc b/table/iterator.cc
index f3c0856..4ddd55f 100644
--- a/table/iterator.cc
+++ b/table/iterator.cc
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/iterator.h"
+#include "leveldb/iterator.h"
 #include "util/logging.h"
 
 namespace leveldb {
diff --git a/table/merger.cc b/table/merger.cc
index afa8b77..6ce06bb 100644
--- a/table/merger.cc
+++ b/table/merger.cc
@@ -4,8 +4,8 @@
 
 #include "table/merger.h"
 
-#include "include/comparator.h"
-#include "include/iterator.h"
+#include "leveldb/comparator.h"
+#include "leveldb/iterator.h"
 #include "table/iterator_wrapper.h"
 
 namespace leveldb {
diff --git a/table/table.cc b/table/table.cc
index bd0fbb5..9820753 100644
--- a/table/table.cc
+++ b/table/table.cc
@@ -2,10 +2,10 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/table.h"
+#include "leveldb/table.h"
 
-#include "include/cache.h"
-#include "include/env.h"
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
 #include "table/block.h"
 #include "table/format.h"
 #include "table/two_level_iterator.h"
diff --git a/table/table_builder.cc b/table/table_builder.cc
index 3bc1b88..7ec7ad2 100644
--- a/table/table_builder.cc
+++ b/table/table_builder.cc
@@ -2,12 +2,12 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/table_builder.h"
+#include "leveldb/table_builder.h"
 
 #include 
 #include 
-#include "include/comparator.h"
-#include "include/env.h"
+#include "leveldb/comparator.h"
+#include "leveldb/env.h"
 #include "table/block_builder.h"
 #include "table/format.h"
 #include "util/coding.h"
diff --git a/table/table_test.cc b/table/table_test.cc
index d67c58b..e0c7134 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -2,16 +2,16 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/table.h"
+#include "leveldb/table.h"
 
 #include <map>
 #include "db/dbformat.h"
 #include "db/memtable.h"
 #include "db/write_batch_internal.h"
-#include "include/db.h"
-#include "include/env.h"
-#include "include/iterator.h"
-#include "include/table_builder.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "leveldb/table_builder.h"
 #include "table/block.h"
 #include "table/block_builder.h"
 #include "table/format.h"
diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc
index 9b081f4..24a1241 100644
--- a/table/two_level_iterator.cc
+++ b/table/two_level_iterator.cc
@@ -4,7 +4,7 @@
 
 #include "table/two_level_iterator.h"
 
-#include "include/table.h"
+#include "leveldb/table.h"
 #include "table/block.h"
 #include "table/format.h"
 #include "table/iterator_wrapper.h"
diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h
index 57e439c..5909e2b 100644
--- a/table/two_level_iterator.h
+++ b/table/two_level_iterator.h
@@ -5,7 +5,7 @@
 #ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
 #define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
 
-#include "include/iterator.h"
+#include "leveldb/iterator.h"
 
 namespace leveldb {
 
diff --git a/util/cache.cc b/util/cache.cc
index 958de66..d8a4426 100644
--- a/util/cache.cc
+++ b/util/cache.cc
@@ -12,7 +12,7 @@
 
 #include <stdlib.h>
 
-#include "include/cache.h"
+#include "leveldb/cache.h"
 #include "port/port.h"
 #include "util/hash.h"
 #include "util/mutexlock.h"
diff --git a/util/cache_test.cc b/util/cache_test.cc
index 05de5d9..dbab988 100644
--- a/util/cache_test.cc
+++ b/util/cache_test.cc
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/cache.h"
+#include "leveldb/cache.h"
 
 #include <vector>
 #include "util/coding.h"
diff --git a/util/coding.h b/util/coding.h
index a42e714..8755968 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -13,7 +13,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <string>
-#include "include/slice.h"
+#include "leveldb/slice.h"
 #include "port/port.h"
 
 namespace leveldb {
diff --git a/util/comparator.cc b/util/comparator.cc
index dca3b4d..e2b27e3 100644
--- a/util/comparator.cc
+++ b/util/comparator.cc
@@ -3,8 +3,8 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include <stdint.h>
-#include "include/comparator.h"
-#include "include/slice.h"
+#include "leveldb/comparator.h"
+#include "leveldb/slice.h"
 #include "util/logging.h"
 
 namespace leveldb {
diff --git a/util/env.cc b/util/env.cc
index 3c2ca89..e5297e7 100644
--- a/util/env.cc
+++ b/util/env.cc
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/env.h"
+#include "leveldb/env.h"
 
 namespace leveldb {
 
diff --git a/util/env_chromium.cc b/util/env_chromium.cc
index 3cbf190..7edc7a9 100644
--- a/util/env_chromium.cc
+++ b/util/env_chromium.cc
@@ -19,8 +19,8 @@
 #include "base/threading/platform_thread.h"
 #include "base/threading/thread.h"
 #include "base/utf_string_conversions.h"
-#include "include/env.h"
-#include "include/slice.h"
+#include "leveldb/env.h"
+#include "leveldb/slice.h"
 #include "port/port.h"
 #include "util/logging.h"
 
diff --git a/util/env_posix.cc b/util/env_posix.cc
index 5c58449..5cddb0c 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -19,8 +19,8 @@
 #if defined(LEVELDB_PLATFORM_ANDROID)
 #include <sys/stat.h>
 #endif
-#include "include/env.h"
-#include "include/slice.h"
+#include "leveldb/env.h"
+#include "leveldb/slice.h"
 #include "port/port.h"
 #include "util/logging.h"
 
diff --git a/util/env_test.cc b/util/env_test.cc
index 4d17564..3c253be 100644
--- a/util/env_test.cc
+++ b/util/env_test.cc
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/env.h"
+#include "leveldb/env.h"
 
 #include "port/port.h"
 #include "util/testharness.h"
diff --git a/util/logging.cc b/util/logging.cc
index 6b7c410..5c9bd4a 100644
--- a/util/logging.cc
+++ b/util/logging.cc
@@ -8,8 +8,8 @@
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include "include/env.h"
-#include "include/slice.h"
+#include "leveldb/env.h"
+#include "leveldb/slice.h"
 
 namespace leveldb {
 
diff --git a/util/options.cc b/util/options.cc
index 9fb7e0a..421608b 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -2,10 +2,10 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include "include/options.h"
+#include "leveldb/options.h"
 
-#include "include/comparator.h"
-#include "include/env.h"
+#include "leveldb/comparator.h"
+#include "leveldb/env.h"
 
 namespace leveldb {
 
diff --git a/util/status.cc b/util/status.cc
index 2ed799d..d9b7195 100644
--- a/util/status.cc
+++ b/util/status.cc
@@ -4,7 +4,7 @@
 
 #include <stdio.h>
 #include "port/port.h"
-#include "include/status.h"
+#include "leveldb/status.h"
 
 namespace leveldb {
 
diff --git a/util/testharness.h b/util/testharness.h
index 93309dc..13ab914 100644
--- a/util/testharness.h
+++ b/util/testharness.h
@@ -8,8 +8,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <sstream>
-#include "include/env.h"
-#include "include/slice.h"
+#include "leveldb/env.h"
+#include "leveldb/slice.h"
 #include "util/random.h"
 
 namespace leveldb {
diff --git a/util/testutil.h b/util/testutil.h
index 0e8a177..a150c1a 100644
--- a/util/testutil.h
+++ b/util/testutil.h
@@ -5,8 +5,8 @@
 #ifndef STORAGE_LEVELDB_UTIL_TESTUTIL_H_
 #define STORAGE_LEVELDB_UTIL_TESTUTIL_H_
 
-#include "include/env.h"
-#include "include/slice.h"
+#include "leveldb/env.h"
+#include "leveldb/slice.h"
 #include "util/random.h"
 
 namespace leveldb {
-- 
cgit v1.2.3


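The patch above is purely mechanical: every '#include "include/foo.h"' directive in the sources is rewritten to the new '#include "leveldb/foo.h"' spelling. The commit below ("Fix last commit") completes the change by actually moving the public headers from include/ to include/leveldb/. For a client source file (this snippet is illustrative, not part of the tree), the net effect of the pair of commits is:

    // Before these two commits:
    #include "include/db.h"
    // After them (with include/ on the compiler's header search path):
    #include "leveldb/db.h"
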
From 1ca60b12c68a71aac695b15e329b2a76a63cbb0a Mon Sep 17 00:00:00 2001
From: "jorlow@chromium.org"
Date: Wed, 30 Mar 2011 18:49:03 +0000
Subject: Fix last commit

git-svn-id: http://leveldb.googlecode.com/svn/trunk@19 62dab493-f737-651d-591e-8d6aee1b9529
---
 include/cache.h                 |  99 --------------
 include/comparator.h            |  61 ---------
 include/db.h                    | 140 -------------------
 include/env.h                   | 290 ----------------------------------------
 include/iterator.h              |  95 -------------
 include/leveldb/cache.h         |  99 ++++++++++++++
 include/leveldb/comparator.h    |  61 +++++++++
 include/leveldb/db.h            | 140 +++++++++++++++++++
 include/leveldb/env.h           | 290 ++++++++++++++++++++++++++++++++++++++++
 include/leveldb/iterator.h      |  95 +++++++++++++
 include/leveldb/options.h       | 203 ++++++++++++++++++++++++++++
 include/leveldb/slice.h         | 104 ++++++++++++++
 include/leveldb/status.h        |  86 ++++++++++++
 include/leveldb/table.h         |  69 ++++++++++
 include/leveldb/table_builder.h |  86 ++++++++++++
 include/leveldb/write_batch.h   |  49 +++++++
 include/options.h               | 203 ----------------------------
 include/slice.h                 | 104 --------------
 include/status.h                |  86 ------------
 include/table.h                 |  69 ----------
 include/table_builder.h         |  86 ------------
 include/write_batch.h           |  49 -------
 22 files changed, 1282 insertions(+), 1282 deletions(-)
 delete mode 100644 include/cache.h
 delete mode 100644 include/comparator.h
 delete mode 100644 include/db.h
 delete mode 100644 include/env.h
 delete mode 100644 include/iterator.h
 create mode 100644 include/leveldb/cache.h
 create mode 100644 include/leveldb/comparator.h
 create mode 100644 include/leveldb/db.h
 create mode 100644 include/leveldb/env.h
 create mode 100644 include/leveldb/iterator.h
 create mode 100644 include/leveldb/options.h
 create mode 100644 include/leveldb/slice.h
 create mode 100644 include/leveldb/status.h
 create mode 100644 include/leveldb/table.h
 create mode 100644 include/leveldb/table_builder.h
 create mode 100644 include/leveldb/write_batch.h
 delete mode 100644 include/options.h
 delete mode 100644 include/slice.h
 delete mode 100644 include/status.h
 delete mode 100644 include/table.h
 delete mode 100644 include/table_builder.h
 delete mode 100644 include/write_batch.h

diff --git a/include/cache.h b/include/cache.h
deleted file mode 100644
index 6c98cb8..0000000
--- a/include/cache.h
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// A Cache is an interface that maps keys to values.  It has internal
-// synchronization and may be safely accessed concurrently from
-// multiple threads.  It may automatically evict entries to make room
-// for new entries.  Values have a specified charge against the cache
-// capacity.  For example, a cache where the values are variable
-// length strings may use the length of the string as the charge for
-// the string.
-//
-// A builtin cache implementation with a least-recently-used eviction
-// policy is provided.  Clients may use their own implementations if
-// they want something more sophisticated (like scan-resistance, a
-// custom eviction policy, variable cache sizing, etc.)
-
-#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_
-#define STORAGE_LEVELDB_INCLUDE_CACHE_H_
-
-#include <stdint.h>
-#include "include/slice.h"
-
-namespace leveldb {
-
-class Cache;
-
-// Create a new cache with a fixed size capacity.  This implementation
-// of Cache uses a least-recently-used eviction policy.
-extern Cache* NewLRUCache(size_t capacity);
-
-class Cache {
- public:
-  Cache() { }
-
-  // Destroys all existing entries by calling the "deleter"
-  // function that was passed to Insert().
-  virtual ~Cache();
-
-  // Opaque handle to an entry stored in the cache.
-  struct Handle { };
-
-  // Insert a mapping from key->value into the cache and assign it
-  // the specified charge against the total cache capacity.
-  //
-  // Returns a handle that corresponds to the mapping.  The caller
-  // must call this->Release(handle) when the returned mapping is no
-  // longer needed.
-  //
-  // When the inserted entry is no longer needed, the key and
-  // value will be passed to "deleter".
-  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
-                         void (*deleter)(const Slice& key, void* value)) = 0;
-
-  // If the cache has no mapping for "key", returns NULL.
-  //
-  // Else return a handle that corresponds to the mapping.  The caller
-  // must call this->Release(handle) when the returned mapping is no
-  // longer needed.
-  virtual Handle* Lookup(const Slice& key) = 0;
-
-  // Release a mapping returned by a previous Lookup().
-  // REQUIRES: handle must not have been released yet.
-  // REQUIRES: handle must have been returned by a method on *this.
-  virtual void Release(Handle* handle) = 0;
-
-  // Return the value encapsulated in a handle returned by a
-  // successful Lookup().
-  // REQUIRES: handle must not have been released yet.
-  // REQUIRES: handle must have been returned by a method on *this.
-  virtual void* Value(Handle* handle) = 0;
-
-  // If the cache contains an entry for key, erase it.  Note that the
-  // underlying entry will be kept around until all existing handles
-  // to it have been released.
-  virtual void Erase(const Slice& key) = 0;
-
-  // Return a new numeric id.  May be used by multiple clients who are
-  // sharing the same cache to partition the key space.  Typically the
-  // client will allocate a new id at startup and prepend the id to
-  // its cache keys.
-  virtual uint64_t NewId() = 0;
-
- private:
-  void LRU_Remove(Handle* e);
-  void LRU_Append(Handle* e);
-  void Unref(Handle* e);
-
-  struct Rep;
-  Rep* rep_;
-
-  // No copying allowed
-  Cache(const Cache&);
-  void operator=(const Cache&);
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_CACHE_H_
diff --git a/include/comparator.h b/include/comparator.h
deleted file mode 100644
index 4e00e4d..0000000
--- a/include/comparator.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
-#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
-
-#include <string>
-
-namespace leveldb {
-
-class Slice;
-
-// A Comparator object provides a total order across slices that are
-// used as keys in an sstable or a database.
-class Comparator {
- public:
-  virtual ~Comparator();
-
-  // Three-way comparison.  Returns value:
-  //   < 0 iff "a" < "b",
-  //   == 0 iff "a" == "b",
-  //   > 0 iff "a" > "b"
-  virtual int Compare(const Slice& a, const Slice& b) const = 0;
-
-  // The name of the comparator.  Used to check for comparator
-  // mismatches (i.e., a DB created with one comparator is
-  // accessed using a different comparator).
-  //
-  // The client of this package should switch to a new name whenever
-  // the comparator implementation changes in a way that will cause
-  // the relative ordering of any two keys to change.
-  //
-  // Names starting with "leveldb." are reserved and should not be used
-  // by any clients of this package.
-  virtual const char* Name() const = 0;
-
-  // Advanced functions: these are used to reduce the space requirements
-  // for internal data structures like index blocks.
-
-  // If *start < limit, changes *start to a short string in [start,limit).
-  // Simple comparator implementations may return with *start unchanged,
-  // i.e., an implementation of this method that does nothing is correct.
-  virtual void FindShortestSeparator(
-      std::string* start,
-      const Slice& limit) const = 0;
-
-  // Changes *key to a short string >= *key.
-  // Simple comparator implementations may return with *key unchanged,
-  // i.e., an implementation of this method that does nothing is correct.
-  virtual void FindShortSuccessor(std::string* key) const = 0;
-};
-
-// Return a builtin comparator that uses lexicographic byte-wise
-// ordering.  The result remains the property of this module and
-// must not be deleted.
-extern const Comparator* BytewiseComparator();
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
diff --git a/include/db.h b/include/db.h
deleted file mode 100644
index 75be1ca..0000000
--- a/include/db.h
+++ /dev/null
@@ -1,140 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_
-#define STORAGE_LEVELDB_INCLUDE_DB_H_
-
-#include <stdint.h>
-#include <stdio.h>
-#include "include/iterator.h"
-#include "include/options.h"
-
-namespace leveldb {
-
-static const int kMajorVersion = 1;
-static const int kMinorVersion = 0;
-
-struct Options;
-struct ReadOptions;
-struct WriteOptions;
-
-class Snapshot;
-class WriteBatch;
-
-// Some internal types.  Clients should ignore.
-class WriteBatchInternal;
-
-struct Range {
-  Slice start;
-  Slice limit;
-
-  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
-};
-
-// A DB is a persistent ordered map from keys to values.
-class DB {
- public:
-  // Open the database with the specified "name".
-  // Stores a pointer to a heap-allocated database in *dbptr and returns
-  // OK on success.
-  // Stores NULL in *dbptr and returns a non-OK status on error.
-  // Caller should delete *dbptr when it is no longer needed.
-  static Status Open(const Options& options,
-                     const std::string& name,
-                     DB** dbptr);
-
-  DB() { }
-  virtual ~DB();
-
-  // Set the database entry for "key" to "value".  Returns OK on success,
-  // and a non-OK status on error.
-  // Note: consider setting options.sync = false.
-  virtual Status Put(const WriteOptions& options,
-                     const Slice& key,
-                     const Slice& value) = 0;
-
-  // Remove the database entry (if any) for "key".  Returns OK on
-  // success, and a non-OK status on error.  It is not an error if "key"
-  // did not exist in the database.
-  // Note: consider setting options.sync = false.
-  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
-
-  // Apply the specified updates to the database.
-  // Returns OK on success, non-OK on failure.
-  // Note: consider setting options.sync = false.
-  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
-
-  // If the database contains an entry for "key" store the
-  // corresponding value in *value and return OK.
-  //
-  // If there is no entry for "key" leave *value unchanged and return
-  // a status for which Status::IsNotFound() returns true.
-  //
-  // May return some other Status on an error.
-  virtual Status Get(const ReadOptions& options,
-                     const Slice& key, std::string* value) = 0;
-
-  // Return a heap-allocated iterator over the contents of the database.
-  // The result of NewIterator() is initially invalid (caller must
-  // call one of the Seek methods on the iterator before using it).
-  //
-  // Caller should delete the iterator when it is no longer needed.
-  // The returned iterator should be deleted before this db is deleted.
-  virtual Iterator* NewIterator(const ReadOptions& options) = 0;
-
-  // Return a handle to the current DB state.  Iterators created with
-  // this handle will all observe a stable snapshot of the current DB
-  // state.  The caller must call ReleaseSnapshot(result) when the
-  // snapshot is no longer needed.
-  virtual const Snapshot* GetSnapshot() = 0;
-
-  // Release a previously acquired snapshot.  The caller must not
-  // use "snapshot" after this call.
-  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
-
-  // DB implementations can export properties about their state
-  // via this method.  If "property" is a valid property understood by this
-  // DB implementation, fills "*value" with its current value and returns
-  // true.  Otherwise returns false.
-  //
-  //
-  // Valid property names include:
-  //
-  //  "leveldb.num-files-at-level" - return the number of files at level ,
-  //     where  is an ASCII representation of a level number (e.g. "0").
-  virtual bool GetProperty(const Slice& property, uint64_t* value) = 0;
-
-  // For each i in [0,n-1], store in "sizes[i]", the approximate
-  // file system space used by keys in "[range[i].start .. range[i].limit)".
-  //
-  // Note that the returned sizes measure file system space usage, so
-  // if the user data compresses by a factor of ten, the returned
-  // sizes will be one-tenth the size of the corresponding user data size.
-  //
-  // The results may not include the sizes of recently written data.
-  virtual void GetApproximateSizes(const Range* range, int n,
-                                   uint64_t* sizes) = 0;
-
-  // Possible extensions:
-  // (1) Add a method to compact a range of keys
-
- private:
-  // No copying allowed
-  DB(const DB&);
-  void operator=(const DB&);
-};
-
-// Destroy the contents of the specified database.
-// Be very careful using this method.
-Status DestroyDB(const std::string& name, const Options& options);
-
-// If a DB cannot be opened, you may attempt to call this method to
-// resurrect as much of the contents of the database as possible.
-// Some data may be lost, so be careful when calling this function
-// on a database that contains important information.
-Status RepairDB(const std::string& dbname, const Options& options);
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_DB_H_
diff --git a/include/env.h b/include/env.h
deleted file mode 100644
index 14ddf29..0000000
--- a/include/env.h
+++ /dev/null
@@ -1,290 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// An Env is an interface used by the leveldb implementation to access
-// operating system functionality like the filesystem etc.  Callers
-// may wish to provide a custom Env object when opening a database to
-// get fine-grained control; e.g., to rate limit file system operations.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
-#define STORAGE_LEVELDB_INCLUDE_ENV_H_
-
-#include <cstdarg>
-#include <string>
-#include <vector>
-#include <stdint.h>
-#include "include/status.h"
-
-namespace leveldb {
-
-class FileLock;
-class RandomAccessFile;
-class SequentialFile;
-class Slice;
-class WritableFile;
-
-class Env {
- public:
-  Env() { }
-  virtual ~Env();
-
-  // Return a default environment suitable for the current operating
-  // system.  Sophisticated users may wish to provide their own Env
-  // implementation instead of relying on this default environment.
-  //
-  // The result of Default() belongs to leveldb and must never be deleted.
-  static Env* Default();
-
-  // Create a brand new sequentially-readable file with the specified name.
-  // On success, stores a pointer to the new file in *result and returns OK.
-  // On failure stores NULL in *result and returns non-OK.  If the file does
-  // not exist, returns a non-OK status.
-  //
-  // The returned file will only be accessed by one thread at a time.
-  virtual Status NewSequentialFile(const std::string& fname,
-                                   SequentialFile** result) = 0;
-
-  // Create a brand new random access read-only file with the
-  // specified name.  On success, stores a pointer to the new file in
-  // *result and returns OK.  On failure stores NULL in *result and
-  // returns non-OK.  If the file does not exist, returns a non-OK
-  // status.
-  //
-  // The returned file may be concurrently accessed by multiple threads.
-  virtual Status NewRandomAccessFile(const std::string& fname,
-                                     RandomAccessFile** result) = 0;
-
-  // Create an object that writes to a new file with the specified
-  // name.  Deletes any existing file with the same name and creates a
-  // new file.  On success, stores a pointer to the new file in
-  // *result and returns OK.  On failure stores NULL in *result and
-  // returns non-OK.
-  //
-  // The returned file will only be accessed by one thread at a time.
-  virtual Status NewWritableFile(const std::string& fname,
-                                 WritableFile** result) = 0;
-
-  // Returns true iff the named file exists.
-  virtual bool FileExists(const std::string& fname) = 0;
-
-  // Store in *result the names of the children of the specified directory.
-  // The names are relative to "dir".
-  // Original contents of *result are dropped.
-  virtual Status GetChildren(const std::string& dir,
-                             std::vector<std::string>* result) = 0;
-
-  // Delete the named file.
-  virtual Status DeleteFile(const std::string& fname) = 0;
-
-  // Create the specified directory.
-  virtual Status CreateDir(const std::string& dirname) = 0;
-
-  // Delete the specified directory.
-  virtual Status DeleteDir(const std::string& dirname) = 0;
-
-  // Store the size of fname in *file_size.
-  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
-
-  // Rename file src to target.
-  virtual Status RenameFile(const std::string& src,
-                            const std::string& target) = 0;
-
-  // Lock the specified file.  Used to prevent concurrent access to
-  // the same db by multiple processes.  On failure, stores NULL in
-  // *lock and returns non-OK.
-  //
-  // On success, stores a pointer to the object that represents the
-  // acquired lock in *lock and returns OK.  The caller should call
-  // UnlockFile(*lock) to release the lock.  If the process exits,
-  // the lock will be automatically released.
-  //
-  // If somebody else already holds the lock, finishes immediately
-  // with a failure.  I.e., this call does not wait for existing locks
-  // to go away.
-  //
-  // May create the named file if it does not already exist.
-  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
-
-  // Release the lock acquired by a previous successful call to LockFile.
-  // REQUIRES: lock was returned by a successful LockFile() call
-  // REQUIRES: lock has not already been unlocked.
-  virtual Status UnlockFile(FileLock* lock) = 0;
-
-  // Arrange to run "(*function)(arg)" once in a background thread.
-  //
-  // "function" may run in an unspecified thread.  Multiple functions
-  // added to the same Env may run concurrently in different threads.
-  // I.e., the caller may not assume that background work items are
-  // serialized.
-  virtual void Schedule(
-      void (*function)(void* arg),
-      void* arg) = 0;
-
-  // Start a new thread, invoking "function(arg)" within the new thread.
-  // When "function(arg)" returns, the thread will be destroyed.
-  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
-
-  // *path is set to a temporary directory that can be used for testing. It may
-  // or may not have just been created. The directory may or may not differ
-  // between runs of the same process, but subsequent calls will return the
-  // same directory.
-  virtual Status GetTestDirectory(std::string* path) = 0;
-
-  // Write an entry to the log file with the specified format.
-  virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0;
-
-  // Returns the number of micro-seconds since some fixed point in time. Only
-  // useful for computing deltas of time.
-  virtual uint64_t NowMicros() = 0;
-
-  // Sleep/delay the thread for the prescribed number of micro-seconds.
-  virtual void SleepForMicroseconds(int micros) = 0;
-
- private:
-  // No copying allowed
-  Env(const Env&);
-  void operator=(const Env&);
-};
-
-// A file abstraction for reading sequentially through a file
-class SequentialFile {
- public:
-  SequentialFile() { }
-  virtual ~SequentialFile();
-
-  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
-  // written by this routine.  Sets "*result" to the data that was
-  // read (including if fewer than "n" bytes were successfully read).
-  // If an error was encountered, returns a non-OK status.
-  //
-  // REQUIRES: External synchronization
-  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
-};
-
-// A file abstraction for randomly reading the contents of a file.
-class RandomAccessFile {
- public:
-  RandomAccessFile() { }
-  virtual ~RandomAccessFile();
-
-  // Read up to "n" bytes from the file starting at "offset".
-  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
-  // to the data that was read (including if fewer than "n" bytes were
-  // successfully read).  If an error was encountered, returns a
-  // non-OK status.
-  //
-  // Safe for concurrent use by multiple threads.
-  virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const = 0;
-};
-
-// A file abstraction for sequential writing.  The implementation
-// must provide buffering since callers may append small fragments
-// at a time to the file.
-class WritableFile {
- public:
-  WritableFile() { }
-  virtual ~WritableFile();
-
-  virtual Status Append(const Slice& data) = 0;
-  virtual Status Close() = 0;
-  virtual Status Flush() = 0;
-  virtual Status Sync() = 0;
-
- private:
-  // No copying allowed
-  WritableFile(const WritableFile&);
-  void operator=(const WritableFile&);
-};
-
-// Identifies a locked file.
-class FileLock {
- public:
-  FileLock() { }
-  virtual ~FileLock();
- private:
-  // No copying allowed
-  FileLock(const FileLock&);
-  void operator=(const FileLock&);
-};
-
-// Log the specified data to *info_log if info_log is non-NULL.
-extern void Log(Env* env, WritableFile* info_log, const char* format, ...)
-#   if defined(__GNUC__) || defined(__clang__)
-    __attribute__((__format__ (__printf__, 3, 4)))
-#   endif
-    ;
-
-// A utility routine: write "data" to the named file.
-extern Status WriteStringToFile(Env* env, const Slice& data,
-                                const std::string& fname);
-
-// A utility routine: read contents of named file into *data
-extern Status ReadFileToString(Env* env, const std::string& fname,
-                               std::string* data);
-
-// An implementation of Env that forwards all calls to another Env.
-// May be useful to clients who wish to override just part of the
-// functionality of another Env.
-class EnvWrapper : public Env {
- public:
-  // Initialize an EnvWrapper that delegates all calls to *target
-  explicit EnvWrapper(Env* target) : target_(target) { }
-  virtual ~EnvWrapper();
-
-  // Return the target to which this Env forwards all calls
-  Env* target() const { return target_; }
-
-  // The following text is boilerplate that forwards all methods to target()
-  Status NewSequentialFile(const std::string& f, SequentialFile** r) {
-    return target_->NewSequentialFile(f, r);
-  }
-  Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
-    return target_->NewRandomAccessFile(f, r);
-  }
-  Status NewWritableFile(const std::string& f, WritableFile** r) {
-    return target_->NewWritableFile(f, r);
-  }
-  bool FileExists(const std::string& f) { return target_->FileExists(f); }
-  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
-    return target_->GetChildren(dir, r);
-  }
-  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
-  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
-  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
-  Status GetFileSize(const std::string& f, uint64_t* s) {
-    return target_->GetFileSize(f, s);
-  }
-  Status RenameFile(const std::string& s, const std::string& t) {
-    return target_->RenameFile(s, t);
-  }
-  Status LockFile(const std::string& f, FileLock** l) {
-    return target_->LockFile(f, l);
-  }
-  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
-  void Schedule(void (*f)(void*), void* a) {
-    return target_->Schedule(f, a);
-  }
-  void StartThread(void (*f)(void*), void* a) {
-    return target_->StartThread(f, a);
-  }
-  virtual Status GetTestDirectory(std::string* path) {
-    return target_->GetTestDirectory(path);
-  }
-  virtual void Logv(WritableFile* log, const char* format, va_list ap) {
-    return target_->Logv(log, format, ap);
-  }
-  uint64_t NowMicros() {
-    return target_->NowMicros();
-  }
-  void SleepForMicroseconds(int micros) {
-    target_->SleepForMicroseconds(micros);
-  }
- private:
-  Env* target_;
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_ENV_H_
diff --git a/include/iterator.h b/include/iterator.h
deleted file mode 100644
index b0872a3..0000000
--- a/include/iterator.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// An iterator yields a sequence of key/value pairs from a source.
-// The following class defines the interface.  Multiple implementations
-// are provided by this library.  In particular, iterators are provided
-// to access the contents of a Table or a DB.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
-#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
-
-#include "include/slice.h"
-#include "include/status.h"
-
-namespace leveldb {
-
-class Iterator {
- public:
-  Iterator();
-  virtual ~Iterator();
-
-  // An iterator is either positioned at a key/value pair, or
-  // not valid.  This method returns true iff the iterator is valid.
-  virtual bool Valid() const = 0;
-
-  // Position at the first key in the source.  The iterator is Valid()
-  // after this call iff the source is not empty.
-  virtual void SeekToFirst() = 0;
-
-  // Position at the last key in the source.  The iterator is
-  // Valid() after this call iff the source is not empty.
-  virtual void SeekToLast() = 0;
-
-  // Position at the first key in the source that is at or past target.
-  // The iterator is Valid() after this call iff the source contains
-  // an entry that comes at or past target.
-  virtual void Seek(const Slice& target) = 0;
-
-  // Moves to the next entry in the source.  After this call, Valid() is
-  // true iff the iterator was not positioned at the last entry in the source.
-  // REQUIRES: Valid()
-  virtual void Next() = 0;
-
-  // Moves to the previous entry in the source.  After this call, Valid() is
-  // true iff the iterator was not positioned at the first entry in source.
-  // REQUIRES: Valid()
-  virtual void Prev() = 0;
-
-  // Return the key for the current entry.  The underlying storage for
-  // the returned slice is valid only until the next modification of
-  // the iterator.
-  // REQUIRES: Valid()
-  virtual Slice key() const = 0;
-
-  // Return the value for the current entry.  The underlying storage for
-  // the returned slice is valid only until the next modification of
-  // the iterator.
-  // REQUIRES: Valid()
-  virtual Slice value() const = 0;
-
-  // If an error has occurred, return it.  Else return an ok status.
-  virtual Status status() const = 0;
-
-  // Clients are allowed to register function/arg1/arg2 triples that
-  // will be invoked when this iterator is destroyed.
-  //
-  // Note that unlike all of the preceding methods, this method is
-  // not abstract and therefore clients should not override it.
-  typedef void (*CleanupFunction)(void* arg1, void* arg2);
-  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
-
- private:
-  struct Cleanup {
-    CleanupFunction function;
-    void* arg1;
-    void* arg2;
-    Cleanup* next;
-  };
-  Cleanup cleanup_;
-
-  // No copying allowed
-  Iterator(const Iterator&);
-  void operator=(const Iterator&);
-};
-
-// Return an empty iterator (yields nothing).
-extern Iterator* NewEmptyIterator();
-
-// Return an empty iterator with the specified status.
-extern Iterator* NewErrorIterator(const Status& status);
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
diff --git a/include/leveldb/cache.h b/include/leveldb/cache.h
new file mode 100644
index 0000000..79196d1
--- /dev/null
+++ b/include/leveldb/cache.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_
+#define STORAGE_LEVELDB_INCLUDE_CACHE_H_
+
+#include <stdint.h>
+#include "leveldb/slice.h"
+
+namespace leveldb {
+
+class Cache;
+
+// Create a new cache with a fixed size capacity.  This implementation
+// of Cache uses a least-recently-used eviction policy.
+extern Cache* NewLRUCache(size_t capacity);
+
+class Cache {
+ public:
+  Cache() { }
+
+  // Destroys all existing entries by calling the "deleter"
+  // function that was passed to Insert().
+  virtual ~Cache();
+
+  // Opaque handle to an entry stored in the cache.
+  struct Handle { };
+
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity.
+  //
+  // Returns a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) = 0;
+
+  // If the cache has no mapping for "key", returns NULL.
+  //
+  // Else return a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  virtual Handle* Lookup(const Slice& key) = 0;
+
+  // Release a mapping returned by a previous Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void Release(Handle* handle) = 0;
+
+  // Return the value encapsulated in a handle returned by a
+  // successful Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void* Value(Handle* handle) = 0;
+
+  // If the cache contains an entry for key, erase it.  Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+
+  // Return a new numeric id.  May be used by multiple clients who are
+  // sharing the same cache to partition the key space.  Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+ private:
+  void LRU_Remove(Handle* e);
+  void LRU_Append(Handle* e);
+  void Unref(Handle* e);
+
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  Cache(const Cache&);
+  void operator=(const Cache&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_CACHE_H_
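As a usage sketch of the interface above (the key, value, and capacity are illustrative, and the main() scaffolding is not part of the patch): every handle obtained from Insert() or from a successful Lookup() must be balanced by a Release().

    #include <stdio.h>
    #include <string>
    #include "leveldb/cache.h"

    // Deleter passed to Insert(); the cache invokes it once the entry has
    // been evicted and every outstanding handle has been released.
    static void DeleteValue(const leveldb::Slice& key, void* value) {
      delete static_cast<std::string*>(value);
    }

    int main() {
      leveldb::Cache* cache = leveldb::NewLRUCache(100);  // capacity: 100 units
      leveldb::Cache::Handle* h = cache->Insert(
          leveldb::Slice("k"), new std::string("v"), 1, &DeleteValue);
      cache->Release(h);                        // done with the insert handle
      h = cache->Lookup(leveldb::Slice("k"));
      if (h != NULL) {
        printf("%s\n", static_cast<std::string*>(cache->Value(h))->c_str());
        cache->Release(h);                      // balance the successful Lookup()
      }
      delete cache;   // destroys the remaining entries via the deleter
      return 0;
    }
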
diff --git a/include/leveldb/comparator.h b/include/leveldb/comparator.h
new file mode 100644
index 0000000..4e00e4d
--- /dev/null
+++ b/include/leveldb/comparator.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
+#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
+
+#include <string>
+
+namespace leveldb {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database.
+class Comparator {
+ public:
+  virtual ~Comparator();
+
+  // Three-way comparison.  Returns value:
+  //   < 0 iff "a" < "b",
+  //   == 0 iff "a" == "b",
+  //   > 0 iff "a" > "b"
+  virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+  // The name of the comparator.  Used to check for comparator
+  // mismatches (i.e., a DB created with one comparator is
+  // accessed using a different comparator).
+  //
+  // The client of this package should switch to a new name whenever
+  // the comparator implementation changes in a way that will cause
+  // the relative ordering of any two keys to change.
+  //
+  // Names starting with "leveldb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Advanced functions: these are used to reduce the space requirements
+  // for internal data structures like index blocks.
+
+  // If *start < limit, changes *start to a short string in [start,limit).
+  // Simple comparator implementations may return with *start unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const = 0;
+
+  // Changes *key to a short string >= *key.
+  // Simple comparator implementations may return with *key unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortSuccessor(std::string* key) const = 0;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering.  The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
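A minimal custom comparator against this interface might look as follows (the class is hypothetical; note that names starting with "leveldb." are reserved, and the header's comments explicitly permit no-op implementations of the two advanced methods):

    #include <string>
    #include "leveldb/comparator.h"
    #include "leveldb/slice.h"

    // Orders keys in reverse byte-wise order by negating the builtin
    // comparator's result.
    class ReverseComparator : public leveldb::Comparator {
     public:
      virtual int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
        return -leveldb::BytewiseComparator()->Compare(a, b);
      }
      virtual const char* Name() const { return "example.ReverseComparator"; }
      // Doing nothing here is correct; it only forgoes the space savings
      // in internal index blocks.
      virtual void FindShortestSeparator(std::string* start,
                                         const leveldb::Slice& limit) const { }
      virtual void FindShortSuccessor(std::string* key) const { }
    };
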
diff --git a/include/leveldb/db.h b/include/leveldb/db.h
new file mode 100644
index 0000000..74d50d3
--- /dev/null
+++ b/include/leveldb/db.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_
+#define STORAGE_LEVELDB_INCLUDE_DB_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include "leveldb/iterator.h"
+#include "leveldb/options.h"
+
+namespace leveldb {
+
+static const int kMajorVersion = 1;
+static const int kMinorVersion = 0;
+
+struct Options;
+struct ReadOptions;
+struct WriteOptions;
+
+class Snapshot;
+class WriteBatch;
+
+// Some internal types.  Clients should ignore.
+class WriteBatchInternal;
+
+struct Range {
+  Slice start;
+  Slice limit;
+
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
+};
+
+// A DB is a persistent ordered map from keys to values.
+class DB {
+ public:
+  // Open the database with the specified "name".
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores NULL in *dbptr and returns a non-OK status on error.
+  // Caller should delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options,
+                     const std::string& name,
+                     DB** dbptr);
+
+  DB() { }
+  virtual ~DB();
+
+  // Set the database entry for "key" to "value".  Returns OK on success,
+  // and a non-OK status on error.
+  // Note: consider setting options.sync = false.
+  virtual Status Put(const WriteOptions& options,
+                     const Slice& key,
+                     const Slice& value) = 0;
+
+  // Remove the database entry (if any) for "key".  Returns OK on
+  // success, and a non-OK status on error.  It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = false.
+  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
+
+  // Apply the specified updates to the database.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = false.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+  // If the database contains an entry for "key" store the
+  // corresponding value in *value and return OK.
+  //
+  // If there is no entry for "key" leave *value unchanged and return
+  // a status for which Status::IsNotFound() returns true.
+  //
+  // May return some other Status on an error.
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key, std::string* value) = 0;
+
+  // Return a heap-allocated iterator over the contents of the database.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // Caller should delete the iterator when it is no longer needed.
+  // The returned iterator should be deleted before this db is deleted.
+  virtual Iterator* NewIterator(const ReadOptions& options) = 0;
+
+  // Return a handle to the current DB state.  Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state.  The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot.  The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+  // DB implementations can export properties about their state
+  // via this method.  If "property" is a valid property understood by this
+  // DB implementation, fills "*value" with its current value and returns
+  // true.  Otherwise returns false.
+  //
+  //
+  // Valid property names include:
+  //
+  //  "leveldb.num-files-at-level" - return the number of files at level ,
+  //     where  is an ASCII representation of a level number (e.g. "0").
+  virtual bool GetProperty(const Slice& property, uint64_t* value) = 0;
+
+  // For each i in [0,n-1], store in "sizes[i]", the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)".
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  //
+  // The results may not include the sizes of recently written data.
+  virtual void GetApproximateSizes(const Range* range, int n,
+                                   uint64_t* sizes) = 0;
+
+  // Possible extensions:
+  // (1) Add a method to compact a range of keys
+
+ private:
+  // No copying allowed
+  DB(const DB&);
+  void operator=(const DB&);
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options);
+
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+Status RepairDB(const std::string& dbname, const Options& options);
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_DB_H_
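Tying the pieces together, a minimal open/put/get round trip could look like this (the database path and the key/value pair are illustrative):

    #include <assert.h>
    #include <string>
    #include "leveldb/db.h"

    int main() {
      leveldb::Options options;
      options.create_if_missing = true;                  // see leveldb/options.h
      leveldb::DB* db;
      leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
      assert(s.ok());

      s = db->Put(leveldb::WriteOptions(), "name", "leveldb");
      std::string value;
      if (s.ok()) s = db->Get(leveldb::ReadOptions(), "name", &value);
      assert(s.ok() && value == "leveldb");

      delete db;   // closes the database and releases its resources
      return 0;
    }
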
diff --git a/include/leveldb/env.h b/include/leveldb/env.h
new file mode 100644
index 0000000..4b6e712
--- /dev/null
+++ b/include/leveldb/env.h
@@ -0,0 +1,290 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the leveldb implementation to access
+// operating system functionality like the filesystem etc.  Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
+#define STORAGE_LEVELDB_INCLUDE_ENV_H_
+
+#include <cstdarg>
+#include <string>
+#include <vector>
+#include <stdint.h>
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class FileLock;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+
+class Env {
+ public:
+  Env() { }
+  virtual ~Env();
+
+  // Return a default environment suitable for the current operating
+  // system.  Sophisticated users may wish to provide their own Env
+  // implementation instead of relying on this default environment.
+  //
+  // The result of Default() belongs to leveldb and must never be deleted.
+  static Env* Default();
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores NULL in *result and returns non-OK.  If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   SequentialFile** result) = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores NULL in *result and
+  // returns non-OK.  If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     RandomAccessFile** result) = 0;
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores NULL in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result) = 0;
+
+  // Returns true iff the named file exists.
+  virtual bool FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *result are dropped.
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Create the specified directory.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores NULL in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.  If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure.  I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual Status UnlockFile(FileLock* lock) = 0;
+
+  // Arrange to run "(*function)(arg)" once in a background thread.
+  //
+  // "function" may run in an unspecified thread.  Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  virtual void Schedule(
+      void (*function)(void* arg),
+      void* arg) = 0;
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Write an entry to the log file with the specified format.
+  virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0;
+
+  // Returns the number of micro-seconds since some fixed point in time. Only
+  // useful for computing deltas of time.
+  virtual uint64_t NowMicros() = 0;
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+ private:
+  // No copying allowed
+  Env(const Env&);
+  void operator=(const Env&);
+};
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() { }
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() { }
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  If an error was encountered, returns a
+  // non-OK status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+  WritableFile() { }
+  virtual ~WritableFile();
+
+  virtual Status Append(const Slice& data) = 0;
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0;
+
+ private:
+  // No copying allowed
+  WritableFile(const WritableFile&);
+  void operator=(const WritableFile&);
+};
+
+// Identifies a locked file.
+class FileLock {
+ public:
+  FileLock() { }
+  virtual ~FileLock();
+ private:
+  // No copying allowed
+  FileLock(const FileLock&);
+  void operator=(const FileLock&);
+};
+
+// Log the specified data to *info_log if info_log is non-NULL.
+extern void Log(Env* env, WritableFile* info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 3, 4)))
+#   endif
+    ;
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+                                const std::string& fname);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+                               std::string* data);
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+  // Initialize an EnvWrapper that delegates all calls to *target
+  explicit EnvWrapper(Env* target) : target_(target) { }
+  virtual ~EnvWrapper();
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_; }
+
+  // The following text is boilerplate that forwards all methods to target()
+  Status NewSequentialFile(const std::string& f, SequentialFile** r) {
+    return target_->NewSequentialFile(f, r);
+  }
+  Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
+    return target_->NewRandomAccessFile(f, r);
+  }
+  Status NewWritableFile(const std::string& f, WritableFile** r) {
+    return target_->NewWritableFile(f, r);
+  }
+  bool FileExists(const std::string& f) { return target_->FileExists(f); }
+  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+    return target_->GetChildren(dir, r);
+  }
+  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
+  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
+  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
+  Status GetFileSize(const std::string& f, uint64_t* s) {
+    return target_->GetFileSize(f, s);
+  }
+  Status RenameFile(const std::string& s, const std::string& t) {
+    return target_->RenameFile(s, t);
+  }
+  Status LockFile(const std::string& f, FileLock** l) {
+    return target_->LockFile(f, l);
+  }
+  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
+  void Schedule(void (*f)(void*), void* a) {
+    return target_->Schedule(f, a);
+  }
+  void StartThread(void (*f)(void*), void* a) {
+    return target_->StartThread(f, a);
+  }
+  virtual Status GetTestDirectory(std::string* path) {
+    return target_->GetTestDirectory(path);
+  }
+  virtual void Logv(WritableFile* log, const char* format, va_list ap) {
+    return target_->Logv(log, format, ap);
+  }
+  uint64_t NowMicros() {
+    return target_->NowMicros();
+  }
+  void SleepForMicroseconds(int micros) {
+    target_->SleepForMicroseconds(micros);
+  }
+ private:
+  Env* target_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_ENV_H_
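As a sketch of the two utility routines declared above (the file name and payload are illustrative):

    #include <string>
    #include "leveldb/env.h"
    #include "leveldb/slice.h"

    // Writes a payload and reads it back through the default Env.  The Env
    // returned by Default() is owned by leveldb and must never be deleted.
    leveldb::Status RoundTrip(const std::string& fname, std::string* data) {
      leveldb::Env* env = leveldb::Env::Default();
      leveldb::Status s =
          leveldb::WriteStringToFile(env, leveldb::Slice("payload"), fname);
      if (!s.ok()) return s;
      return leveldb::ReadFileToString(env, fname, data);
    }
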
diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h
new file mode 100644
index 0000000..1866fb5
--- /dev/null
+++ b/include/leveldb/iterator.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface.  Multiple implementations
+// are provided by this library.  In particular, iterators are provided
+// to access the contents of a Table or a DB.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
+#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
+
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class Iterator {
+ public:
+  Iterator();
+  virtual ~Iterator();
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid.  This method returns true iff the iterator is valid.
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source.  The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source.  The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice value() const = 0;
+
+  // If an error has occurred, return it.  Else return an ok status.
+  virtual Status status() const = 0;
+
+  // Clients are allowed to register function/arg1/arg2 triples that
+  // will be invoked when this iterator is destroyed.
+  //
+  // Note that unlike all of the preceding methods, this method is
+  // not abstract and therefore clients should not override it.
+  typedef void (*CleanupFunction)(void* arg1, void* arg2);
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+  struct Cleanup {
+    CleanupFunction function;
+    void* arg1;
+    void* arg2;
+    Cleanup* next;
+  };
+  Cleanup cleanup_;
+
+  // No copying allowed
+  Iterator(const Iterator&);
+  void operator=(const Iterator&);
+};
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
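
A short scan loop exercising the Iterator contract above (sketch only; the
db pointer and its NewIterator() call come from leveldb/db.h, outside this
file):

  #include <stdio.h>
  #include <string>
  #include "leveldb/db.h"

  void DumpAll(leveldb::DB* db) {
    leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
      // key()/value() slices die on the next move, so copy them out.
      std::string k = it->key().ToString();
      std::string v = it->value().ToString();
      printf("%s: %s\n", k.c_str(), v.c_str());
    }
    if (!it->status().ok()) {
      fprintf(stderr, "scan failed: %s\n", it->status().ToString().c_str());
    }
    delete it;  // also runs callbacks registered via RegisterCleanup()
  }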
diff --git a/include/leveldb/options.h b/include/leveldb/options.h
new file mode 100644
index 0000000..0b65624
--- /dev/null
+++ b/include/leveldb/options.h
@@ -0,0 +1,203 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
+#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
+
+#include <stddef.h>
+
+namespace leveldb {
+
+class Cache;
+class Comparator;
+class Env;
+class Snapshot;
+class WritableFile;
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs.  Each block may be compressed before
+// being stored in a file.  The following enum describes which
+// compression method (if any) is used to compress a block.
+enum CompressionType {
+  // NOTE: do not change the values of existing entries, as these are
+  // part of the persistent format on disk.
+  kNoCompression     = 0x0,
+  kSnappyCompression = 0x1,
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options {
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator;
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists;
+
+  // If true, the implementation will do aggressive checking of the
+  // data it is processing and will stop early if it detects any
+  // errors.  This may have unforeseen ramifications: for example, a
+  // corruption of one DB entry may cause a large number of entries to
+  // become unreadable or for the entire DB to become unopenable.
+  // Default: false
+  bool paranoid_checks;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc.
+  // Default: Env::Default()
+  Env* env;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-NULL, or to a file stored
+  // in the same directory as the DB contents if info_log is NULL.
+  // Default: NULL
+  WritableFile* info_log;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory before converting to an
+  // on-disk file.
+  //
+  // Some DB operations may encounter a delay proportional to the size
+  // of this parameter.  Therefore we recommend against increasing
+  // this parameter unless you are willing to live with an occasional
+  // slow operation in exchange for faster bulk loading throughput.
+  //
+  // Default: 1MB
+  size_t write_buffer_size;
+
+  // Number of open files that can be used by the DB.  You may need to
+  // increase this if your database has a large working set (budget
+  // one open file per 2MB of working set).
+  //
+  // Default: 1000
+  int max_open_files;
+
+  // Handle values larger than "large_value_threshold" bytes
+  // specially, by writing them into their own files (to avoid
+  // compaction overhead) and doing content-based elimination of
+  // duplicate values to save space.
+  //
+  // We recommend against changing this value.
+  //
+  // Default: 64K
+  size_t large_value_threshold;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // Use the specified cache for blocks (if non-NULL).
+  // Default: NULL
+  Cache* block_cache;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  //
+  // Default: 8K
+  int block_size;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.
+  //
+  // Default: 16
+  int block_restart_interval;
+
+  // Compress blocks using the specified compression algorithm.  This
+  // parameter can be changed dynamically.
+  //
+  // Default: kSnappyCompression, which gives lightweight but fast
+  // compression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression.  Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  CompressionType compression;
+
+  // Create an Options object with default values for all fields.
+  Options();
+};
+
+// Options that control read operations
+struct ReadOptions {
+  // If true, all data read from underlying storage will be
+  // verified against corresponding checksums.
+  // Default: false
+  bool verify_checksums;
+
+  // Should the data read for this iteration be cached in memory?
+  // Callers may wish to set this field to false for bulk scans.
+  // Default: true
+  bool fill_cache;
+
+  // If "snapshot" is non-NULL, read as of the supplied snapshot
+  // (which must belong to the DB that is being read and which must
+  // not have been released).  If "snapshot" is NULL, use an implicit
+  // snapshot of the state at the beginning of this read operation.
+  // Default: NULL
+  const Snapshot* snapshot;
+
+  ReadOptions()
+      : verify_checksums(false),
+        fill_cache(true),
+        snapshot(NULL) {
+  }
+};
+
+// Options that control write operations
+struct WriteOptions {
+  // If true, the write will be flushed from the operating system
+  // buffer cache (by calling WritableFile::Sync()) before the write
+  // is considered complete.  If this flag is true, writes will be
+  // slower.
+  //
+  // If this flag is false, and the machine crashes, some recent
+  // writes may be lost.  Note that if it is just the process that
+  // crashes (i.e., the machine does not reboot), no writes will be
+  // lost even if sync==false.
+  //
+  // Default: true
+  bool sync;
+
+  // If "post_write_snapshot" is non-NULL, and the write succeeds,
+  // *post_write_snapshot will be modified to point to a snapshot of
+  // the DB state immediately after this write.  The caller must call
+  // DB::ReleaseSnapshot(*post_write_snapshot) when the
+  // snapshot is no longer needed.
+  //
+  // If "post_write_snapshot" is non-NULL, and the write fails,
+  // *post_write_snapshot will be set to NULL.
+  //
+  // Default: NULL
+  const Snapshot** post_write_snapshot;
+
+  WriteOptions()
+      : sync(true),
+        post_write_snapshot(NULL) {
+  }
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
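
Putting the Options and WriteOptions structs above to use (sketch only;
DB::Open and Put come from leveldb/db.h, and the tuning values are
illustrative):

  #include <string>
  #include "leveldb/db.h"

  leveldb::Status OpenAndPut(const std::string& name) {
    leveldb::Options options;             // Options() fills in the defaults
    options.create_if_missing = true;
    options.write_buffer_size = 8 << 20;  // slower ops for faster bulk loads

    leveldb::DB* db;
    leveldb::Status s = leveldb::DB::Open(options, name, &db);
    if (!s.ok()) return s;

    leveldb::WriteOptions write_options;
    write_options.sync = false;  // faster; recent writes may vanish on crash
    s = db->Put(write_options, "key", "value");
    delete db;
    return s;
  }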
diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h
new file mode 100644
index 0000000..62cb894
--- /dev/null
+++ b/include/leveldb/slice.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size.  The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
+#define STORAGE_LEVELDB_INCLUDE_SLICE_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <string>
+
+namespace leveldb {
+
+class Slice {
+ public:
+  // Create an empty slice.
+  Slice() : data_(""), size_(0) { }
+
+  // Create a slice that refers to data[0,n-1].
+  Slice(const char* data, size_t n) : data_(data), size_(n) { }
+
+  // Create a slice that refers to the contents of "s"
+  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
+
+  // Create a slice that refers to s[0,strlen(s)-1]
+  Slice(const char* s) : data_(s), size_(strlen(s)) { }
+
+  // Return a pointer to the beginning of the referenced data
+  const char* data() const { return data_; }
+
+  // Return the length (in bytes) of the referenced data
+  size_t size() const { return size_; }
+
+  // Return true iff the length of the referenced data is zero
+  bool empty() const { return size_ == 0; }
+
+  // Return the ith byte in the referenced data.
+  // REQUIRES: n < size()
+  char operator[](size_t n) const {
+    assert(n < size());
+    return data_[n];
+  }
+
+  // Change this slice to refer to an empty array
+  void clear() { data_ = ""; size_ = 0; }
+
+  // Drop the first "n" bytes from this slice.
+  void remove_prefix(size_t n) {
+    assert(n <= size());
+    data_ += n;
+    size_ -= n;
+  }
+
+  // Return a string that contains the copy of the referenced data.
+  std::string ToString() const { return std::string(data_, size_); }
+
+  // Three-way comparison.  Returns value:
+  //   <  0 iff "*this" <  "b",
+  //   == 0 iff "*this" == "b",
+  //   >  0 iff "*this" >  "b"
+  int compare(const Slice& b) const;
+
+  // Return true iff "x" is a prefix of "*this"
+  bool starts_with(const Slice& x) const {
+    return ((size_ >= x.size_) &&
+            (memcmp(data_, x.data_, x.size_) == 0));
+  }
+
+ private:
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+inline bool operator==(const Slice& x, const Slice& y) {
+  return ((x.size() == y.size()) &&
+          (memcmp(x.data(), y.data(), x.size()) == 0));
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) {
+  return !(x == y);
+}
+
+inline int Slice::compare(const Slice& b) const {
+  const int min_len = (size_ < b.size_) ? size_ : b.size_;
+  int r = memcmp(data_, b.data_, min_len);
+  if (r == 0) {
+    if (size_ < b.size_) r = -1;
+    else if (size_ > b.size_) r = +1;
+  }
+  return r;
+}
+
+}
+
+
+#endif  // STORAGE_LEVELDB_INCLUDE_SLICE_H_
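
The crucial Slice invariant is non-ownership: the bytes it points at must
outlive it. A small sketch of safe use:

  #include <assert.h>
  #include <string>
  #include "leveldb/slice.h"

  void SliceDemo() {
    std::string backing = "hello world";
    leveldb::Slice s(backing);            // borrows backing's storage
    assert(s.starts_with("hello"));
    s.remove_prefix(6);                   // now refers to "world"
    assert(s == leveldb::Slice("world"));
    assert(s.compare("zebra") < 0);       // memcmp order, then by length
    // Unsafe: Slice(std::string("temp")) would dangle once the
    // temporary string is destroyed at the end of the statement.
  }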
diff --git a/include/leveldb/status.h b/include/leveldb/status.h
new file mode 100644
index 0000000..47e3edf
--- /dev/null
+++ b/include/leveldb/status.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation.  It may indicate success,
+// or it may indicate an error with an associated error message.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
+#define STORAGE_LEVELDB_INCLUDE_STATUS_H_
+
+#include <string>
+#include <utility>
+#include "leveldb/slice.h"
+
+namespace leveldb {
+
+class Status {
+ public:
+  // Create a success status.
+  Status() : state_(NULL) { }
+  ~Status() { delete state_; }
+
+  // Copy the specified status.
+  Status(const Status& s);
+  void operator=(const Status& s);
+
+  // Return a success status.
+  static Status OK() { return Status(); }
+
+  // Return error status of an appropriate type.
+  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotFound, msg, msg2);
+  }
+  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kCorruption, msg, msg2);
+  }
+  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotSupported, msg, msg2);
+  }
+  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kInvalidArgument, msg, msg2);
+  }
+  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, msg, msg2);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const { return (state_ == NULL); }
+
+  // Returns true iff the status indicates a NotFound error.
+  bool IsNotFound() const { return code() == kNotFound; }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  std::string ToString() const;
+
+ private:
+  enum Code {
+    kOk = 0,
+    kNotFound = 1,
+    kCorruption = 2,
+    kNotSupported = 3,
+    kInvalidArgument = 4,
+    kIOError = 5,
+  };
+  Code code() const { return (state_ == NULL) ? kOk : state_->first; }
+
+  Status(Code code, const Slice& msg, const Slice& msg2);
+
+  typedef std::pair<Code, std::string> State;
+  State* state_;
+};
+
+inline Status::Status(const Status& s) {
+  state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
+}
+inline void Status::operator=(const Status& s) {
+  if (this != &s) {
+    delete state_;
+    state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
+  }
+}
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_STATUS_H_
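
The intended calling convention for Status: return it by value, branch on
ok(), and use ToString() for diagnostics. Sketch (the exact ToString()
rendering is illustrative):

  #include <stdio.h>
  #include "leveldb/status.h"

  leveldb::Status CheckedDivide(int a, int b, int* result) {
    if (b == 0) {
      return leveldb::Status::InvalidArgument("divide by zero");
    }
    *result = a / b;
    return leveldb::Status::OK();
  }

  void StatusDemo() {
    int r;
    leveldb::Status s = CheckedDivide(10, 0, &r);
    if (!s.ok()) {
      fprintf(stderr, "%s\n", s.ToString().c_str());
    }
  }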
diff --git a/include/leveldb/table.h b/include/leveldb/table.h
new file mode 100644
index 0000000..bd99176
--- /dev/null
+++ b/include/leveldb/table.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_
+#define STORAGE_LEVELDB_INCLUDE_TABLE_H_
+
+#include <stdint.h>
+#include "leveldb/iterator.h"
+
+namespace leveldb {
+
+class Block;
+class BlockHandle;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+
+// A Table is a sorted map from strings to strings.  Tables are
+// immutable and persistent.
+class Table {
+ public:
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table" to the newly opened
+  // table.  The client should delete "*table" when no longer needed.
+  // If there was an error while initializing the table, sets "*table"
+  // to NULL and returns a non-ok status.  Does not take ownership of
+  // "*file", but the client must ensure that "file" remains live
+  // for the duration of the returned table's lifetime.
+  //
+  // *file must remain live while this Table is in use.
+  static Status Open(const Options& options,
+                     RandomAccessFile* file,
+                     uint64_t file_size,
+                     Table** table);
+
+  ~Table();
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  Iterator* NewIterator(const ReadOptions&) const;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file).  The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  uint64_t ApproximateOffsetOf(const Slice& key) const;
+
+ private:
+  struct Rep;
+  Rep* rep_;
+
+  explicit Table(Rep* rep) { rep_ = rep; }
+  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);
+
+  // No copying allowed
+  Table(const Table&);
+  void operator=(const Table&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_TABLE_H_
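
Opening a Table directly, outside any DB (sketch only; Env::Default() and
NewRandomAccessFile() come from leveldb/env.h, and the file must stay open
until the Table is deleted):

  #include <string>
  #include "leveldb/env.h"
  #include "leveldb/options.h"
  #include "leveldb/table.h"

  leveldb::Status ScanTable(const std::string& fname) {
    leveldb::Env* env = leveldb::Env::Default();
    uint64_t size;
    leveldb::Status s = env->GetFileSize(fname, &size);
    if (!s.ok()) return s;

    leveldb::RandomAccessFile* file;
    s = env->NewRandomAccessFile(fname, &file);
    if (!s.ok()) return s;

    leveldb::Table* table = NULL;
    s = leveldb::Table::Open(leveldb::Options(), file, size, &table);
    if (s.ok()) {
      leveldb::Iterator* it = table->NewIterator(leveldb::ReadOptions());
      for (it->SeekToFirst(); it->Valid(); it->Next()) {
        // inspect it->key() / it->value() here
      }
      s = it->status();
      delete it;
      delete table;
    }
    delete file;  // only after the table is gone
    return s;
  }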
diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h
new file mode 100644
index 0000000..49d2d51
--- /dev/null
+++ b/include/leveldb/table_builder.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// TableBuilder provides the interface used to build a Table
+// (an immutable and sorted map from keys to values).
+
+#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
+#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
+
+#include <stdint.h>
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+
+class TableBuilder {
+ public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish().
+  TableBuilder(const Options& options, WritableFile* file);
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~TableBuilder();
+
+  // Change the options used by this builder.  Note: only some of the
+  // option fields can be changed after construction.  If a field is
+  // not allowed to change dynamically and its value in the structure
+  // passed to the constructor is different from its value in the
+  // structure passed to this method, this method will return an error
+  // without changing any fields.
+  Status ChangeOptions(const Options& options);
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value);
+
+  // Advanced operation: flush any buffered key/value pairs to file.
+  // Can be used to ensure that two adjacent entries never live in
+  // the same data block.  Most clients should not need to use this method.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Flush();
+
+  // Return non-ok iff some error has been detected.
+  Status status() const;
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish();
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon();
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const;
+
+ private:
+  bool ok() const { return status().ok(); }
+  void WriteBlock(BlockBuilder* block, BlockHandle* handle);
+
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  TableBuilder(const TableBuilder&);
+  void operator=(const TableBuilder&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
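
A sketch of the TableBuilder lifecycle above: add keys in strictly
increasing order, Finish(), then close the file yourself (the WritableFile
and Env calls come from leveldb/env.h):

  #include <string>
  #include "leveldb/env.h"
  #include "leveldb/table_builder.h"

  leveldb::Status WriteSmallTable(const std::string& fname) {
    leveldb::WritableFile* file;
    leveldb::Status s = leveldb::Env::Default()->NewWritableFile(fname, &file);
    if (!s.ok()) return s;

    leveldb::Options options;
    leveldb::TableBuilder builder(options, file);
    builder.Add("apple", "red");      // keys must ascend per the comparator
    builder.Add("banana", "yellow");
    builder.Add("cherry", "red");
    s = builder.Finish();             // flushes data, index, and footer

    if (s.ok()) s = file->Sync();
    if (s.ok()) s = file->Close();    // the builder never closes the file
    delete file;
    return s;
  }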
diff --git a/include/leveldb/write_batch.h b/include/leveldb/write_batch.h
new file mode 100644
index 0000000..3411952
--- /dev/null
+++ b/include/leveldb/write_batch.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch.  For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+//    batch.Put("key", "v1");
+//    batch.Delete("key");
+//    batch.Put("key", "v2");
+//    batch.Put("key", "v3");
+
+#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
+#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
+
+#include <string>
+
+namespace leveldb {
+
+class Slice;
+
+class WriteBatch {
+ public:
+  WriteBatch();
+  ~WriteBatch();
+
+  // Store the mapping "key->value" in the database.
+  void Put(const Slice& key, const Slice& value);
+
+  // If the database contains a mapping for "key", erase it.  Else do nothing.
+  void Delete(const Slice& key);
+
+  // Clear all updates buffered in this batch.
+  void Clear();
+
+ private:
+  friend class WriteBatchInternal;
+
+  std::string rep_;  // See comment in write_batch.cc for the format of rep_
+
+  // Intentionally copyable
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
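
Since the operations in a batch apply in insertion order and the whole batch
commits atomically, a read-modify-move is the canonical use. Sketch (Get and
Write come from leveldb/db.h; db is an open handle):

  #include <string>
  #include "leveldb/db.h"
  #include "leveldb/write_batch.h"

  leveldb::Status MoveValue(leveldb::DB* db,
                            const leveldb::Slice& from,
                            const leveldb::Slice& to) {
    std::string value;
    leveldb::Status s = db->Get(leveldb::ReadOptions(), from, &value);
    if (!s.ok()) return s;  // includes the IsNotFound() case

    leveldb::WriteBatch batch;
    batch.Delete(from);
    batch.Put(to, value);
    return db->Write(leveldb::WriteOptions(), &batch);  // all or nothing
  }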
diff --git a/include/options.h b/include/options.h
deleted file mode 100644
index 0b65624..0000000
--- a/include/options.h
+++ /dev/null
@@ -1,203 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
-#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
-
-#include <stddef.h>
-
-namespace leveldb {
-
-class Cache;
-class Comparator;
-class Env;
-class Snapshot;
-class WritableFile;
-
-// DB contents are stored in a set of blocks, each of which holds a
-// sequence of key,value pairs.  Each block may be compressed before
-// being stored in a file.  The following enum describes which
-// compression method (if any) is used to compress a block.
-enum CompressionType {
-  // NOTE: do not change the values of existing entries, as these are
-  // part of the persistent format on disk.
-  kNoCompression     = 0x0,
-  kSnappyCompression = 0x1,
-};
-
-// Options to control the behavior of a database (passed to DB::Open)
-struct Options {
-  // -------------------
-  // Parameters that affect behavior
-
-  // Comparator used to define the order of keys in the table.
-  // Default: a comparator that uses lexicographic byte-wise ordering
-  //
-  // REQUIRES: The client must ensure that the comparator supplied
-  // here has the same name and orders keys *exactly* the same as the
-  // comparator provided to previous open calls on the same DB.
-  const Comparator* comparator;
-
-  // If true, the database will be created if it is missing.
-  // Default: false
-  bool create_if_missing;
-
-  // If true, an error is raised if the database already exists.
-  // Default: false
-  bool error_if_exists;
-
-  // If true, the implementation will do aggressive checking of the
-  // data it is processing and will stop early if it detects any
-  // errors.  This may have unforeseen ramifications: for example, a
-  // corruption of one DB entry may cause a large number of entries to
-  // become unreadable or for the entire DB to become unopenable.
-  // Default: false
-  bool paranoid_checks;
-
-  // Use the specified object to interact with the environment,
-  // e.g. to read/write files, schedule background work, etc.
-  // Default: Env::Default()
-  Env* env;
-
-  // Any internal progress/error information generated by the db will
-  // be written to info_log if it is non-NULL, or to a file stored
-  // in the same directory as the DB contents if info_log is NULL.
-  // Default: NULL
-  WritableFile* info_log;
-
-  // -------------------
-  // Parameters that affect performance
-
-  // Amount of data to build up in memory before converting to an
-  // on-disk file.
-  //
-  // Some DB operations may encounter a delay proportional to the size
-  // of this parameter.  Therefore we recommend against increasing
-  // this parameter unless you are willing to live with an occasional
-  // slow operation in exchange for faster bulk loading throughput.
-  //
-  // Default: 1MB
-  size_t write_buffer_size;
-
-  // Number of open files that can be used by the DB.  You may need to
-  // increase this if your database has a large working set (budget
-  // one open file per 2MB of working set).
-  //
-  // Default: 1000
-  int max_open_files;
-
-  // Handle values larger than "large_value_threshold" bytes
-  // specially, by writing them into their own files (to avoid
-  // compaction overhead) and doing content-based elimination of
-  // duplicate values to save space.
-  //
-  // We recommend against changing this value.
-  //
-  // Default: 64K
-  size_t large_value_threshold;
-
-  // Control over blocks (user data is stored in a set of blocks, and
-  // a block is the unit of reading from disk).
-
-  // Use the specified cache for blocks (if non-NULL).
-  // Default: NULL
-  Cache* block_cache;
-
-  // Approximate size of user data packed per block.  Note that the
-  // block size specified here corresponds to uncompressed data.  The
-  // actual size of the unit read from disk may be smaller if
-  // compression is enabled.  This parameter can be changed dynamically.
-  //
-  // Default: 8K
-  int block_size;
-
-  // Number of keys between restart points for delta encoding of keys.
-  // This parameter can be changed dynamically.  Most clients should
-  // leave this parameter alone.
-  //
-  // Default: 16
-  int block_restart_interval;
-
-  // Compress blocks using the specified compression algorithm.  This
-  // parameter can be changed dynamically.
-  //
-  // Default: kSnappyCompression, which gives lightweight but fast
-  // compression.
-  //
-  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
-  //    ~200-500MB/s compression
-  //    ~400-800MB/s decompression
-  // Note that these speeds are significantly faster than most
-  // persistent storage speeds, and therefore it is typically never
-  // worth switching to kNoCompression.  Even if the input data is
-  // incompressible, the kSnappyCompression implementation will
-  // efficiently detect that and will switch to uncompressed mode.
-  CompressionType compression;
-
-  // Create an Options object with default values for all fields.
-  Options();
-};
-
-// Options that control read operations
-struct ReadOptions {
-  // If true, all data read from underlying storage will be
-  // verified against corresponding checksums.
-  // Default: false
-  bool verify_checksums;
-
-  // Should the data read for this iteration be cached in memory?
-  // Callers may wish to set this field to false for bulk scans.
-  // Default: true
-  bool fill_cache;
-
-  // If "snapshot" is non-NULL, read as of the supplied snapshot
-  // (which must belong to the DB that is being read and which must
-  // not have been released).  If "snapshot" is NULL, use an implicit
-  // snapshot of the state at the beginning of this read operation.
-  // Default: NULL
-  const Snapshot* snapshot;
-
-  ReadOptions()
-      : verify_checksums(false),
-        fill_cache(true),
-        snapshot(NULL) {
-  }
-};
-
-// Options that control write operations
-struct WriteOptions {
-  // If true, the write will be flushed from the operating system
-  // buffer cache (by calling WritableFile::Sync()) before the write
-  // is considered complete.  If this flag is true, writes will be
-  // slower.
-  //
-  // If this flag is false, and the machine crashes, some recent
-  // writes may be lost.  Note that if it is just the process that
-  // crashes (i.e., the machine does not reboot), no writes will be
-  // lost even if sync==false.
-  //
-  // Default: true
-  bool sync;
-
-  // If "post_write_snapshot" is non-NULL, and the write succeeds,
-  // *post_write_snapshot will be modified to point to a snapshot of
-  // the DB state immediately after this write.  The caller must call
-  // DB::ReleaseSnapshot(*post_write_snapshot) when the
-  // snapshot is no longer needed.
-  //
-  // If "post_write_snapshot" is non-NULL, and the write fails,
-  // *post_write_snapshot will be set to NULL.
-  //
-  // Default: NULL
-  const Snapshot** post_write_snapshot;
-
-  WriteOptions()
-      : sync(true),
-        post_write_snapshot(NULL) {
-  }
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
diff --git a/include/slice.h b/include/slice.h
deleted file mode 100644
index 62cb894..0000000
--- a/include/slice.h
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// Slice is a simple structure containing a pointer into some external
-// storage and a size.  The user of a Slice must ensure that the slice
-// is not used after the corresponding external storage has been
-// deallocated.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
-#define STORAGE_LEVELDB_INCLUDE_SLICE_H_
-
-#include <assert.h>
-#include <stddef.h>
-#include <string.h>
-#include <string>
-
-namespace leveldb {
-
-class Slice {
- public:
-  // Create an empty slice.
-  Slice() : data_(""), size_(0) { }
-
-  // Create a slice that refers to data[0,n-1].
-  Slice(const char* data, size_t n) : data_(data), size_(n) { }
-
-  // Create a slice that refers to the contents of "s"
-  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
-
-  // Create a slice that refers to s[0,strlen(s)-1]
-  Slice(const char* s) : data_(s), size_(strlen(s)) { }
-
-  // Return a pointer to the beginning of the referenced data
-  const char* data() const { return data_; }
-
-  // Return the length (in bytes) of the referenced data
-  size_t size() const { return size_; }
-
-  // Return true iff the length of the referenced data is zero
-  bool empty() const { return size_ == 0; }
-
-  // Return the ith byte in the referenced data.
-  // REQUIRES: n < size()
-  char operator[](size_t n) const {
-    assert(n < size());
-    return data_[n];
-  }
-
-  // Change this slice to refer to an empty array
-  void clear() { data_ = ""; size_ = 0; }
-
-  // Drop the first "n" bytes from this slice.
-  void remove_prefix(size_t n) {
-    assert(n <= size());
-    data_ += n;
-    size_ -= n;
-  }
-
-  // Return a string that contains the copy of the referenced data.
-  std::string ToString() const { return std::string(data_, size_); }
-
-  // Three-way comparison.  Returns value:
-  //   <  0 iff "*this" <  "b",
-  //   == 0 iff "*this" == "b",
-  //   >  0 iff "*this" >  "b"
-  int compare(const Slice& b) const;
-
-  // Return true iff "x" is a prefix of "*this"
-  bool starts_with(const Slice& x) const {
-    return ((size_ >= x.size_) &&
-            (memcmp(data_, x.data_, x.size_) == 0));
-  }
-
- private:
-  const char* data_;
-  size_t size_;
-
-  // Intentionally copyable
-};
-
-inline bool operator==(const Slice& x, const Slice& y) {
-  return ((x.size() == y.size()) &&
-          (memcmp(x.data(), y.data(), x.size()) == 0));
-}
-
-inline bool operator!=(const Slice& x, const Slice& y) {
-  return !(x == y);
-}
-
-inline int Slice::compare(const Slice& b) const {
-  const int min_len = (size_ < b.size_) ? size_ : b.size_;
-  int r = memcmp(data_, b.data_, min_len);
-  if (r == 0) {
-    if (size_ < b.size_) r = -1;
-    else if (size_ > b.size_) r = +1;
-  }
-  return r;
-}
-
-}
-
-
-#endif  // STORAGE_LEVELDB_INCLUDE_SLICE_H_
diff --git a/include/status.h b/include/status.h
deleted file mode 100644
index cd148f6..0000000
--- a/include/status.h
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// A Status encapsulates the result of an operation.  It may indicate success,
-// or it may indicate an error with an associated error message.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
-#define STORAGE_LEVELDB_INCLUDE_STATUS_H_
-
-#include <string>
-#include <utility>
-#include "include/slice.h"
-
-namespace leveldb {
-
-class Status {
- public:
-  // Create a success status.
-  Status() : state_(NULL) { }
-  ~Status() { delete state_; }
-
-  // Copy the specified status.
-  Status(const Status& s);
-  void operator=(const Status& s);
-
-  // Return a success status.
-  static Status OK() { return Status(); }
-
-  // Return error status of an appropriate type.
-  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kNotFound, msg, msg2);
-  }
-  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kCorruption, msg, msg2);
-  }
-  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kNotSupported, msg, msg2);
-  }
-  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kInvalidArgument, msg, msg2);
-  }
-  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kIOError, msg, msg2);
-  }
-
-  // Returns true iff the status indicates success.
-  bool ok() const { return (state_ == NULL); }
-
-  // Returns true iff the status indicates a NotFound error.
-  bool IsNotFound() const { return code() == kNotFound; }
-
-  // Return a string representation of this status suitable for printing.
-  // Returns the string "OK" for success.
-  std::string ToString() const;
-
- private:
-  enum Code {
-    kOk = 0,
-    kNotFound = 1,
-    kCorruption = 2,
-    kNotSupported = 3,
-    kInvalidArgument = 4,
-    kIOError = 5,
-  };
-  Code code() const { return (state_ == NULL) ? kOk : state_->first; }
-
-  Status(Code code, const Slice& msg, const Slice& msg2);
-
-  typedef std::pair<Code, std::string> State;
-  State* state_;
-};
-
-inline Status::Status(const Status& s) {
-  state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
-}
-inline void Status::operator=(const Status& s) {
-  if (this != &s) {
-    delete state_;
-    state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
-  }
-}
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_STATUS_H_
diff --git a/include/table.h b/include/table.h
deleted file mode 100644
index c2a4cf9..0000000
--- a/include/table.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_
-#define STORAGE_LEVELDB_INCLUDE_TABLE_H_
-
-#include <stdint.h>
-#include "include/iterator.h"
-
-namespace leveldb {
-
-class Block;
-class BlockHandle;
-struct Options;
-class RandomAccessFile;
-struct ReadOptions;
-
-// A Table is a sorted map from strings to strings.  Tables are
-// immutable and persistent.
-class Table {
- public:
-  // Attempt to open the table that is stored in bytes [0..file_size)
-  // of "file", and read the metadata entries necessary to allow
-  // retrieving data from the table.
-  //
-  // If successful, returns ok and sets "*table" to the newly opened
-  // table.  The client should delete "*table" when no longer needed.
-  // If there was an error while initializing the table, sets "*table"
-  // to NULL and returns a non-ok status.  Does not take ownership of
-  // "*file", but the client must ensure that "file" remains live
-  // for the duration of the returned table's lifetime.
-  //
-  // *file must remain live while this Table is in use.
-  static Status Open(const Options& options,
-                     RandomAccessFile* file,
-                     uint64_t file_size,
-                     Table** table);
-
-  ~Table();
-
-  // Returns a new iterator over the table contents.
-  // The result of NewIterator() is initially invalid (caller must
-  // call one of the Seek methods on the iterator before using it).
-  Iterator* NewIterator(const ReadOptions&) const;
-
-  // Given a key, return an approximate byte offset in the file where
-  // the data for that key begins (or would begin if the key were
-  // present in the file).  The returned value is in terms of file
-  // bytes, and so includes effects like compression of the underlying data.
-  // E.g., the approximate offset of the last key in the table will
-  // be close to the file length.
-  uint64_t ApproximateOffsetOf(const Slice& key) const;
-
- private:
-  struct Rep;
-  Rep* rep_;
-
-  explicit Table(Rep* rep) { rep_ = rep; }
-  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);
-
-  // No copying allowed
-  Table(const Table&);
-  void operator=(const Table&);
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_TABLE_H_
diff --git a/include/table_builder.h b/include/table_builder.h
deleted file mode 100644
index ecd852e..0000000
--- a/include/table_builder.h
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// TableBuilder provides the interface used to build a Table
-// (an immutable and sorted map from keys to values).
-
-#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
-#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
-
-#include <stdint.h>
-#include "include/options.h"
-#include "include/status.h"
-
-namespace leveldb {
-
-class BlockBuilder;
-class BlockHandle;
-class WritableFile;
-
-class TableBuilder {
- public:
-  // Create a builder that will store the contents of the table it is
-  // building in *file.  Does not close the file.  It is up to the
-  // caller to close the file after calling Finish().
-  TableBuilder(const Options& options, WritableFile* file);
-
-  // REQUIRES: Either Finish() or Abandon() has been called.
-  ~TableBuilder();
-
-  // Change the options used by this builder.  Note: only some of the
-  // option fields can be changed after construction.  If a field is
-  // not allowed to change dynamically and its value in the structure
-  // passed to the constructor is different from its value in the
-  // structure passed to this method, this method will return an error
-  // without changing any fields.
-  Status ChangeOptions(const Options& options);
-
-  // Add key,value to the table being constructed.
-  // REQUIRES: key is after any previously added key according to comparator.
-  // REQUIRES: Finish(), Abandon() have not been called
-  void Add(const Slice& key, const Slice& value);
-
-  // Advanced operation: flush any buffered key/value pairs to file.
-  // Can be used to ensure that two adjacent entries never live in
-  // the same data block.  Most clients should not need to use this method.
-  // REQUIRES: Finish(), Abandon() have not been called
-  void Flush();
-
-  // Return non-ok iff some error has been detected.
-  Status status() const;
-
-  // Finish building the table.  Stops using the file passed to the
-  // constructor after this function returns.
-  // REQUIRES: Finish(), Abandon() have not been called
-  Status Finish();
-
-  // Indicate that the contents of this builder should be abandoned.  Stops
-  // using the file passed to the constructor after this function returns.
-  // If the caller is not going to call Finish(), it must call Abandon()
-  // before destroying this builder.
-  // REQUIRES: Finish(), Abandon() have not been called
-  void Abandon();
-
-  // Number of calls to Add() so far.
-  uint64_t NumEntries() const;
-
-  // Size of the file generated so far.  If invoked after a successful
-  // Finish() call, returns the size of the final generated file.
-  uint64_t FileSize() const;
-
- private:
-  bool ok() const { return status().ok(); }
-  void WriteBlock(BlockBuilder* block, BlockHandle* handle);
-
-  struct Rep;
-  Rep* rep_;
-
-  // No copying allowed
-  TableBuilder(const TableBuilder&);
-  void operator=(const TableBuilder&);
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
diff --git a/include/write_batch.h b/include/write_batch.h
deleted file mode 100644
index 3411952..0000000
--- a/include/write_batch.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// WriteBatch holds a collection of updates to apply atomically to a DB.
-//
-// The updates are applied in the order in which they are added
-// to the WriteBatch.  For example, the value of "key" will be "v3"
-// after the following batch is written:
-//
-//    batch.Put("key", "v1");
-//    batch.Delete("key");
-//    batch.Put("key", "v2");
-//    batch.Put("key", "v3");
-
-#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
-#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
-
-#include <string>
-
-namespace leveldb {
-
-class Slice;
-
-class WriteBatch {
- public:
-  WriteBatch();
-  ~WriteBatch();
-
-  // Store the mapping "key->value" in the database.
-  void Put(const Slice& key, const Slice& value);
-
-  // If the database contains a mapping for "key", erase it.  Else do nothing.
-  void Delete(const Slice& key);
-
-  // Clear all updates buffered in this batch.
-  void Clear();
-
- private:
-  friend class WriteBatchInternal;
-
-  std::string rep_;  // See comment in write_batch.cc for the format of rep_
-
-  // Intentionally copyable
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
-- 
cgit v1.2.3


From 95e21f32367748825123e382172ecbfd492ddb23 Mon Sep 17 00:00:00 2001
From: "dgrogan@chromium.org"
Date: Tue, 12 Apr 2011 19:38:58 +0000
Subject: @20602303.  Default file permission is now 755.

git-svn-id: http://leveldb.googlecode.com/svn/trunk@20 62dab493-f737-651d-591e-8d6aee1b9529
---
 AUTHORS                         |   0
 LICENSE                         |   0
 Makefile                        |   0
 README                          |   0
 TODO                            |   8 --
 db/builder.cc                   |   0
 db/builder.h                    |   0
 db/corruption_test.cc           |  21 +++-
 db/db_bench.cc                  | 180 ++++++++++++++++++++-------
 db/db_impl.cc                   | 270 ++++++++++++++++++++++++++++++----------
 db/db_impl.h                    |  35 ++++--
 db/db_iter.cc                   |   0
 db/db_iter.h                    |   0
 db/db_test.cc                   |  27 ++--
 db/dbformat.cc                  |   0
 db/dbformat.h                   |   6 +
 db/dbformat_test.cc             |   0
 db/filename.cc                  |   0
 db/filename.h                   |   0
 db/filename_test.cc             |   0
 db/log_format.h                 |   0
 db/log_reader.cc                |   0
 db/log_reader.h                 |   0
 db/log_test.cc                  |   0
 db/log_writer.cc                |   0
 db/log_writer.h                 |   0
 db/memtable.cc                  |   0
 db/memtable.h                   |   0
 db/repair.cc                    |   0
 db/skiplist.h                   |   0
 db/skiplist_test.cc             |   0
 db/snapshot.h                   |   0
 db/table_cache.cc               |   0
 db/table_cache.h                |   0
 db/version_edit.cc              |  19 +++
 db/version_edit.h               |   6 +
 db/version_edit_test.cc         |   0
 db/version_set.cc               | 116 +++++++++++------
 db/version_set.h                |  35 ++++--
 db/write_batch.cc               |   0
 db/write_batch_internal.h       |   0
 db/write_batch_test.cc          |   0
 doc/doc.css                     |   0
 doc/impl.html                   |   0
 doc/index.html                  |  82 ++++++------
 doc/log_format.txt              |   0
 doc/table_format.txt            |   0
 include/leveldb/cache.h         |   0
 include/leveldb/comparator.h    |   0
 include/leveldb/db.h            |  12 +-
 include/leveldb/env.h           |   0
 include/leveldb/iterator.h      |   0
 include/leveldb/options.h       |  27 ++--
 include/leveldb/slice.h         |   0
 include/leveldb/status.h        |   0
 include/leveldb/table.h         |   0
 include/leveldb/table_builder.h |   0
 include/leveldb/write_batch.h   |   0
 leveldb.gyp                     |   0
 port/README                     |   0
 port/port.h                     |   0
 port/port_android.cc            |   1 -
 port/port_android.h             |  50 ++++++--
 port/port_chromium.cc           |   0
 port/port_chromium.h            |   0
 port/port_example.h             |   0
 port/port_posix.cc              |   0
 port/port_posix.h               |   0
 port/sha1_portable.cc           |   0
 port/sha1_portable.h            |   0
 port/sha1_test.cc               |   0
 port/win/stdint.h               |   0
 table/block.cc                  |   0
 table/block.h                   |   0
 table/block_builder.cc          |   0
 table/block_builder.h           |   0
 table/format.cc                 |   0
 table/format.h                  |   0
 table/iterator.cc               |   0
 table/iterator_wrapper.h        |   0
 table/merger.cc                 |   0
 table/merger.h                  |   0
 table/table.cc                  |   0
 table/table_builder.cc          |   0
 table/table_test.cc             |   4 +-
 table/two_level_iterator.cc     |   0
 table/two_level_iterator.h      |   0
 util/arena.cc                   |   0
 util/arena.h                    |   0
 util/arena_test.cc              |   0
 util/cache.cc                   |   0
 util/cache_test.cc              |   0
 util/coding.cc                  |   0
 util/coding.h                   |   0
 util/coding_test.cc             |   0
 util/comparator.cc              |   0
 util/crc32c.cc                  |   0
 util/crc32c.h                   |   0
 util/crc32c_test.cc             |   0
 util/env.cc                     |   0
 util/env_chromium.cc            |   0
 util/env_posix.cc               |   0
 util/env_test.cc                |   0
 util/hash.cc                    |   0
 util/hash.h                     |   0
 util/histogram.cc               |   0
 util/histogram.h                |   0
 util/logging.cc                 |   0
 util/logging.h                  |   0
 util/mutexlock.h                |   0
 util/options.cc                 |   4 +-
 util/random.h                   |   0
 util/status.cc                  |   0
 util/testharness.cc             |   0
 util/testharness.h              |   0
 util/testutil.cc                |   0
 util/testutil.h                 |   0
 117 files changed, 628 insertions(+), 275 deletions(-)
 mode change 100644 => 100755 AUTHORS
 mode change 100644 => 100755 LICENSE
 mode change 100644 => 100755 Makefile
 mode change 100644 => 100755 README
 mode change 100644 => 100755 TODO
 mode change 100644 => 100755 db/builder.cc
 mode change 100644 => 100755 db/builder.h
 mode change 100644 => 100755 db/corruption_test.cc
 mode change 100644 => 100755 db/db_bench.cc
 mode change 100644 => 100755 db/db_impl.cc
 mode change 100644 => 100755 db/db_impl.h
 mode change 100644 => 100755 db/db_iter.cc
 mode change 100644 => 100755 db/db_iter.h
 mode change 100644 => 100755 db/db_test.cc
 mode change 100644 => 100755 db/dbformat.cc
 mode change 100644 => 100755 db/dbformat.h
 mode change 100644 => 100755 db/dbformat_test.cc
 mode change 100644 => 100755 db/filename.cc
 mode change 100644 => 100755 db/filename.h
 mode change 100644 => 100755 db/filename_test.cc
 mode change 100644 => 100755 db/log_format.h
 mode change 100644 => 100755 db/log_reader.cc
 mode change 100644 => 100755 db/log_reader.h
 mode change 100644 => 100755 db/log_test.cc
 mode change 100644 => 100755 db/log_writer.cc
 mode change 100644 => 100755 db/log_writer.h
 mode change 100644 => 100755 db/memtable.cc
 mode change 100644 => 100755 db/memtable.h
 mode change 100644 => 100755 db/repair.cc
 mode change 100644 => 100755 db/skiplist.h
 mode change 100644 => 100755 db/skiplist_test.cc
 mode change 100644 => 100755 db/snapshot.h
 mode change 100644 => 100755 db/table_cache.cc
 mode change 100644 => 100755 db/table_cache.h
 mode change 100644 => 100755 db/version_edit.cc
 mode change 100644 => 100755 db/version_edit.h
 mode change 100644 => 100755 db/version_edit_test.cc
 mode change 100644 => 100755 db/version_set.cc
 mode change 100644 => 100755 db/version_set.h
 mode change 100644 => 100755 db/write_batch.cc
 mode change 100644 => 100755 db/write_batch_internal.h
 mode change 100644 => 100755 db/write_batch_test.cc
 mode change 100644 => 100755 doc/doc.css
 mode change 100644 => 100755 doc/impl.html
 mode change 100644 => 100755 doc/index.html
 mode change 100644 => 100755 doc/log_format.txt
 mode change 100644 => 100755 doc/table_format.txt
 mode change 100644 => 100755 include/leveldb/cache.h
 mode change 100644 => 100755 include/leveldb/comparator.h
 mode change 100644 => 100755 include/leveldb/db.h
 mode change 100644 => 100755 include/leveldb/env.h
 mode change 100644 => 100755 include/leveldb/iterator.h
 mode change 100644 => 100755 include/leveldb/options.h
 mode change 100644 => 100755 include/leveldb/slice.h
 mode change 100644 => 100755 include/leveldb/status.h
 mode change 100644 => 100755 include/leveldb/table.h
 mode change 100644 => 100755 include/leveldb/table_builder.h
 mode change 100644 => 100755 include/leveldb/write_batch.h
 mode change 100644 => 100755 leveldb.gyp
 mode change 100644 => 100755 port/README
 mode change 100644 => 100755 port/port.h
 mode change 100644 => 100755 port/port_android.cc
 mode change 100644 => 100755 port/port_android.h
 mode change 100644 => 100755 port/port_chromium.cc
 mode change 100644 => 100755 port/port_chromium.h
 mode change 100644 => 100755 port/port_example.h
 mode change 100644 => 100755 port/port_posix.cc
 mode change 100644 => 100755 port/port_posix.h
 mode change 100644 => 100755 port/sha1_portable.cc
 mode change 100644 => 100755 port/sha1_portable.h
 mode change 100644 => 100755 port/sha1_test.cc
 mode change 100644 => 100755 port/win/stdint.h
 mode change 100644 => 100755 table/block.cc
 mode change 100644 => 100755 table/block.h
 mode change 100644 => 100755 table/block_builder.cc
 mode change 100644 => 100755 table/block_builder.h
 mode change 100644 => 100755 table/format.cc
 mode change 100644 => 100755 table/format.h
 mode change 100644 => 100755 table/iterator.cc
 mode change 100644 => 100755 table/iterator_wrapper.h
 mode change 100644 => 100755 table/merger.cc
 mode change 100644 => 100755 table/merger.h
 mode change 100644 => 100755 table/table.cc
 mode change 100644 => 100755 table/table_builder.cc
 mode change 100644 => 100755 table/table_test.cc
 mode change 100644 => 100755 table/two_level_iterator.cc
 mode change 100644 => 100755 table/two_level_iterator.h
 mode change 100644 => 100755 util/arena.cc
 mode change 100644 => 100755 util/arena.h
 mode change 100644 => 100755 util/arena_test.cc
 mode change 100644 => 100755 util/cache.cc
 mode change 100644 => 100755 util/cache_test.cc
 mode change 100644 => 100755 util/coding.cc
 mode change 100644 => 100755 util/coding.h
 mode change 100644 => 100755 util/coding_test.cc
 mode change 100644 => 100755 util/comparator.cc
 mode change 100644 => 100755 util/crc32c.cc
 mode change 100644 => 100755 util/crc32c.h
 mode change 100644 => 100755 util/crc32c_test.cc
 mode change 100644 => 100755 util/env.cc
 mode change 100644 => 100755 util/env_chromium.cc
 mode change 100644 => 100755 util/env_posix.cc
 mode change 100644 => 100755 util/env_test.cc
 mode change 100644 => 100755 util/hash.cc
 mode change 100644 => 100755 util/hash.h
 mode change 100644 => 100755 util/histogram.cc
 mode change 100644 => 100755 util/histogram.h
 mode change 100644 => 100755 util/logging.cc
 mode change 100644 => 100755 util/logging.h
 mode change 100644 => 100755 util/mutexlock.h
 mode change 100644 => 100755 util/options.cc
 mode change 100644 => 100755 util/random.h
 mode change 100644 => 100755 util/status.cc
 mode change 100644 => 100755 util/testharness.cc
 mode change 100644 => 100755 util/testharness.h
 mode change 100644 => 100755 util/testutil.cc
 mode change 100644 => 100755 util/testutil.h

diff --git a/AUTHORS b/AUTHORS
old mode 100644
new mode 100755
diff --git a/LICENSE b/LICENSE
old mode 100644
new mode 100755
diff --git a/Makefile b/Makefile
old mode 100644
new mode 100755
diff --git a/README b/README
old mode 100644
new mode 100755
diff --git a/TODO b/TODO
old mode 100644
new mode 100755
index e17dfdb..2f848b8
--- a/TODO
+++ b/TODO
@@ -1,11 +1,3 @@
-Before adding to chrome
------------------------
-- multi-threaded test/benchmark
-- Allow missing crc32c in Table format?
-
-Maybe afterwards
-----------------
-
 ss
 - Stats
 
diff --git a/db/builder.cc b/db/builder.cc
old mode 100644
new mode 100755
diff --git a/db/builder.h b/db/builder.h
old mode 100644
new mode 100755
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
old mode 100644
new mode 100755
index de9408c..63d8d8b
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -8,6 +8,7 @@
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include "leveldb/cache.h"
 #include "leveldb/env.h"
 #include "leveldb/table.h"
 #include "leveldb/write_batch.h"
@@ -28,10 +29,12 @@ class CorruptionTest {
   test::ErrorEnv env_;
   Random rnd_;
   std::string dbname_;
+  Cache* tiny_cache_;
   Options options_;
   DB* db_;
 
   CorruptionTest() : rnd_(test::RandomSeed()) {
+    tiny_cache_ = NewLRUCache(100);
     options_.env = &env_;
     dbname_ = test::TmpDir() + "/db_test";
     DestroyDB(dbname_, options_);
@@ -45,6 +48,7 @@ class CorruptionTest {
   ~CorruptionTest() {
      delete db_;
      DestroyDB(dbname_, Options());
+     delete tiny_cache_;
   }
 
   Status TryReopen(Options* options = NULL) {
@@ -52,6 +56,7 @@ class CorruptionTest {
     db_ = NULL;
     Options opt = (options ? *options : options_);
     opt.env = &env_;
+    opt.block_cache = tiny_cache_;
     return DB::Open(opt, dbname_, &db_);
   }
 
@@ -160,12 +165,15 @@ class CorruptionTest {
     ASSERT_TRUE(s.ok()) << s.ToString();
   }
 
-  uint64_t Property(const std::string& name) {
-    uint64_t result;
-    if (!db_->GetProperty(name, &result)) {
-      result = ~static_cast<uint64_t>(0);
+  int Property(const std::string& name) {
+    std::string property;
+    int result;
+    if (db_->GetProperty(name, &property) &&
+        sscanf(property.c_str(), "%d", &result) == 1) {
+      return result;
+    } else {
+      return -1;
     }
-    return result;
   }
 
   // Return the ith key
@@ -235,7 +243,7 @@ TEST(CorruptionTest, TableFileIndexData) {
   dbi->TEST_CompactRange(0, "", "~");
   dbi->TEST_CompactRange(1, "", "~");
 
-  Corrupt(kTableFile, -1000, 500);
+  Corrupt(kTableFile, -2000, 500);
   Reopen();
   Check(5000, 9999);
 }
@@ -327,6 +335,7 @@ TEST(CorruptionTest, CompactionInputError) {
 TEST(CorruptionTest, CompactionInputErrorParanoid) {
   Options options;
   options.paranoid_checks = true;
+  options.write_buffer_size = 1048576;
   Reopen(&options);
 
   Build(10);
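
The Property() helper rewritten above reflects the new DB::GetProperty()
shape: properties come back as strings and callers parse out any numbers. A
sketch of the same pattern outside the test (the property name is
illustrative; availability depends on the implementation):

  #include <stdio.h>
  #include <string>
  #include "leveldb/db.h"

  int NumLevel0Files(leveldb::DB* db) {
    std::string value;
    int n;
    if (db->GetProperty("leveldb.num-files-at-level0", &value) &&
        sscanf(value.c_str(), "%d", &n) == 1) {
      return n;
    }
    return -1;  // property missing or not numeric
  }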
diff --git a/db/db_bench.cc b/db/db_bench.cc
old mode 100644
new mode 100755
index 411493c..849ebfa
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -31,11 +31,8 @@
 //      sha1          -- repeated SHA1 computation over 4K of data
 //   Meta operations:
 //      compact     -- Compact the entire DB
+//      stats       -- Print DB stats
 //      heapprofile -- Dump a heap profile (if supported by this port)
-//      sync        -- switch to synchronous writes (not the default)
-//      nosync      -- switch to asynchronous writes (the default)
-//      tenth       -- divide N by 10 (i.e., following benchmarks are smaller)
-//      normal      -- reset N back to its normal value (1000000)
 static const char* FLAGS_benchmarks =
     "fillseq,"
     "fillsync,"
@@ -51,7 +48,9 @@ static const char* FLAGS_benchmarks =
     "readreverse,"
     "fill100K,"
     "crc32c,"
-    "sha1"
+    "sha1,"
+    "snappycomp,"
+    "snappyuncomp,"
     ;
 
 // Number of key/values to place in database
@@ -68,7 +67,12 @@ static double FLAGS_compression_ratio = 0.5;
 static bool FLAGS_histogram = false;
 
 // Number of bytes to buffer in memtable before compacting
-static int FLAGS_write_buffer_size = 1 << 20;
+// (initialized to default value by "main")
+static int FLAGS_write_buffer_size = 0;
+
+// Number of bytes to use as a cache of uncompressed data.
+// Negative means use default settings.
+static int FLAGS_cache_size = -1;
 
 namespace leveldb {
 
@@ -129,6 +133,7 @@ class Benchmark {
   double last_op_finish_;
   int64_t bytes_;
   std::string message_;
+  std::string post_message_;
   Histogram hist_;
   RandomGenerator gen_;
   Random rand_;
@@ -146,7 +151,8 @@ class Benchmark {
            static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
     fprintf(stdout, "Entries:    %d\n", num_);
     fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
-            (((kKeySize + FLAGS_value_size) * num_) / 1048576.0));
+            ((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_)
+             / 1048576.0));
     fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
             (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
              / 1048576.0));
@@ -164,6 +170,15 @@ class Benchmark {
     fprintf(stdout,
             "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
 #endif
+
+    // See if snappy is working by attempting to compress a compressible string
+    const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy";
+    std::string compressed;
+    if (!port::Snappy_Compress(text, sizeof(text), &compressed)) {
+      fprintf(stdout, "WARNING: Snappy compression is not enabled\n");
+    } else if (compressed.size() >= sizeof(text)) {
+      fprintf(stdout, "WARNING: Snappy compression is not effective\n");
+    }
   }
 
   void PrintEnvironment() {
@@ -225,15 +240,13 @@ class Benchmark {
 
     done_++;
     if (done_ >= next_report_) {
-      if (next_report_ < 1000) {
-        next_report_ += 100;
-      } else if (next_report_ < 10000) {
-        next_report_ += 1000;
-      } else if (next_report_ < 100000) {
-        next_report_ += 10000;
-      } else {
-        next_report_ += 100000;
-      }
+      if      (next_report_ < 1000)   next_report_ += 100;
+      else if (next_report_ < 5000)   next_report_ += 500;
+      else if (next_report_ < 10000)  next_report_ += 1000;
+      else if (next_report_ < 50000)  next_report_ += 5000;
+      else if (next_report_ < 100000) next_report_ += 10000;
+      else if (next_report_ < 500000) next_report_ += 50000;
+      else                            next_report_ += 100000;
       fprintf(stderr, "... finished %d ops%30s\r", done_, "");
       fflush(stderr);
     }
@@ -248,7 +261,7 @@ class Benchmark {
 
     if (bytes_ > 0) {
       char rate[100];
-      snprintf(rate, sizeof(rate), "%5.1f MB/s",
+      snprintf(rate, sizeof(rate), "%6.1f MB/s",
                (bytes_ / 1048576.0) / (finish - start_));
       if (!message_.empty()) {
         message_  = std::string(rate) + " " + message_;
@@ -266,6 +279,11 @@ class Benchmark {
       fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
     }
     fflush(stdout);
+
+    if (!post_message_.empty()) {
+      fprintf(stdout, "\n%s\n", post_message_.c_str());
+      post_message_.clear();
+    }
   }
 
  public:
@@ -278,12 +296,13 @@ class Benchmark {
     EXISTING
   };
 
-  Benchmark() : cache_(NewLRUCache(200<<20)),
-                db_(NULL),
-                num_(FLAGS_num),
-                heap_counter_(0),
-                bytes_(0),
-                rand_(301) {
+  Benchmark()
+  : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
+    db_(NULL),
+    num_(FLAGS_num),
+    heap_counter_(0),
+    bytes_(0),
+    rand_(301) {
    std::vector<std::string> files;
     Env::Default()->GetChildren("/tmp/dbbench", &files);
     for (int i = 0; i < files.size(); i++) {
@@ -318,36 +337,54 @@ class Benchmark {
       Start();
 
       WriteOptions write_options;
-      write_options.sync = false;
+      bool known = true;
       if (name == Slice("fillseq")) {
-        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size);
+        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
+      } else if (name == Slice("fillbatch")) {
+        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000);
       } else if (name == Slice("fillrandom")) {
-        Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size);
+        Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size, 1);
       } else if (name == Slice("overwrite")) {
-        Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size);
+        Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1);
       } else if (name == Slice("fillsync")) {
         write_options.sync = true;
-        Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size);
+        Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1);
       } else if (name == Slice("fill100K")) {
-        Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000);
+        Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1);
       } else if (name == Slice("readseq")) {
         ReadSequential();
       } else if (name == Slice("readreverse")) {
         ReadReverse();
       } else if (name == Slice("readrandom")) {
         ReadRandom();
+      } else if (name == Slice("readrandomsmall")) {
+        int n = num_;
+        num_ /= 1000;
+        ReadRandom();
+        num_ = n;
       } else if (name == Slice("compact")) {
         Compact();
       } else if (name == Slice("crc32c")) {
         Crc32c(4096, "(4K per op)");
       } else if (name == Slice("sha1")) {
         SHA1(4096, "(4K per op)");
+      } else if (name == Slice("snappycomp")) {
+        SnappyCompress();
+      } else if (name == Slice("snappyuncomp")) {
+        SnappyUncompress();
       } else if (name == Slice("heapprofile")) {
         HeapProfile();
+      } else if (name == Slice("stats")) {
+        PrintStats();
       } else {
-        fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
+        known = false;
+        if (name != Slice()) {  // No error message for empty name
+          fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
+        }
+      }
+      if (known) {
+        Stop(name);
       }
-      Stop(name);
     }
   }
 
@@ -387,11 +424,54 @@ class Benchmark {
     message_ = label;
   }
 
+  void SnappyCompress() {
+    Slice input = gen_.Generate(Options().block_size);
+    int64_t bytes = 0;
+    int64_t produced = 0;
+    bool ok = true;
+    std::string compressed;
+    while (ok && bytes < 1024 * 1048576) {  // Compress 1G
+      ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
+      produced += compressed.size();
+      bytes += input.size();
+      FinishedSingleOp();
+    }
+
+    if (!ok) {
+      message_ = "(snappy failure)";
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "(output: %.1f%%)",
+               (produced * 100.0) / bytes);
+      message_ = buf;
+      bytes_ = bytes;
+    }
+  }
+
+  void SnappyUncompress() {
+    Slice input = gen_.Generate(Options().block_size);
+    std::string compressed;
+    bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
+    int64_t bytes = 0;
+    std::string uncompressed;
+    while (ok && bytes < 1024 * 1048576) {  // Uncompress 1G
+      ok =  port::Snappy_Uncompress(compressed.data(), compressed.size(),
+                                    &uncompressed);
+      bytes += uncompressed.size();
+      FinishedSingleOp();
+    }
+
+    if (!ok) {
+      message_ = "(snappy failure)";
+    } else {
+      bytes_ = bytes;
+    }
+  }
+
   void Open() {
     assert(db_ == NULL);
     Options options;
     options.create_if_missing = true;
-    options.max_open_files = 10000;
     options.block_cache = cache_;
     options.write_buffer_size = FLAGS_write_buffer_size;
     Status s = DB::Open(options, "/tmp/dbbench", &db_);
@@ -402,7 +482,7 @@ class Benchmark {
   }
 
   void Write(const WriteOptions& options, Order order, DBState state,
-             int num_entries, int value_size) {
+             int num_entries, int value_size, int entries_per_batch) {
     if (state == FRESH) {
       delete db_;
       db_ = NULL;
@@ -420,19 +500,21 @@ class Benchmark {
     WriteBatch batch;
     Status s;
     std::string val;
-    for (int i = 0; i < num_entries; i++) {
-      const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
-      char key[100];
-      snprintf(key, sizeof(key), "%016d", k);
+    for (int i = 0; i < num_entries; i += entries_per_batch) {
       batch.Clear();
-      batch.Put(key, gen_.Generate(value_size));
+      for (int j = 0; j < entries_per_batch; j++) {
+        const int k = (order == SEQUENTIAL) ? i+j : (rand_.Next() % FLAGS_num);
+        char key[100];
+        snprintf(key, sizeof(key), "%016d", k);
+        batch.Put(key, gen_.Generate(value_size));
+        bytes_ += value_size + strlen(key);
+        FinishedSingleOp();
+      }
       s = db_->Write(options, &batch);
-      bytes_ += value_size + strlen(key);
       if (!s.ok()) {
         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
         exit(1);
       }
-      FinishedSingleOp();
     }
   }
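
(Reviewer sketch, not part of the patch: what the new entries_per_batch path above amortizes. Many Puts share one WriteBatch, so a single log write covers the whole group. The key format mirrors the benchmark; the value is illustrative.)

  #include <stdio.h>
  #include "leveldb/db.h"
  #include "leveldb/write_batch.h"

  leveldb::Status WriteOneBatch(leveldb::DB* db, int entries_per_batch) {
    leveldb::WriteBatch batch;
    char key[100];
    for (int j = 0; j < entries_per_batch; j++) {
      snprintf(key, sizeof(key), "%016d", j);  // same key shape as db_bench
      batch.Put(key, "value");                 // buffered; not yet applied
    }
    return db->Write(leveldb::WriteOptions(), &batch);  // one log record
  }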
 
@@ -475,10 +557,10 @@ class Benchmark {
     dbi->TEST_CompactMemTable();
     int max_level_with_files = 1;
     for (int level = 1; level < config::kNumLevels; level++) {
-      uint64_t v;
+      std::string property;
       char name[100];
       snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
-      if (db_->GetProperty(name, &v) && v > 0) {
+      if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) {
         max_level_with_files = level;
       }
     }
@@ -487,6 +569,15 @@ class Benchmark {
     }
   }
 
+  void PrintStats() {
+    std::string stats;
+    if (!db_->GetProperty("leveldb.stats", &stats)) {
+      message_ = "(failed)";
+    } else {
+      post_message_ = stats;
+    }
+  }
+
   static void WriteToFile(void* arg, const char* buf, int n) {
    reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
   }
@@ -512,6 +603,7 @@ class Benchmark {
 }
 
 int main(int argc, char** argv) {
+  FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
   for (int i = 1; i < argc; i++) {
     double d;
     int n;
@@ -529,7 +621,9 @@ int main(int argc, char** argv) {
       FLAGS_value_size = n;
     } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
       FLAGS_write_buffer_size = n;
-    }  else {
+    } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
+      FLAGS_cache_size = n;
+    } else {
       fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
       exit(1);
     }
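
(For reference, a hedged example of exercising the new flag from a shell; the binary name and values are illustrative, and only flags parsed in main() above are shown:)

  ./db_bench --cache_size=8388608 --write_buffer_size=1048576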
diff --git a/db/db_impl.cc b/db/db_impl.cc
old mode 100644
new mode 100755
index cf5471b..d012236
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -104,6 +104,9 @@ Options SanitizeOptions(const std::string& dbname,
       result.info_log = new NullWritableFile;
     }
   }
+  if (result.block_cache == NULL) {
+    result.block_cache = NewLRUCache(8 << 20);
+  }
   return result;
 }
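
(Reviewer sketch of what this default means for callers, not part of the patch; sizes are illustrative. If options.block_cache is left NULL, the DB gets a private 8MB LRU cache, owned and deleted by DBImpl via owns_cache_; a caller-supplied cache stays caller-owned:)

  #include "leveldb/cache.h"
  #include "leveldb/db.h"

  leveldb::Options options;
  options.create_if_missing = true;
  options.block_cache = leveldb::NewLRUCache(64 << 20);  // 64MB instead of the implicit 8MB

  leveldb::DB* db = NULL;
  leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
  // ... use db ...
  delete db;
  delete options.block_cache;  // we created it, so we delete it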
 
@@ -112,18 +115,20 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
       internal_comparator_(options.comparator),
       options_(SanitizeOptions(dbname, &internal_comparator_, options)),
       owns_info_log_(options_.info_log != options.info_log),
+      owns_cache_(options_.block_cache != options.block_cache),
       dbname_(dbname),
       db_lock_(NULL),
       shutting_down_(NULL),
       bg_cv_(&mutex_),
       compacting_cv_(&mutex_),
-      last_sequence_(0),
       mem_(new MemTable(internal_comparator_)),
+      imm_(NULL),
       logfile_(NULL),
       log_(NULL),
-      log_number_(0),
       bg_compaction_scheduled_(false),
       compacting_(false) {
+  has_imm_.Release_Store(NULL);
+
   // Reserve ten files or so for other uses and give the rest to TableCache.
   const int table_cache_size = options.max_open_files - 10;
   table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
@@ -149,6 +154,7 @@ DBImpl::~DBImpl() {
 
   delete versions_;
   delete mem_;
+  delete imm_;
   delete log_;
   delete logfile_;
   delete table_cache_;
@@ -156,15 +162,15 @@ DBImpl::~DBImpl() {
   if (owns_info_log_) {
     delete options_.info_log;
   }
+  if (owns_cache_) {
+    delete options_.block_cache;
+  }
 }
 
 Status DBImpl::NewDB() {
-  assert(log_number_ == 0);
-  assert(last_sequence_ == 0);
-
   VersionEdit new_db;
   new_db.SetComparatorName(user_comparator()->Name());
-  new_db.SetLogNumber(log_number_);
+  new_db.SetLogNumber(0);
   new_db.SetNextFile(2);
   new_db.SetLastSequence(0);
 
@@ -193,15 +199,6 @@ Status DBImpl::NewDB() {
   return s;
 }
 
-Status DBImpl::Install(VersionEdit* edit,
-                       uint64_t new_log_number,
-                       MemTable* cleanup_mem) {
-  mutex_.AssertHeld();
-  edit->SetLogNumber(new_log_number);
-  edit->SetLastSequence(last_sequence_);
-  return versions_->LogAndApply(edit, cleanup_mem);
-}
-
 void DBImpl::MaybeIgnoreError(Status* s) const {
   if (s->ok() || options_.paranoid_checks) {
     // No change needed
@@ -216,7 +213,7 @@ void DBImpl::DeleteObsoleteFiles() {
  std::set<uint64_t> live = pending_outputs_;
   versions_->AddLiveFiles(&live);
 
-  versions_->CleanupLargeValueRefs(live, log_number_);
+  versions_->CleanupLargeValueRefs(live);
 
  std::vector<std::string> filenames;
   env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
@@ -228,7 +225,8 @@ void DBImpl::DeleteObsoleteFiles() {
       bool keep = true;
       switch (type) {
         case kLogFile:
-          keep = (number == log_number_);
+          keep = ((number == versions_->LogNumber()) ||
+                  (number == versions_->PrevLogNumber()));
           break;
         case kDescriptorFile:
           // Keep my manifest file, and any newer incarnations'
@@ -296,16 +294,20 @@ Status DBImpl::Recover(VersionEdit* edit) {
     }
   }
 
-  s = versions_->Recover(&log_number_, &last_sequence_);
+  s = versions_->Recover();
   if (s.ok()) {
-    // Recover from the log file named in the descriptor
+    // Recover from the log files named in the descriptor
     SequenceNumber max_sequence(0);
-    if (log_number_ != 0) {  // log_number_ == 0 indicates initial empty state
-      s = RecoverLogFile(log_number_, edit, &max_sequence);
+    if (versions_->PrevLogNumber() != 0) { // log#==0 means no prev log
+      s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence);
+    }
+    if (s.ok() && versions_->LogNumber() != 0) { // log#==0 for initial state
+      s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence);
     }
     if (s.ok()) {
-      last_sequence_ =
-          last_sequence_ > max_sequence ? last_sequence_ : max_sequence;
+      if (versions_->LastSequence() < max_sequence) {
+        versions_->SetLastSequence(max_sequence);
+      }
     }
   }
 
@@ -407,56 +409,58 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
 
 Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) {
   mutex_.AssertHeld();
+  const uint64_t start_micros = env_->NowMicros();
   FileMetaData meta;
   meta.number = versions_->NewFileNumber();
   pending_outputs_.insert(meta.number);
   Iterator* iter = mem->NewIterator();
   Log(env_, options_.info_log, "Level-0 table #%llu: started",
       (unsigned long long) meta.number);
-  Status s = BuildTable(dbname_, env_, options_, table_cache_,
-                        iter, &meta, edit);
+
+  Status s;
+  {
+    mutex_.Unlock();
+    s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit);
+    mutex_.Lock();
+  }
+
   Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s",
       (unsigned long long) meta.number,
       (unsigned long long) meta.file_size,
       s.ToString().c_str());
   delete iter;
   pending_outputs_.erase(meta.number);
+
+  CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros;
+  stats.bytes_written = meta.file_size;
+  stats_[0].Add(stats);
   return s;
 }
 
 Status DBImpl::CompactMemTable() {
   mutex_.AssertHeld();
-
-  WritableFile* lfile = NULL;
-  uint64_t new_log_number = versions_->NewFileNumber();
-
-  VersionEdit edit;
+  assert(imm_ != NULL);
+  assert(compacting_);
 
   // Save the contents of the memtable as a new Table
-  Status s = WriteLevel0Table(mem_, &edit);
-  if (s.ok()) {
-    s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
-  }
+  VersionEdit edit;
+  Status s = WriteLevel0Table(imm_, &edit);
 
-  // Save a new descriptor with the new table and log number.
+  // Replace immutable memtable with the generated Table
   if (s.ok()) {
-    s = Install(&edit, new_log_number, mem_);
+    edit.SetPrevLogNumber(0);
+    s = versions_->LogAndApply(&edit, imm_);
   }
 
   if (s.ok()) {
     // Commit to the new state
-    mem_ = new MemTable(internal_comparator_);
-    delete log_;
-    delete logfile_;
-    logfile_ = lfile;
-    log_ = new log::Writer(lfile);
-    log_number_ = new_log_number;
+    imm_ = NULL;
+    has_imm_.Release_Store(NULL);
     DeleteObsoleteFiles();
-    MaybeScheduleCompaction();
-  } else {
-    delete lfile;
-    env_->DeleteFile(LogFileName(dbname_, new_log_number));
   }
+
+  compacting_cv_.SignalAll();  // Wake up waiter even if there was an error
   return s;
 }
 
@@ -485,7 +489,17 @@ void DBImpl::TEST_CompactRange(
 
 Status DBImpl::TEST_CompactMemTable() {
   MutexLock l(&mutex_);
-  return CompactMemTable();
+  Status s = MakeRoomForWrite(true /* force compaction */);
+  if (s.ok()) {
+    // Wait until the compaction completes
+    while (imm_ != NULL && bg_error_.ok()) {
+      compacting_cv_.Wait();
+    }
+    if (imm_ != NULL) {
+      s = bg_error_;
+    }
+  }
+  return s;
 }
 
 void DBImpl::MaybeScheduleCompaction() {
@@ -496,7 +510,7 @@ void DBImpl::MaybeScheduleCompaction() {
     // Some other thread is running a compaction.  Do not conflict with it.
   } else if (shutting_down_.Acquire_Load()) {
     // DB is being deleted; no more background compactions
-  } else if (!versions_->NeedsCompaction()) {
+  } else if (imm_ == NULL && !versions_->NeedsCompaction()) {
     // No work to be done
   } else {
     bg_compaction_scheduled_ = true;
@@ -525,6 +539,16 @@ void DBImpl::BackgroundCall() {
 
 void DBImpl::BackgroundCompaction() {
   mutex_.AssertHeld();
+  assert(!compacting_);
+
+  if (imm_ != NULL) {
+    compacting_ = true;
+    CompactMemTable();
+    compacting_ = false;
+    compacting_cv_.SignalAll();
+    return;
+  }
+
   Compaction* c = versions_->PickCompaction();
   if (c == NULL) {
     // Nothing to do
@@ -539,7 +563,7 @@ void DBImpl::BackgroundCompaction() {
     c->edit()->DeleteFile(c->level(), f->number);
     c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
                        f->smallest, f->largest);
-    status = Install(c->edit(), log_number_, NULL);
+    status = versions_->LogAndApply(c->edit(), NULL);
     Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n",
        static_cast<unsigned long long>(f->number),
         c->level() + 1,
@@ -680,7 +704,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
   }
   compact->outputs.clear();
 
-  Status s = Install(compact->compaction->edit(), log_number_, NULL);
+  Status s = versions_->LogAndApply(compact->compaction->edit(), NULL);
   if (s.ok()) {
     compact->compaction->ReleaseInputs();
     DeleteObsoleteFiles();
@@ -694,6 +718,9 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
 }
 
 Status DBImpl::DoCompactionWork(CompactionState* compact) {
+  const uint64_t start_micros = env_->NowMicros();
+  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
+
   Log(env_, options_.info_log,  "Compacting %d@%d + %d@%d files",
       compact->compaction->num_input_files(0),
       compact->compaction->level(),
@@ -704,7 +731,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
   assert(compact->builder == NULL);
   assert(compact->outfile == NULL);
   if (snapshots_.empty()) {
-    compact->smallest_snapshot = last_sequence_;
+    compact->smallest_snapshot = versions_->LastSequence();
   } else {
     compact->smallest_snapshot = snapshots_.oldest()->number_;
   }
@@ -721,6 +748,18 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
   bool has_current_user_key = false;
   SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
   for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
+    // Prioritize immutable compaction work
+    if (has_imm_.NoBarrier_Load() != NULL) {
+      const uint64_t imm_start = env_->NowMicros();
+      mutex_.Lock();
+      if (imm_ != NULL) {
+        CompactMemTable();
+        compacting_cv_.SignalAll();  // Wakeup MakeRoomForWrite() if necessary
+      }
+      mutex_.Unlock();
+      imm_micros += (env_->NowMicros() - imm_start);
+    }
+
     Slice key = input->key();
     InternalKey tmp_internal_key;
     tmp_internal_key.DecodeFrom(key);
@@ -835,7 +874,19 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
   delete input;
   input = NULL;
 
+  CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros - imm_micros;
+  for (int which = 0; which < 2; which++) {
+    for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
+      stats.bytes_read += compact->compaction->input(which, i)->file_size;
+    }
+  }
+  for (int i = 0; i < compact->outputs.size(); i++) {
+    stats.bytes_written += compact->outputs[i].file_size;
+  }
+
   mutex_.Lock();
+  stats_[compact->compaction->level() + 1].Add(stats);
 
   if (status.ok()) {
     status = InstallCompactionResults(compact);
@@ -848,11 +899,14 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
 Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
                                       SequenceNumber* latest_snapshot) {
   mutex_.Lock();
-  *latest_snapshot = last_sequence_;
+  *latest_snapshot = versions_->LastSequence();
 
   // Collect together all needed child iterators
  std::vector<Iterator*> list;
   list.push_back(mem_->NewIterator());
+  if (imm_ != NULL) {
+    list.push_back(imm_->NewIterator());
+  }
   versions_->current()->AddIterators(options, &list);
   Iterator* internal_iter =
       NewMergingIterator(&internal_comparator_, &list[0], list.size());
@@ -912,7 +966,7 @@ void DBImpl::Unref(void* arg1, void* arg2) {
 
 const Snapshot* DBImpl::GetSnapshot() {
   MutexLock l(&mutex_);
-  return snapshots_.New(last_sequence_);
+  return snapshots_.New(versions_->LastSequence());
 }
 
 void DBImpl::ReleaseSnapshot(const Snapshot* s) {
@@ -935,17 +989,16 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
   WriteBatch* final = NULL;
   {
     MutexLock l(&mutex_);
-    if (!bg_error_.ok()) {
-      status = bg_error_;
-    } else if (mem_->ApproximateMemoryUsage() > options_.write_buffer_size) {
-      status = CompactMemTable();
-    }
+    status = MakeRoomForWrite(false);  // May temporarily release lock and wait
+
+    uint64_t last_sequence = versions_->LastSequence();
     if (status.ok()) {
-      status = HandleLargeValues(last_sequence_ + 1, updates, &final);
+      status = HandleLargeValues(last_sequence + 1, updates, &final);
     }
     if (status.ok()) {
-      WriteBatchInternal::SetSequence(final, last_sequence_ + 1);
-      last_sequence_ += WriteBatchInternal::Count(final);
+      WriteBatchInternal::SetSequence(final, last_sequence + 1);
+      last_sequence += WriteBatchInternal::Count(final);
+      versions_->SetLastSequence(last_sequence);
 
       // Add to log and apply to memtable
       status = log_->AddRecord(WriteBatchInternal::Contents(final));
@@ -959,7 +1012,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
 
     if (options.post_write_snapshot != NULL) {
       *options.post_write_snapshot =
-          status.ok() ? snapshots_.New(last_sequence_) : NULL;
+          status.ok() ? snapshots_.New(last_sequence) : NULL;
     }
   }
   if (final != updates) {
@@ -969,6 +1022,54 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
   return status;
 }
 
+Status DBImpl::MakeRoomForWrite(bool force) {
+  mutex_.AssertHeld();
+  Status s;
+  while (true) {
+    if (!bg_error_.ok()) {
+      // Yield previous error
+      s = bg_error_;
+      break;
+    } else if (!force &&
+               (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
+      // There is room in current memtable
+      break;
+    } else if (imm_ != NULL) {
+      // We have filled up the current memtable, but the previous
+      // one is still being compacted, so we wait.
+      compacting_cv_.Wait();
+    } else {
+      // Attempt to switch to a new memtable and trigger compaction of old
+      assert(versions_->PrevLogNumber() == 0);
+      uint64_t new_log_number = versions_->NewFileNumber();
+      WritableFile* lfile = NULL;
+      s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
+      if (!s.ok()) {
+        break;
+      }
+      VersionEdit edit;
+      edit.SetPrevLogNumber(versions_->LogNumber());
+      edit.SetLogNumber(new_log_number);
+      s = versions_->LogAndApply(&edit, NULL);
+      if (!s.ok()) {
+        delete lfile;
+        env_->DeleteFile(LogFileName(dbname_, new_log_number));
+        break;
+      }
+      delete log_;
+      delete logfile_;
+      logfile_ = lfile;
+      log_ = new log::Writer(lfile);
+      imm_ = mem_;
+      has_imm_.Release_Store(imm_);
+      mem_ = new MemTable(internal_comparator_);
+      force = false;   // Do not force another compaction if there is room
+      MaybeScheduleCompaction();
+    }
+  }
+  return s;
+}
+
 bool DBImpl::HasLargeValues(const WriteBatch& batch) const {
   if (WriteBatchInternal::ByteSize(&batch) >= options_.large_value_threshold) {
     for (WriteBatchInternal::Iterator it(batch); !it.Done(); it.Next()) {
@@ -1033,9 +1134,10 @@ Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq,
             MaybeCompressLargeValue(
                 it.value(), &file_bytes, &scratch, &large_ref);
             InternalKey ikey(it.key(), seq, kTypeLargeValueRef);
-            if (versions_->RegisterLargeValueRef(large_ref, log_number_,ikey)) {
+            if (versions_->RegisterLargeValueRef(
+                    large_ref, versions_->LogNumber(), ikey)) {
               // TODO(opt): avoid holding the lock here (but be careful about
-              // another thread doing a Write and changing log_number_ or
+              // another thread doing a Write and switching logs or
               // having us get a different "assigned_seq" value).
 
               uint64_t tmp_number = versions_->NewFileNumber();
@@ -1086,7 +1188,9 @@ Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq,
   return Status::OK();
 }
 
-bool DBImpl::GetProperty(const Slice& property, uint64_t* value) {
+bool DBImpl::GetProperty(const Slice& property, std::string* value) {
+  value->clear();
+
   MutexLock l(&mutex_);
   Slice in = property;
   Slice prefix("leveldb.");
@@ -1100,10 +1204,37 @@ bool DBImpl::GetProperty(const Slice& property, uint64_t* value) {
     if (!ok || level < 0 || level >= config::kNumLevels) {
       return false;
     } else {
-      *value = versions_->NumLevelFiles(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%d", versions_->NumLevelFiles(level));
+      *value = buf;
       return true;
     }
+  } else if (in == "stats") {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             "                               Compactions\n"
+             "Level  Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
+             "--------------------------------------------------\n"
+             );
+    value->append(buf);
+    for (int level = 0; level < config::kNumLevels; level++) {
+      int files = versions_->NumLevelFiles(level);
+      if (stats_[level].micros > 0 || files > 0) {
+        snprintf(
+            buf, sizeof(buf),
+            "%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
+            level,
+            files,
+            versions_->NumLevelBytes(level) / 1048576.0,
+            stats_[level].micros / 1e6,
+            stats_[level].bytes_read / 1048576.0,
+            stats_[level].bytes_written / 1048576.0);
+        value->append(buf);
+      }
+    }
+    return true;
   }
+
   return false;
 }
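
(Usage sketch for the new string-valued properties; illustrative, not part of the patch. The two property names are exactly the ones handled above.)

  #include <stdio.h>
  #include <string>
  #include "leveldb/db.h"

  void DumpProperties(leveldb::DB* db) {
    std::string value;
    if (db->GetProperty("leveldb.num-files-at-level0", &value)) {
      printf("level-0 files: %s\n", value.c_str());  // ASCII number, e.g. "4"
    }
    if (db->GetProperty("leveldb.stats", &value)) {
      printf("%s\n", value.c_str());                 // multi-line compaction table
    }
  }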
 
@@ -1158,14 +1289,15 @@ Status DB::Open(const Options& options, const std::string& dbname,
   VersionEdit edit;
   Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
   if (s.ok()) {
-    impl->log_number_ = impl->versions_->NewFileNumber();
+    uint64_t new_log_number = impl->versions_->NewFileNumber();
     WritableFile* lfile;
-    s = options.env->NewWritableFile(LogFileName(dbname, impl->log_number_),
+    s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
                                      &lfile);
     if (s.ok()) {
+      edit.SetLogNumber(new_log_number);
       impl->logfile_ = lfile;
       impl->log_ = new log::Writer(lfile);
-      s = impl->Install(&edit, impl->log_number_, NULL);
+      s = impl->versions_->LogAndApply(&edit, NULL);
     }
     if (s.ok()) {
       impl->DeleteObsoleteFiles();
diff --git a/db/db_impl.h b/db/db_impl.h
old mode 100644
new mode 100755
index 49ac37b..1f685f0
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -36,7 +36,7 @@ class DBImpl : public DB {
   virtual Iterator* NewIterator(const ReadOptions&);
   virtual const Snapshot* GetSnapshot();
   virtual void ReleaseSnapshot(const Snapshot* snapshot);
-  virtual bool GetProperty(const Slice& property, uint64_t* value);
+  virtual bool GetProperty(const Slice& property, std::string* value);
   virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
 
   // Extra methods (for testing) that are not in the public DB interface
@@ -72,14 +72,6 @@ class DBImpl : public DB {
   // be made to the descriptor are added to *edit.
   Status Recover(VersionEdit* edit);
 
-  // Apply the specified updates and save the resulting descriptor to
-  // persistent storage.  If cleanup_mem is non-NULL, arrange to
-  // delete it when all existing snapshots have gone away iff Install()
-  // returns OK.
-  Status Install(VersionEdit* edit,
-                 uint64_t new_log_number,
-                 MemTable* cleanup_mem);
-
   void MaybeIgnoreError(Status* s) const;
 
   // Delete any unneeded files and stale in-memory entries.
@@ -99,6 +91,7 @@ class DBImpl : public DB {
 
   Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);
 
+  Status MakeRoomForWrite(bool force /* compact even if there is room? */);
   bool HasLargeValues(const WriteBatch& batch) const;
 
   // Process data in "*updates" and return a status.  "assigned_seq"
@@ -141,6 +134,7 @@ class DBImpl : public DB {
   const InternalKeyComparator internal_comparator_;
   const Options options_;  // options_.comparator == &internal_comparator_
   bool owns_info_log_;
+  bool owns_cache_;
   const std::string dbname_;
 
   // table_cache_ provides its own synchronization
@@ -152,13 +146,13 @@ class DBImpl : public DB {
   // State below is protected by mutex_
   port::Mutex mutex_;
   port::AtomicPointer shutting_down_;
-  port::CondVar bg_cv_;         // Signalled when !bg_compaction_scheduled_
+  port::CondVar bg_cv_;          // Signalled when !bg_compaction_scheduled_
   port::CondVar compacting_cv_;  // Signalled when !compacting_
-  SequenceNumber last_sequence_;
   MemTable* mem_;
+  MemTable* imm_;                // Memtable being compacted
+  port::AtomicPointer has_imm_;  // So bg thread can detect non-NULL imm_
   WritableFile* logfile_;
   log::Writer* log_;
-  uint64_t log_number_;
   SnapshotList snapshots_;
 
   // Set of table files to protect from deletion because they are
@@ -176,6 +170,23 @@ class DBImpl : public DB {
   // Have we encountered a background error in paranoid mode?
   Status bg_error_;
 
+  // Per level compaction stats.  stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    int64_t micros;
+    int64_t bytes_read;
+    int64_t bytes_written;
+
+    CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { }
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->bytes_read += c.bytes_read;
+      this->bytes_written += c.bytes_written;
+    }
+  };
+  CompactionStats stats_[config::kNumLevels];
+
   // No copying allowed
   DBImpl(const DBImpl&);
   void operator=(const DBImpl&);
diff --git a/db/db_iter.cc b/db/db_iter.cc
old mode 100644
new mode 100755
diff --git a/db/db_iter.h b/db/db_iter.h
old mode 100644
new mode 100755
diff --git a/db/db_test.cc b/db/db_test.cc
old mode 100644
new mode 100755
index f68e759..04de331
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -72,19 +72,11 @@ class DBTest {
   }
 
   Status Put(const std::string& k, const std::string& v) {
-    WriteOptions options;
-    options.sync = false;
-    WriteBatch batch;
-    batch.Put(k, v);
-    return db_->Write(options, &batch);
+    return db_->Put(WriteOptions(), k, v);
   }
 
   Status Delete(const std::string& k) {
-    WriteOptions options;
-    options.sync = false;
-    WriteBatch batch;
-    batch.Delete(k);
-    return db_->Write(options, &batch);
+    return db_->Delete(WriteOptions(), k);
   }
 
   std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
@@ -147,11 +139,11 @@ class DBTest {
   }
 
   int NumTableFilesAtLevel(int level) {
-    uint64_t val;
+    std::string property;
     ASSERT_TRUE(
         db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
-                         &val));
-    return val;
+                         &property));
+    return atoi(property.c_str());
   }
 
   uint64_t Size(const Slice& start, const Slice& limit) {
@@ -185,10 +177,7 @@ class DBTest {
     dbfull()->TEST_CompactMemTable();
     int max_level_with_files = 1;
     for (int level = 1; level < config::kNumLevels; level++) {
-      uint64_t v;
-      char name[100];
-      snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
-      if (dbfull()->GetProperty(name, &v) && v > 0) {
+      if (NumTableFilesAtLevel(level) > 0) {
         max_level_with_files = level;
       }
     }
@@ -459,7 +448,7 @@ TEST(DBTest, MinorCompactionsHappen) {
   options.write_buffer_size = 10000;
   Reopen(&options);
 
-  const int N = 100;
+  const int N = 500;
 
   int starting_num_tables = NumTableFilesAtLevel(0);
   for (int i = 0; i < N; i++) {
@@ -1047,7 +1036,7 @@ class ModelDB: public DB {
     return Status::OK();
   }
 
-  virtual bool GetProperty(const Slice& property, uint64_t* value) {
+  virtual bool GetProperty(const Slice& property, std::string* value) {
     return false;
   }
   virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
diff --git a/db/dbformat.cc b/db/dbformat.cc
old mode 100644
new mode 100755
diff --git a/db/dbformat.h b/db/dbformat.h
old mode 100644
new mode 100755
index 6f34cd1..5f117f9
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -15,6 +15,12 @@
 
 namespace leveldb {
 
+// Grouping of constants.  We may want to make some of these
+// parameters set via options.
+namespace config {
+static const int kNumLevels = 7;
+}
+
 class InternalKey;
 
 // Value types encoded as the last component of internal keys.
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
old mode 100644
new mode 100755
diff --git a/db/filename.cc b/db/filename.cc
old mode 100644
new mode 100755
diff --git a/db/filename.h b/db/filename.h
old mode 100644
new mode 100755
diff --git a/db/filename_test.cc b/db/filename_test.cc
old mode 100644
new mode 100755
diff --git a/db/log_format.h b/db/log_format.h
old mode 100644
new mode 100755
diff --git a/db/log_reader.cc b/db/log_reader.cc
old mode 100644
new mode 100755
diff --git a/db/log_reader.h b/db/log_reader.h
old mode 100644
new mode 100755
diff --git a/db/log_test.cc b/db/log_test.cc
old mode 100644
new mode 100755
diff --git a/db/log_writer.cc b/db/log_writer.cc
old mode 100644
new mode 100755
diff --git a/db/log_writer.h b/db/log_writer.h
old mode 100644
new mode 100755
diff --git a/db/memtable.cc b/db/memtable.cc
old mode 100644
new mode 100755
diff --git a/db/memtable.h b/db/memtable.h
old mode 100644
new mode 100755
diff --git a/db/repair.cc b/db/repair.cc
old mode 100644
new mode 100755
diff --git a/db/skiplist.h b/db/skiplist.h
old mode 100644
new mode 100755
diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc
old mode 100644
new mode 100755
diff --git a/db/snapshot.h b/db/snapshot.h
old mode 100644
new mode 100755
diff --git a/db/table_cache.cc b/db/table_cache.cc
old mode 100644
new mode 100755
diff --git a/db/table_cache.h b/db/table_cache.h
old mode 100644
new mode 100755
diff --git a/db/version_edit.cc b/db/version_edit.cc
old mode 100644
new mode 100755
index 809dd82..689dbe0
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -20,15 +20,18 @@ enum Tag {
   kDeletedFile          = 6,
   kNewFile              = 7,
   kLargeValueRef        = 8,
+  kPrevLogNumber        = 9,
 };
 
 void VersionEdit::Clear() {
   comparator_.clear();
   log_number_ = 0;
+  prev_log_number_ = 0;
   last_sequence_ = 0;
   next_file_number_ = 0;
   has_comparator_ = false;
   has_log_number_ = false;
+  has_prev_log_number_ = false;
   has_next_file_number_ = false;
   has_last_sequence_ = false;
   deleted_files_.clear();
@@ -45,6 +48,10 @@ void VersionEdit::EncodeTo(std::string* dst) const {
     PutVarint32(dst, kLogNumber);
     PutVarint64(dst, log_number_);
   }
+  if (has_prev_log_number_) {
+    PutVarint32(dst, kPrevLogNumber);
+    PutVarint64(dst, prev_log_number_);
+  }
   if (has_next_file_number_) {
     PutVarint32(dst, kNextFileNumber);
     PutVarint64(dst, next_file_number_);
@@ -142,6 +149,14 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         }
         break;
 
+      case kPrevLogNumber:
+        if (GetVarint64(&input, &prev_log_number_)) {
+          has_prev_log_number_ = true;
+        } else {
+          msg = "previous log number";
+        }
+        break;
+
       case kNextFileNumber:
         if (GetVarint64(&input, &next_file_number_)) {
           has_next_file_number_ = true;
@@ -228,6 +243,10 @@ std::string VersionEdit::DebugString() const {
     r.append("\n  LogNumber: ");
     AppendNumberTo(&r, log_number_);
   }
+  if (has_prev_log_number_) {
+    r.append("\n  PrevLogNumber: ");
+    AppendNumberTo(&r, prev_log_number_);
+  }
   if (has_next_file_number_) {
     r.append("\n  NextFile: ");
     AppendNumberTo(&r, next_file_number_);
diff --git a/db/version_edit.h b/db/version_edit.h
old mode 100644
new mode 100755
index 1b71283..7e417b5
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -39,6 +39,10 @@ class VersionEdit {
     has_log_number_ = true;
     log_number_ = num;
   }
+  void SetPrevLogNumber(uint64_t num) {
+    has_prev_log_number_ = true;
+    prev_log_number_ = num;
+  }
   void SetNextFile(uint64_t num) {
     has_next_file_number_ = true;
     next_file_number_ = num;
@@ -95,10 +99,12 @@ class VersionEdit {
 
   std::string comparator_;
   uint64_t log_number_;
+  uint64_t prev_log_number_;
   uint64_t next_file_number_;
   SequenceNumber last_sequence_;
   bool has_comparator_;
   bool has_log_number_;
+  bool has_prev_log_number_;
   bool has_next_file_number_;
   bool has_last_sequence_;
 
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
old mode 100644
new mode 100755
diff --git a/db/version_set.cc b/db/version_set.cc
old mode 100644
new mode 100755
index dc9b418..31f79bb
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -27,16 +27,14 @@ static const int kTargetFileSize = 2 * 1048576;
 static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize;
 
 static double MaxBytesForLevel(int level) {
-  if (level == 0) {
-    return 4 * 1048576.0;
-  } else {
-    double result = 10 * 1048576.0;
-    while (level > 1) {
-      result *= 10;
-      level--;
-    }
-    return result;
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  double result = 10 * 1048576.0;  // Result for both level-0 and level-1
+  while (level > 1) {
+    result *= 10;
+    level--;
   }
+  return result;
 }
 
 static uint64_t MaxFileSizeForLevel(int level) {
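
(The per-level byte limits implied by the rewritten MaxBytesForLevel() work out to 10MB for level-1, 100MB for level-2, and so on up to roughly 1TB for level-6; kNumLevels is 7 per db/dbformat.h above, and level-0 is bounded by file count in Finalize() instead. A standalone sketch that reproduces the numbers:)

  #include <stdio.h>

  int main() {
    double bytes = 10 * 1048576.0;  // level-1 target, as in MaxBytesForLevel()
    for (int level = 1; level <= 6; level++) {
      printf("level-%d limit: %.0f MB\n", level, bytes / 1048576.0);
      bytes *= 10;  // each deeper level holds 10x more
    }
    return 0;
  }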
@@ -327,6 +325,9 @@ VersionSet::VersionSet(const std::string& dbname,
       icmp_(*cmp),
       next_file_number_(2),
       manifest_file_number_(0),  // Filled by Recover()
+      last_sequence_(0),
+      log_number_(0),
+      prev_log_number_(0),
       descriptor_file_(NULL),
       descriptor_log_(NULL),
       current_(new Version(this)),
@@ -345,7 +346,19 @@ VersionSet::~VersionSet() {
 }
 
 Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
+  if (edit->has_log_number_) {
+    assert(edit->log_number_ >= log_number_);
+    assert(edit->log_number_ < next_file_number_);
+  } else {
+    edit->SetLogNumber(log_number_);
+  }
+
+  if (!edit->has_prev_log_number_) {
+    edit->SetPrevLogNumber(prev_log_number_);
+  }
+
   edit->SetNextFile(next_file_number_);
+  edit->SetLastSequence(last_sequence_);
 
   Version* v = new Version(this);
   {
@@ -372,7 +385,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
     }
   }
 
-  // Write new record to log file
+  // Write new record to MANIFEST log
   if (s.ok()) {
     std::string record;
     edit->EncodeTo(&record);
@@ -396,6 +409,8 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
     v->next_ = NULL;
     current_->next_ = v;
     current_ = v;
+    log_number_ = edit->log_number_;
+    prev_log_number_ = edit->prev_log_number_;
   } else {
     delete v;
     if (!new_manifest_file.empty()) {
@@ -406,13 +421,11 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
       env_->DeleteFile(new_manifest_file);
     }
   }
-  //Log(env_, options_->info_log, "State\n%s", current_->DebugString().c_str());
 
   return s;
 }
 
-Status VersionSet::Recover(uint64_t* log_number,
-                           SequenceNumber* last_sequence) {
+Status VersionSet::Recover() {
   struct LogReporter : public log::Reader::Reporter {
     Status* status;
     virtual void Corruption(size_t bytes, const Status& s) {
@@ -439,9 +452,13 @@ Status VersionSet::Recover(uint64_t* log_number,
   }
 
   bool have_log_number = false;
+  bool have_prev_log_number = false;
   bool have_next_file = false;
   bool have_last_sequence = false;
   uint64_t next_file = 0;
+  uint64_t last_sequence = 0;
+  uint64_t log_number = 0;
+  uint64_t prev_log_number = 0;
   Builder builder(this, current_);
 
   {
@@ -467,17 +484,22 @@ Status VersionSet::Recover(uint64_t* log_number,
       }
 
       if (edit.has_log_number_) {
-        *log_number = edit.log_number_;
+        log_number = edit.log_number_;
         have_log_number = true;
       }
 
+      if (edit.has_prev_log_number_) {
+        prev_log_number = edit.prev_log_number_;
+        have_prev_log_number = true;
+      }
+
       if (edit.has_next_file_number_) {
         next_file = edit.next_file_number_;
         have_next_file = true;
       }
 
       if (edit.has_last_sequence_) {
-        *last_sequence = edit.last_sequence_;
+        last_sequence = edit.last_sequence_;
         have_last_sequence = true;
       }
     }
@@ -493,6 +515,10 @@ Status VersionSet::Recover(uint64_t* log_number,
     } else if (!have_last_sequence) {
       s = Status::Corruption("no last-sequence-number entry in descriptor");
     }
+
+    if (!have_prev_log_number) {
+      prev_log_number = 0;
+    }
   }
 
   if (s.ok()) {
@@ -508,12 +534,23 @@ Status VersionSet::Recover(uint64_t* log_number,
       current_ = v;
       manifest_file_number_ = next_file;
       next_file_number_ = next_file + 1;
+      last_sequence_ = last_sequence;
+      log_number_ = log_number;
+      prev_log_number_ = prev_log_number;
     }
   }
 
   return s;
 }
 
+static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  int64_t sum = 0;
+  for (int i = 0; i < files.size(); i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
 Status VersionSet::Finalize(Version* v) {
   // Precomputed best level for next compaction
   int best_level = -1;
@@ -523,23 +560,24 @@ Status VersionSet::Finalize(Version* v) {
   for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) {
     s = SortLevel(v, level);
 
-    // Compute the ratio of current size to size limit.
-    uint64_t level_bytes = 0;
-    for (int i = 0; i < v->files_[level].size(); i++) {
-      level_bytes += v->files_[level][i]->file_size;
-    }
-    double score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
-
+    double score;
     if (level == 0) {
-      // Level-0 file sizes are going to be often much smaller than
-      // MaxBytesForLevel(0) since we do not account for compression
-      // when producing a level-0 file; and too many level-0 files
-      // increase merging costs.  So use a file-count limit for
-      // level-0 in addition to the byte-count limit.
-      double count_score = v->files_[level].size() / 4.0;
-      if (count_score > score) {
-        score = count_score;
-      }
+      // We treat level-0 specially by bounding the number of files
+      // instead of number of bytes for two reasons:
+      //
+      // (1) With larger write-buffer sizes, it is nice not to do too
+      // many level-0 compactions.
+      //
+      // (2) The files in level-0 are merged on every read and
+      // therefore we wish to avoid too many files when the individual
+      // file size is small (perhaps because of a small write-buffer
+      // setting, or very high compression ratios, or lots of
+      // overwrites/deletions).
+      score = v->files_[level].size() / 4.0;
+    } else {
+      // Compute the ratio of current size to size limit.
+      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
+      score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
     }
 
     if (score > best_score) {
@@ -696,8 +734,7 @@ bool VersionSet::RegisterLargeValueRef(const LargeValueRef& large_ref,
   return is_first;
 }
 
-void VersionSet::CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
-                                       uint64_t log_file_num) {
+void VersionSet::CleanupLargeValueRefs(const std::set<uint64_t>& live_tables) {
   for (LargeValueMap::iterator it = large_value_refs_.begin();
        it != large_value_refs_.end();
        ) {
@@ -705,7 +742,8 @@ void VersionSet::CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
     for (LargeReferencesSet::iterator ref_it = refs->begin();
          ref_it != refs->end();
          ) {
-      if (ref_it->first != log_file_num &&              // Not in log file
+      if (ref_it->first != log_number_ &&               // Not in log file
+          ref_it->first != prev_log_number_ &&          // Not in prev log
           live_tables.count(ref_it->first) == 0) {      // Not in a live table
         // No longer live: erase
         LargeReferencesSet::iterator to_erase = ref_it;
@@ -762,12 +800,10 @@ void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
   }
 }
 
-static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
-  int64_t sum = 0;
-  for (int i = 0; i < files.size(); i++) {
-    sum += files[i]->file_size;
-  }
-  return sum;
+int64_t VersionSet::NumLevelBytes(int level) const {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  return TotalFileSize(current_->files_[level]);
 }
 
 int64_t VersionSet::MaxNextLevelOverlappingBytes() {
diff --git a/db/version_set.h b/db/version_set.h
old mode 100644
new mode 100755
index a4199be..e1c5a4b
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -24,12 +24,6 @@
 
 namespace leveldb {
 
-// Grouping of constants.  We may want to make some of these
-// parameters set via options.
-namespace config {
-static const int kNumLevels = 7;
-}
-
 namespace log { class Writer; }
 
 class Compaction;
@@ -107,7 +101,7 @@ class VersionSet {
   Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem);
 
   // Recover the last saved descriptor from persistent storage.
-  Status Recover(uint64_t* log_number, SequenceNumber* last_sequence);
+  Status Recover();
 
   // Save current contents to *log
   Status WriteSnapshot(log::Writer* log);
@@ -124,6 +118,25 @@ class VersionSet {
   // Return the number of Table files at the specified level.
   int NumLevelFiles(int level) const;
 
+  // Return the combined file size of all files at the specified level.
+  int64_t NumLevelBytes(int level) const;
+
+  // Return the last sequence number.
+  uint64_t LastSequence() const { return last_sequence_; }
+
+  // Set the last sequence number to s.
+  void SetLastSequence(uint64_t s) {
+    assert(s >= last_sequence_);
+    last_sequence_ = s;
+  }
+
+  // Return the current log file number.
+  uint64_t LogNumber() const { return log_number_; }
+
+  // Return the log file number for the log file that is currently
+  // being compacted, or zero if there is no such log file.
+  uint64_t PrevLogNumber() const { return prev_log_number_; }
+
   // Pick level and inputs for a new compaction.
   // Returns NULL if there is no compaction to be done.
   // Otherwise returns a pointer to a heap-allocated object that
@@ -168,9 +181,8 @@ class VersionSet {
 
   // Cleanup the large value reference state by eliminating any
  // references from files that are not included in either "live_tables"
-  // or "log_file".
-  void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
-                             uint64_t log_file_num);
+  // or the current log.
+  void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables);
 
   // Returns true if a large value with the given reference is live.
   bool LargeValueIsLive(const LargeValueRef& large_ref);
@@ -213,6 +225,9 @@ class VersionSet {
   const InternalKeyComparator icmp_;
   uint64_t next_file_number_;
   uint64_t manifest_file_number_;
+  uint64_t last_sequence_;
+  uint64_t log_number_;
+  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
 
   // Opened lazily
   WritableFile* descriptor_file_;
diff --git a/db/write_batch.cc b/db/write_batch.cc
old mode 100644
new mode 100755
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
old mode 100644
new mode 100755
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
old mode 100644
new mode 100755
diff --git a/doc/doc.css b/doc/doc.css
old mode 100644
new mode 100755
diff --git a/doc/impl.html b/doc/impl.html
old mode 100644
new mode 100755
diff --git a/doc/index.html b/doc/index.html
old mode 100644
new mode 100755
index e0baf2e..2a83fc3
--- a/doc/index.html
+++ b/doc/index.html
@@ -63,15 +63,12 @@ Example:
 The database provides Put, Delete, and Get methods to
 modify/query the database.  For example, the following code
 moves the value stored under key1 to key2.
-

   std::string value;
   leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
   if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
   if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
 
-See important performance note below for how to
-speed up writes significantly.

Atomic Updates

@@ -100,6 +97,47 @@
 we do not end up erroneously dropping the value entirely.
 
 Apart from its atomicity benefits, WriteBatch may also be used to
 speed up bulk updates by placing lots of individual mutations into
 the same batch.
 
+
+Synchronous Writes
+
+By default, each write to leveldb is asynchronous: it
+returns after pushing the write from the process into the operating
+system.  The transfer from operating system memory to the underlying
+persistent storage happens asynchronously.  The sync flag
+can be turned on for a particular write to make the write operation
+not return until the data being written has been pushed all the way to
+persistent storage.  (On Posix systems, this is implemented by calling
+either fsync(...) or fdatasync(...) or
+msync(..., MS_SYNC) before the write operation returns.)
+
+  leveldb::WriteOptions write_options;
+  write_options.sync = true;
+  db->Put(write_options, ...);
+
+Asynchronous writes are often more than a thousand times as fast as
+synchronous writes.  The downside of asynchronous writes is that a
+crash of the machine may cause the last few updates to be lost.  Note
+that a crash of just the writing process (i.e., not a reboot) will not
+cause any loss since even when sync is false, an update
+is pushed from the process memory into the operating system before it
+is considered done.
+
+Asynchronous writes can often be used safely.  For example, when
+loading a large amount of data into the database you can handle lost
+updates by restarting the bulk load after a crash.  A hybrid scheme is
+also possible where every Nth write is synchronous, and in the event
+of a crash, the bulk load is restarted just after the last synchronous
+write finished by the previous run.  (The synchronous write can update
+a marker that describes where to restart on a crash.)
+
+WriteBatch provides an alternative to asynchronous writes.
+Multiple updates may be placed in the same WriteBatch and
+applied together using a synchronous write (i.e.,
+write_options.sync is set to true).  The extra cost of
+the synchronous write will be amortized across all of the writes in
+the batch.
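
(A sketch of the hybrid scheme described above; illustrative only. The marker key, the interval, and the Key/Value helpers are assumptions, not leveldb API:)

  leveldb::WriteOptions async_options;           // sync is false by default now
  leveldb::WriteOptions sync_options;
  sync_options.sync = true;

  const int kSyncInterval = 1000;                // hypothetical choice
  for (int i = 0; i < num_entries; i++) {
    db->Put(async_options, Key(i), Value(i));    // fast, asynchronous
    if (i % kSyncInterval == kSyncInterval - 1) {
      // Durable progress marker: after a crash, restart the load from here.
      db->Put(sync_options, "bulk_load_position", Key(i));
    }
  }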

Concurrency

@@ -289,48 +327,12 @@
 version numbers found in the keys to decide how to interpret them.
 
 Performance can be tuned by changing the default values of the
 types defined in leveldb/include/options.h.
 
-
-Asynchronous Writes
-
-By default, each write to leveldb is synchronous: it does
-not return until the write has been pushed from memory to persistent
-storage.  (On Posix systems, this is implemented by calling either
-fdatasync(...) or msync(..., MS_SYNC).)
-Synchronous writes may be very slow and the synchrony can be
-optionally disabled:
-
-  leveldb::WriteOptions write_options;
-  write_options.sync = false;
-  db->Put(write_options, ...);
-
-Asynchronous writes are often more than a hundred times as fast as
-synchronous writes.  The downside of asynchronous writes is that a
-crash of the machine may cause the last few updates to be lost.  Note
-that a crash of just the writing process (i.e., not a reboot) will not
-cause any loss since even when sync is false, an update
-is pushed from the process memory into the operating system before it
-is considered done.
-
-Asynchronous writes can be particularly beneficial when loading a
-large amount of data into the database since you can mitigate the
-problem of lost updates by restarting the bulk load.  A hybrid scheme
-is also possible where every Nth write is synchronous, and in the
-event of a crash, the bulk load is restarted just after the last
-synchronous write finished by the previous run.
-
-WriteBatch provides an alternative to asynchronous writes.
-Multiple updates may be placed in the same WriteBatch and
-applied together using a synchronous write.  The extra cost of the
-synchronous write will be amortized across all of the writes in the batch.
Block size

leveldb groups adjacent keys together into the same block and such a block is the unit of transfer to and from persistent storage. The -default block size is approximately 8192 uncompressed bytes. +default block size is approximately 4096 uncompressed bytes. Applications that mostly do bulk scans over the contents of the database may wish to increase this size. Applications that do a lot of point reads of small values may wish to switch to a smaller block diff --git a/doc/log_format.txt b/doc/log_format.txt old mode 100644 new mode 100755 diff --git a/doc/table_format.txt b/doc/table_format.txt old mode 100644 new mode 100755 diff --git a/include/leveldb/cache.h b/include/leveldb/cache.h old mode 100644 new mode 100755 diff --git a/include/leveldb/comparator.h b/include/leveldb/comparator.h old mode 100644 new mode 100755 diff --git a/include/leveldb/db.h b/include/leveldb/db.h old mode 100644 new mode 100755 index 74d50d3..f18ded3 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -13,7 +13,7 @@ namespace leveldb { static const int kMajorVersion = 1; -static const int kMinorVersion = 0; +static const int kMinorVersion = 1; struct Options; struct ReadOptions; @@ -49,7 +49,7 @@ class DB { // Set the database entry for "key" to "value". Returns OK on success, // and a non-OK status on error. - // Note: consider setting options.sync = false. + // Note: consider setting options.sync = true. virtual Status Put(const WriteOptions& options, const Slice& key, const Slice& value) = 0; @@ -57,12 +57,12 @@ class DB { // Remove the database entry (if any) for "key". Returns OK on // success, and a non-OK status on error. It is not an error if "key" // did not exist in the database. - // Note: consider setting options.sync = false. + // Note: consider setting options.sync = true. virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; // Apply the specified updates to the database. // Returns OK on success, non-OK on failure. - // Note: consider setting options.sync = false. + // Note: consider setting options.sync = true. virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; // If the database contains an entry for "key" store the @@ -103,7 +103,9 @@ class DB { // // "leveldb.num-files-at-level" - return the number of files at level , // where is an ASCII representation of a level number (e.g. "0"). - virtual bool GetProperty(const Slice& property, uint64_t* value) = 0; + // "leveldb.stats" - returns a multi-line string that describes statistics + // about the internal operation of the DB. + virtual bool GetProperty(const Slice& property, std::string* value) = 0; // For each i in [0,n-1], store in "sizes[i]", the approximate // file system space used by keys in "[range[i].start .. range[i].limit)". diff --git a/include/leveldb/env.h b/include/leveldb/env.h old mode 100644 new mode 100755 diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h old mode 100644 new mode 100755 diff --git a/include/leveldb/options.h b/include/leveldb/options.h old mode 100644 new mode 100755 index 0b65624..87d388e --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -69,15 +69,14 @@ struct Options { // ------------------- // Parameters that affect performance - // Amount of data to build up in memory before converting to an - // on-disk file. + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. 
// - // Some DB operations may encounter a delay proportional to the size - // of this parameter. Therefore we recommend against increasing - // this parameter unless you are willing to live with an occasional - // slow operation in exchange for faster bulk loading throughput. + // Larger values increase performance, especially during bulk loads. + // Up to two write buffers may be held in memory at the same time, + // so you may wish to adjust this parameter to control memory usage. // - // Default: 1MB + // Default: 4MB size_t write_buffer_size; // Number of open files that can be used by the DB. You may need to @@ -100,7 +99,8 @@ struct Options { // Control over blocks (user data is stored in a set of blocks, and // a block is the unit of reading from disk). - // Use the specified cache for blocks (if non-NULL). + // If non-NULL, use the specified cache for blocks. + // If NULL, leveldb will automatically create and use an 8MB internal cache. // Default: NULL Cache* block_cache; @@ -109,7 +109,7 @@ struct Options { // actual size of the unit read from disk may be smaller if // compression is enabled. This parameter can be changed dynamically. // - // Default: 8K + // Default: 4K int block_size; // Number of keys between restart points for delta encoding of keys. @@ -177,7 +177,12 @@ struct WriteOptions { // crashes (i.e., the machine does not reboot), no writes will be // lost even if sync==false. // - // Default: true + // In other words, a DB write with sync==false has similar + // crash semantics as the "write()" system call. A DB write + // with sync==true has similar crash semantics to a "write()" + // system call followed by "fsync()". + // + // Default: false bool sync; // If "post_write_snapshot" is non-NULL, and the write succeeds, @@ -193,7 +198,7 @@ struct WriteOptions { const Snapshot** post_write_snapshot; WriteOptions() - : sync(true), + : sync(false), post_write_snapshot(NULL) { } }; diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h old mode 100644 new mode 100755 diff --git a/include/leveldb/status.h b/include/leveldb/status.h old mode 100644 new mode 100755 diff --git a/include/leveldb/table.h b/include/leveldb/table.h old mode 100644 new mode 100755 diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h old mode 100644 new mode 100755 diff --git a/include/leveldb/write_batch.h b/include/leveldb/write_batch.h old mode 100644 new mode 100755 diff --git a/leveldb.gyp b/leveldb.gyp old mode 100644 new mode 100755 diff --git a/port/README b/port/README old mode 100644 new mode 100755 diff --git a/port/port.h b/port/port.h old mode 100644 new mode 100755 diff --git a/port/port_android.cc b/port/port_android.cc old mode 100644 new mode 100755 index 8a74111..240e9ca --- a/port/port_android.cc +++ b/port/port_android.cc @@ -24,7 +24,6 @@ int fdatasync(int fd) { } } -// TODO(gabor): This is copied from port_posix.cc - not sure if I should do this? 
namespace leveldb { namespace port { diff --git a/port/port_android.h b/port/port_android.h old mode 100644 new mode 100755 index ca0362d..8680951 --- a/port/port_android.h +++ b/port/port_android.h @@ -15,6 +15,20 @@ #include #include +// Collapse the plethora of ARM flavors available to an easier to manage set +// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto +#if defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7A__) +#define ARMV6_OR_7 1 +#endif + extern "C" { size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); @@ -61,28 +75,50 @@ class CondVar { pthread_cond_t cv_; }; +#ifndef ARMV6_OR_7 +// On ARM chipsets <V6, 0xffff0fa0 is the hard coded address of a +// memory barrier function provided by the kernel. +typedef void (*LinuxKernelMemoryBarrierFunc)(void); +LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier __attribute__((weak)) = + (LinuxKernelMemoryBarrierFunc) 0xffff0fa0; +#endif + class AtomicPointer { private: - std::atomic<void*> rep_; + void* rep_; + + inline void MemoryBarrier() const { + // TODO(gabor): This only works on Android instruction sets >= V6 +#ifdef ARMV6_OR_7 + __asm__ __volatile__("dmb" : : : "memory"); +#else + pLinuxKernelMemoryBarrier(); +#endif + } + public: AtomicPointer() { } explicit AtomicPointer(void* v) : rep_(v) { } inline void* Acquire_Load() const { - return rep_.load(std::memory_order_acquire); + void* r = rep_; + MemoryBarrier(); + return r; } inline void Release_Store(void* v) { - rep_.store(v, std::memory_order_release); + MemoryBarrier(); + rep_ = v; } inline void* NoBarrier_Load() const { - return rep_.load(std::memory_order_relaxed); + void* r = rep_; + return r; } inline void NoBarrier_Store(void* v) { - rep_.store(v, std::memory_order_relaxed); + rep_ = v; } }; -// TODO(gabor): Implement actual compress +// TODO(gabor): Implement compress inline bool Snappy_Compress( const char* input, size_t input_length, @@ -90,7 +126,7 @@ inline bool Snappy_Compress( return false; } -// TODO(gabor): Implement actual uncompress +// TODO(gabor): Implement uncompress inline bool Snappy_Uncompress( const char* input_data, size_t input_length, diff --git a/port/port_chromium.cc b/port/port_chromium.cc old mode 100644 new mode 100755 diff --git a/port/port_chromium.h b/port/port_chromium.h old mode 100644 new mode 100755 diff --git a/port/port_example.h b/port/port_example.h old mode 100644 new mode 100755 diff --git a/port/port_posix.cc b/port/port_posix.cc old mode 100644 new mode 100755 diff --git a/port/port_posix.h b/port/port_posix.h old mode 100644 new mode 100755 diff --git a/port/sha1_portable.cc b/port/sha1_portable.cc old mode 100644 new mode 100755 diff --git a/port/sha1_portable.h b/port/sha1_portable.h old mode 100644 new mode 100755 diff --git a/port/sha1_test.cc b/port/sha1_test.cc old mode 100644 new mode 100755 diff --git a/port/win/stdint.h b/port/win/stdint.h old mode 100644 new mode 100755 diff --git a/table/block.cc b/table/block.cc old mode 100644 new mode 100755 diff --git a/table/block.h b/table/block.h old mode 100644 new mode 100755 diff --git a/table/block_builder.cc b/table/block_builder.cc old mode 100644 new mode 100755 diff --git a/table/block_builder.h b/table/block_builder.h old mode 100644 new mode 100755 diff --git a/table/format.cc b/table/format.cc old mode 100644 new mode 100755 diff --git a/table/format.h b/table/format.h old mode 100644 new mode 100755 diff --git a/table/iterator.cc b/table/iterator.cc old mode 100644 new mode 100755 diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h old mode 100644 new mode 100755
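The port_android.h hunk above replaces std::atomic with a raw pointer plus explicit memory barriers (a "dmb" instruction on ARMv6/7, a kernel helper on older chipsets). As a hedged sketch of the publish/consume pattern that Acquire_Load and Release_Store exist to support, for example in db/skiplist.h; the Node type and function names below are hypothetical, not part of the patch:

  #include <cstddef>
  #include "port/port.h"

  struct Node { int value; };                // hypothetical payload type

  leveldb::port::AtomicPointer head_(NULL);  // shared between threads

  // Writer thread: initialize the node fully, then publish it. The
  // barrier in Release_Store makes the contents visible before the pointer.
  void Publish(Node* n) {
    n->value = 42;
    head_.Release_Store(n);
  }

  // Reader thread: Acquire_Load pairs with Release_Store, so a non-NULL
  // result implies the node's contents are visible to this thread.
  Node* Consume() {
    return reinterpret_cast<Node*>(head_.Acquire_Load());
  }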
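The options.h hunks above also change the performance defaults (4MB write buffer, 4K blocks, an automatic 8MB block cache). A hedged sketch of overriding them at open time, using only fields that appear in the patch; the values and the database path are illustrative, not recommendations:

  #include "leveldb/cache.h"
  #include "leveldb/db.h"
  #include "leveldb/options.h"

  leveldb::DB* OpenTuned() {
    leveldb::Options options;
    options.create_if_missing = true;
    options.write_buffer_size = 8 << 20;  // trade memory for bulk-load speed
    options.block_size = 16384;           // larger blocks suit bulk scans
    options.block_cache = leveldb::NewLRUCache(32 << 20);  // replaces the
                                          // 8MB default; caller owns the cache
    leveldb::DB* db = NULL;
    leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
    return s.ok() ? db : NULL;
  }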
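Likewise, GetProperty now fills in a std::string rather than a uint64_t, so one entry point serves both numeric properties and the new multi-line "leveldb.stats" text; the table_test.cc hunk further down parses the numbers with atoi. A sketch under the same assumption of an open leveldb::DB*:

  #include <cstdio>
  #include <cstdlib>
  #include <string>
  #include "leveldb/db.h"

  void DumpStats(leveldb::DB* db) {
    std::string value;
    if (db->GetProperty("leveldb.num-files-at-level0", &value)) {
      // Numeric properties arrive as ASCII decimal strings.
      std::fprintf(stderr, "level-0 files: %d\n", atoi(value.c_str()));
    }
    if (db->GetProperty("leveldb.stats", &value)) {
      std::fprintf(stderr, "%s\n", value.c_str());  // multi-line statistics
    }
  }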
diff --git a/table/merger.cc b/table/merger.cc old mode 100644 new mode 100755 diff --git a/table/merger.h b/table/merger.h old mode 100644 new mode 100755 diff --git a/table/table.cc b/table/table.cc old mode 100644 new mode 100755 diff --git a/table/table_builder.cc b/table/table_builder.cc old mode 100644 new mode 100755 diff --git a/table/table_test.cc b/table/table_test.cc old mode 100644 new mode 100755 index e0c7134..4b3e85e --- a/table/table_test.cc +++ b/table/table_test.cc @@ -725,10 +725,10 @@ TEST(Harness, RandomizedLongDB) { Test(&rnd); // We must have created enough data to force merging - uint64_t l0_files, l1_files; + std::string l0_files, l1_files; ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level0", &l0_files)); ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level1", &l1_files)); - ASSERT_GT(l0_files + l1_files, 0); + ASSERT_GT(atoi(l0_files.c_str()) + atoi(l1_files.c_str()), 0); } diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc old mode 100644 new mode 100755 diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h old mode 100644 new mode 100755 diff --git a/util/arena.cc b/util/arena.cc old mode 100644 new mode 100755 diff --git a/util/arena.h b/util/arena.h old mode 100644 new mode 100755 diff --git a/util/arena_test.cc b/util/arena_test.cc old mode 100644 new mode 100755 diff --git a/util/cache.cc b/util/cache.cc old mode 100644 new mode 100755 diff --git a/util/cache_test.cc b/util/cache_test.cc old mode 100644 new mode 100755 diff --git a/util/coding.cc b/util/coding.cc old mode 100644 new mode 100755 diff --git a/util/coding.h b/util/coding.h old mode 100644 new mode 100755 diff --git a/util/coding_test.cc b/util/coding_test.cc old mode 100644 new mode 100755 diff --git a/util/comparator.cc b/util/comparator.cc old mode 100644 new mode 100755 diff --git a/util/crc32c.cc b/util/crc32c.cc old mode 100644 new mode 100755 diff --git a/util/crc32c.h b/util/crc32c.h old mode 100644 new mode 100755 diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc old mode 100644 new mode 100755 diff --git a/util/env.cc b/util/env.cc old mode 100644 new mode 100755 diff --git a/util/env_chromium.cc b/util/env_chromium.cc old mode 100644 new mode 100755 diff --git a/util/env_posix.cc b/util/env_posix.cc old mode 100644 new mode 100755 diff --git a/util/env_test.cc b/util/env_test.cc old mode 100644 new mode 100755 diff --git a/util/hash.cc b/util/hash.cc old mode 100644 new mode 100755 diff --git a/util/hash.h b/util/hash.h old mode 100644 new mode 100755 diff --git a/util/histogram.cc b/util/histogram.cc old mode 100644 new mode 100755 diff --git a/util/histogram.h b/util/histogram.h old mode 100644 new mode 100755 diff --git a/util/logging.cc b/util/logging.cc old mode 100644 new mode 100755 diff --git a/util/logging.h b/util/logging.h old mode 100644 new mode 100755 diff --git a/util/mutexlock.h b/util/mutexlock.h old mode 100644 new mode 100755 diff --git a/util/options.cc b/util/options.cc old mode 100644 new mode 100755 index 421608b..29272fe --- a/util/options.cc +++ b/util/options.cc @@ -16,11 +16,11 @@ Options::Options() paranoid_checks(false), env(Env::Default()), info_log(NULL), - write_buffer_size(1<<20), + write_buffer_size(4<<20), max_open_files(1000), large_value_threshold(65536), block_cache(NULL), - block_size(8192), + block_size(4096), block_restart_interval(16), compression(kSnappyCompression) { } diff --git a/util/random.h b/util/random.h old mode 100644 new mode 100755 diff --git a/util/status.cc b/util/status.cc 
old mode 100644 new mode 100755 diff --git a/util/testharness.cc b/util/testharness.cc old mode 100644 new mode 100755 diff --git a/util/testharness.h b/util/testharness.h old mode 100644 new mode 100755 diff --git a/util/testutil.cc b/util/testutil.cc old mode 100644 new mode 100755 diff --git a/util/testutil.h b/util/testutil.h old mode 100644 new mode 100755 -- cgit v1.2.3 From e5d186b89ea9c2a8b6ff1d687370e70787eea9d8 Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Mon, 18 Apr 2011 23:15:58 +0000 Subject: chmod a-x git-svn-id: http://leveldb.googlecode.com/svn/trunk@21 62dab493-f737-651d-591e-8d6aee1b9529 --- AUTHORS | 0 LICENSE | 0 Makefile | 0 README | 0 TODO | 0 db/builder.cc | 0 db/builder.h | 0 db/corruption_test.cc | 0 db/db_bench.cc | 0 db/db_impl.cc | 0 db/db_impl.h | 0 db/db_iter.cc | 0 db/db_iter.h | 0 db/db_test.cc | 0 db/dbformat.cc | 0 db/dbformat.h | 0 db/dbformat_test.cc | 0 db/filename.cc | 0 db/filename.h | 0 db/filename_test.cc | 0 db/log_format.h | 0 db/log_reader.cc | 0 db/log_reader.h | 0 db/log_test.cc | 0 db/log_writer.cc | 0 db/log_writer.h | 0 db/memtable.cc | 0 db/memtable.h | 0 db/repair.cc | 0 db/skiplist.h | 0 db/skiplist_test.cc | 0 db/snapshot.h | 0 db/table_cache.cc | 0 db/table_cache.h | 0 db/version_edit.cc | 0 db/version_edit.h | 0 db/version_edit_test.cc | 0 db/version_set.cc | 0 db/version_set.h | 0 db/write_batch.cc | 0 db/write_batch_internal.h | 0 db/write_batch_test.cc | 0 doc/doc.css | 0 doc/impl.html | 0 doc/index.html | 0 doc/log_format.txt | 0 doc/table_format.txt | 0 include/leveldb/cache.h | 0 include/leveldb/comparator.h | 0 include/leveldb/db.h | 0 include/leveldb/env.h | 0 include/leveldb/iterator.h | 0 include/leveldb/options.h | 0 include/leveldb/slice.h | 0 include/leveldb/status.h | 0 include/leveldb/table.h | 0 include/leveldb/table_builder.h | 0 include/leveldb/write_batch.h | 0 leveldb.gyp | 0 port/README | 0 port/port.h | 0 port/port_android.cc | 0 port/port_android.h | 0 port/port_chromium.cc | 0 port/port_chromium.h | 0 port/port_example.h | 0 port/port_posix.cc | 0 port/port_posix.h | 0 port/sha1_portable.cc | 0 port/sha1_portable.h | 0 port/sha1_test.cc | 0 port/win/stdint.h | 0 table/block.cc | 0 table/block.h | 0 table/block_builder.cc | 0 table/block_builder.h | 0 table/format.cc | 0 table/format.h | 0 table/iterator.cc | 0 table/iterator_wrapper.h | 0 table/merger.cc | 0 table/merger.h | 0 table/table.cc | 0 table/table_builder.cc | 0 table/table_test.cc | 0 table/two_level_iterator.cc | 0 table/two_level_iterator.h | 0 util/arena.cc | 0 util/arena.h | 0 util/arena_test.cc | 0 util/cache.cc | 0 util/cache_test.cc | 0 util/coding.cc | 0 util/coding.h | 0 util/coding_test.cc | 0 util/comparator.cc | 0 util/crc32c.cc | 0 util/crc32c.h | 0 util/crc32c_test.cc | 0 util/env.cc | 0 util/env_chromium.cc | 0 util/env_posix.cc | 0 util/env_test.cc | 0 util/hash.cc | 0 util/hash.h | 0 util/histogram.cc | 0 util/histogram.h | 0 util/logging.cc | 0 util/logging.h | 0 util/mutexlock.h | 0 util/options.cc | 0 util/random.h | 0 util/status.cc | 0 util/testharness.cc | 0 util/testharness.h | 0 util/testutil.cc | 0 util/testutil.h | 0 117 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 AUTHORS mode change 100755 => 100644 LICENSE mode change 100755 => 100644 Makefile mode change 100755 => 100644 README mode change 100755 => 100644 TODO mode change 100755 => 100644 db/builder.cc mode change 100755 => 100644 db/builder.h mode change 100755 => 100644 db/corruption_test.cc mode change 100755 => 100644 
db/db_bench.cc mode change 100755 => 100644 db/db_impl.cc mode change 100755 => 100644 db/db_impl.h mode change 100755 => 100644 db/db_iter.cc mode change 100755 => 100644 db/db_iter.h mode change 100755 => 100644 db/db_test.cc mode change 100755 => 100644 db/dbformat.cc mode change 100755 => 100644 db/dbformat.h mode change 100755 => 100644 db/dbformat_test.cc mode change 100755 => 100644 db/filename.cc mode change 100755 => 100644 db/filename.h mode change 100755 => 100644 db/filename_test.cc mode change 100755 => 100644 db/log_format.h mode change 100755 => 100644 db/log_reader.cc mode change 100755 => 100644 db/log_reader.h mode change 100755 => 100644 db/log_test.cc mode change 100755 => 100644 db/log_writer.cc mode change 100755 => 100644 db/log_writer.h mode change 100755 => 100644 db/memtable.cc mode change 100755 => 100644 db/memtable.h mode change 100755 => 100644 db/repair.cc mode change 100755 => 100644 db/skiplist.h mode change 100755 => 100644 db/skiplist_test.cc mode change 100755 => 100644 db/snapshot.h mode change 100755 => 100644 db/table_cache.cc mode change 100755 => 100644 db/table_cache.h mode change 100755 => 100644 db/version_edit.cc mode change 100755 => 100644 db/version_edit.h mode change 100755 => 100644 db/version_edit_test.cc mode change 100755 => 100644 db/version_set.cc mode change 100755 => 100644 db/version_set.h mode change 100755 => 100644 db/write_batch.cc mode change 100755 => 100644 db/write_batch_internal.h mode change 100755 => 100644 db/write_batch_test.cc mode change 100755 => 100644 doc/doc.css mode change 100755 => 100644 doc/impl.html mode change 100755 => 100644 doc/index.html mode change 100755 => 100644 doc/log_format.txt mode change 100755 => 100644 doc/table_format.txt mode change 100755 => 100644 include/leveldb/cache.h mode change 100755 => 100644 include/leveldb/comparator.h mode change 100755 => 100644 include/leveldb/db.h mode change 100755 => 100644 include/leveldb/env.h mode change 100755 => 100644 include/leveldb/iterator.h mode change 100755 => 100644 include/leveldb/options.h mode change 100755 => 100644 include/leveldb/slice.h mode change 100755 => 100644 include/leveldb/status.h mode change 100755 => 100644 include/leveldb/table.h mode change 100755 => 100644 include/leveldb/table_builder.h mode change 100755 => 100644 include/leveldb/write_batch.h mode change 100755 => 100644 leveldb.gyp mode change 100755 => 100644 port/README mode change 100755 => 100644 port/port.h mode change 100755 => 100644 port/port_android.cc mode change 100755 => 100644 port/port_android.h mode change 100755 => 100644 port/port_chromium.cc mode change 100755 => 100644 port/port_chromium.h mode change 100755 => 100644 port/port_example.h mode change 100755 => 100644 port/port_posix.cc mode change 100755 => 100644 port/port_posix.h mode change 100755 => 100644 port/sha1_portable.cc mode change 100755 => 100644 port/sha1_portable.h mode change 100755 => 100644 port/sha1_test.cc mode change 100755 => 100644 port/win/stdint.h mode change 100755 => 100644 table/block.cc mode change 100755 => 100644 table/block.h mode change 100755 => 100644 table/block_builder.cc mode change 100755 => 100644 table/block_builder.h mode change 100755 => 100644 table/format.cc mode change 100755 => 100644 table/format.h mode change 100755 => 100644 table/iterator.cc mode change 100755 => 100644 table/iterator_wrapper.h mode change 100755 => 100644 table/merger.cc mode change 100755 => 100644 table/merger.h mode change 100755 => 100644 table/table.cc mode change 100755 => 
100644 table/table_builder.cc mode change 100755 => 100644 table/table_test.cc mode change 100755 => 100644 table/two_level_iterator.cc mode change 100755 => 100644 table/two_level_iterator.h mode change 100755 => 100644 util/arena.cc mode change 100755 => 100644 util/arena.h mode change 100755 => 100644 util/arena_test.cc mode change 100755 => 100644 util/cache.cc mode change 100755 => 100644 util/cache_test.cc mode change 100755 => 100644 util/coding.cc mode change 100755 => 100644 util/coding.h mode change 100755 => 100644 util/coding_test.cc mode change 100755 => 100644 util/comparator.cc mode change 100755 => 100644 util/crc32c.cc mode change 100755 => 100644 util/crc32c.h mode change 100755 => 100644 util/crc32c_test.cc mode change 100755 => 100644 util/env.cc mode change 100755 => 100644 util/env_chromium.cc mode change 100755 => 100644 util/env_posix.cc mode change 100755 => 100644 util/env_test.cc mode change 100755 => 100644 util/hash.cc mode change 100755 => 100644 util/hash.h mode change 100755 => 100644 util/histogram.cc mode change 100755 => 100644 util/histogram.h mode change 100755 => 100644 util/logging.cc mode change 100755 => 100644 util/logging.h mode change 100755 => 100644 util/mutexlock.h mode change 100755 => 100644 util/options.cc mode change 100755 => 100644 util/random.h mode change 100755 => 100644 util/status.cc mode change 100755 => 100644 util/testharness.cc mode change 100755 => 100644 util/testharness.h mode change 100755 => 100644 util/testutil.cc mode change 100755 => 100644 util/testutil.h diff --git a/AUTHORS b/AUTHORS old mode 100755 new mode 100644 diff --git a/LICENSE b/LICENSE old mode 100755 new mode 100644 diff --git a/Makefile b/Makefile old mode 100755 new mode 100644 diff --git a/README b/README old mode 100755 new mode 100644 diff --git a/TODO b/TODO old mode 100755 new mode 100644 diff --git a/db/builder.cc b/db/builder.cc old mode 100755 new mode 100644 diff --git a/db/builder.h b/db/builder.h old mode 100755 new mode 100644 diff --git a/db/corruption_test.cc b/db/corruption_test.cc old mode 100755 new mode 100644 diff --git a/db/db_bench.cc b/db/db_bench.cc old mode 100755 new mode 100644 diff --git a/db/db_impl.cc b/db/db_impl.cc old mode 100755 new mode 100644 diff --git a/db/db_impl.h b/db/db_impl.h old mode 100755 new mode 100644 diff --git a/db/db_iter.cc b/db/db_iter.cc old mode 100755 new mode 100644 diff --git a/db/db_iter.h b/db/db_iter.h old mode 100755 new mode 100644 diff --git a/db/db_test.cc b/db/db_test.cc old mode 100755 new mode 100644 diff --git a/db/dbformat.cc b/db/dbformat.cc old mode 100755 new mode 100644 diff --git a/db/dbformat.h b/db/dbformat.h old mode 100755 new mode 100644 diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc old mode 100755 new mode 100644 diff --git a/db/filename.cc b/db/filename.cc old mode 100755 new mode 100644 diff --git a/db/filename.h b/db/filename.h old mode 100755 new mode 100644 diff --git a/db/filename_test.cc b/db/filename_test.cc old mode 100755 new mode 100644 diff --git a/db/log_format.h b/db/log_format.h old mode 100755 new mode 100644 diff --git a/db/log_reader.cc b/db/log_reader.cc old mode 100755 new mode 100644 diff --git a/db/log_reader.h b/db/log_reader.h old mode 100755 new mode 100644 diff --git a/db/log_test.cc b/db/log_test.cc old mode 100755 new mode 100644 diff --git a/db/log_writer.cc b/db/log_writer.cc old mode 100755 new mode 100644 diff --git a/db/log_writer.h b/db/log_writer.h old mode 100755 new mode 100644 diff --git a/db/memtable.cc b/db/memtable.cc old mode 
100755 new mode 100644 diff --git a/db/memtable.h b/db/memtable.h old mode 100755 new mode 100644 diff --git a/db/repair.cc b/db/repair.cc old mode 100755 new mode 100644 diff --git a/db/skiplist.h b/db/skiplist.h old mode 100755 new mode 100644 diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc old mode 100755 new mode 100644 diff --git a/db/snapshot.h b/db/snapshot.h old mode 100755 new mode 100644 diff --git a/db/table_cache.cc b/db/table_cache.cc old mode 100755 new mode 100644 diff --git a/db/table_cache.h b/db/table_cache.h old mode 100755 new mode 100644 diff --git a/db/version_edit.cc b/db/version_edit.cc old mode 100755 new mode 100644 diff --git a/db/version_edit.h b/db/version_edit.h old mode 100755 new mode 100644 diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc old mode 100755 new mode 100644 diff --git a/db/version_set.cc b/db/version_set.cc old mode 100755 new mode 100644 diff --git a/db/version_set.h b/db/version_set.h old mode 100755 new mode 100644 diff --git a/db/write_batch.cc b/db/write_batch.cc old mode 100755 new mode 100644 diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h old mode 100755 new mode 100644 diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc old mode 100755 new mode 100644 diff --git a/doc/doc.css b/doc/doc.css old mode 100755 new mode 100644 diff --git a/doc/impl.html b/doc/impl.html old mode 100755 new mode 100644 diff --git a/doc/index.html b/doc/index.html old mode 100755 new mode 100644 diff --git a/doc/log_format.txt b/doc/log_format.txt old mode 100755 new mode 100644 diff --git a/doc/table_format.txt b/doc/table_format.txt old mode 100755 new mode 100644 diff --git a/include/leveldb/cache.h b/include/leveldb/cache.h old mode 100755 new mode 100644 diff --git a/include/leveldb/comparator.h b/include/leveldb/comparator.h old mode 100755 new mode 100644 diff --git a/include/leveldb/db.h b/include/leveldb/db.h old mode 100755 new mode 100644 diff --git a/include/leveldb/env.h b/include/leveldb/env.h old mode 100755 new mode 100644 diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h old mode 100755 new mode 100644 diff --git a/include/leveldb/options.h b/include/leveldb/options.h old mode 100755 new mode 100644 diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h old mode 100755 new mode 100644 diff --git a/include/leveldb/status.h b/include/leveldb/status.h old mode 100755 new mode 100644 diff --git a/include/leveldb/table.h b/include/leveldb/table.h old mode 100755 new mode 100644 diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h old mode 100755 new mode 100644 diff --git a/include/leveldb/write_batch.h b/include/leveldb/write_batch.h old mode 100755 new mode 100644 diff --git a/leveldb.gyp b/leveldb.gyp old mode 100755 new mode 100644 diff --git a/port/README b/port/README old mode 100755 new mode 100644 diff --git a/port/port.h b/port/port.h old mode 100755 new mode 100644 diff --git a/port/port_android.cc b/port/port_android.cc old mode 100755 new mode 100644 diff --git a/port/port_android.h b/port/port_android.h old mode 100755 new mode 100644 diff --git a/port/port_chromium.cc b/port/port_chromium.cc old mode 100755 new mode 100644 diff --git a/port/port_chromium.h b/port/port_chromium.h old mode 100755 new mode 100644 diff --git a/port/port_example.h b/port/port_example.h old mode 100755 new mode 100644 diff --git a/port/port_posix.cc b/port/port_posix.cc old mode 100755 new mode 100644 diff --git a/port/port_posix.h b/port/port_posix.h old 
mode 100755 new mode 100644 diff --git a/port/sha1_portable.cc b/port/sha1_portable.cc old mode 100755 new mode 100644 diff --git a/port/sha1_portable.h b/port/sha1_portable.h old mode 100755 new mode 100644 diff --git a/port/sha1_test.cc b/port/sha1_test.cc old mode 100755 new mode 100644 diff --git a/port/win/stdint.h b/port/win/stdint.h old mode 100755 new mode 100644 diff --git a/table/block.cc b/table/block.cc old mode 100755 new mode 100644 diff --git a/table/block.h b/table/block.h old mode 100755 new mode 100644 diff --git a/table/block_builder.cc b/table/block_builder.cc old mode 100755 new mode 100644 diff --git a/table/block_builder.h b/table/block_builder.h old mode 100755 new mode 100644 diff --git a/table/format.cc b/table/format.cc old mode 100755 new mode 100644 diff --git a/table/format.h b/table/format.h old mode 100755 new mode 100644 diff --git a/table/iterator.cc b/table/iterator.cc old mode 100755 new mode 100644 diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h old mode 100755 new mode 100644 diff --git a/table/merger.cc b/table/merger.cc old mode 100755 new mode 100644 diff --git a/table/merger.h b/table/merger.h old mode 100755 new mode 100644 diff --git a/table/table.cc b/table/table.cc old mode 100755 new mode 100644 diff --git a/table/table_builder.cc b/table/table_builder.cc old mode 100755 new mode 100644 diff --git a/table/table_test.cc b/table/table_test.cc old mode 100755 new mode 100644 diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc old mode 100755 new mode 100644 diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h old mode 100755 new mode 100644 diff --git a/util/arena.cc b/util/arena.cc old mode 100755 new mode 100644 diff --git a/util/arena.h b/util/arena.h old mode 100755 new mode 100644 diff --git a/util/arena_test.cc b/util/arena_test.cc old mode 100755 new mode 100644 diff --git a/util/cache.cc b/util/cache.cc old mode 100755 new mode 100644 diff --git a/util/cache_test.cc b/util/cache_test.cc old mode 100755 new mode 100644 diff --git a/util/coding.cc b/util/coding.cc old mode 100755 new mode 100644 diff --git a/util/coding.h b/util/coding.h old mode 100755 new mode 100644 diff --git a/util/coding_test.cc b/util/coding_test.cc old mode 100755 new mode 100644 diff --git a/util/comparator.cc b/util/comparator.cc old mode 100755 new mode 100644 diff --git a/util/crc32c.cc b/util/crc32c.cc old mode 100755 new mode 100644 diff --git a/util/crc32c.h b/util/crc32c.h old mode 100755 new mode 100644 diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc old mode 100755 new mode 100644 diff --git a/util/env.cc b/util/env.cc old mode 100755 new mode 100644 diff --git a/util/env_chromium.cc b/util/env_chromium.cc old mode 100755 new mode 100644 diff --git a/util/env_posix.cc b/util/env_posix.cc old mode 100755 new mode 100644 diff --git a/util/env_test.cc b/util/env_test.cc old mode 100755 new mode 100644 diff --git a/util/hash.cc b/util/hash.cc old mode 100755 new mode 100644 diff --git a/util/hash.h b/util/hash.h old mode 100755 new mode 100644 diff --git a/util/histogram.cc b/util/histogram.cc old mode 100755 new mode 100644 diff --git a/util/histogram.h b/util/histogram.h old mode 100755 new mode 100644 diff --git a/util/logging.cc b/util/logging.cc old mode 100755 new mode 100644 diff --git a/util/logging.h b/util/logging.h old mode 100755 new mode 100644 diff --git a/util/mutexlock.h b/util/mutexlock.h old mode 100755 new mode 100644 diff --git a/util/options.cc b/util/options.cc old mode 100755 new 
mode 100644 diff --git a/util/random.h b/util/random.h old mode 100755 new mode 100644 diff --git a/util/status.cc b/util/status.cc old mode 100755 new mode 100644 diff --git a/util/testharness.cc b/util/testharness.cc old mode 100755 new mode 100644 diff --git a/util/testharness.h b/util/testharness.h old mode 100755 new mode 100644 diff --git a/util/testutil.cc b/util/testutil.cc old mode 100755 new mode 100644 diff --git a/util/testutil.h b/util/testutil.h old mode 100755 new mode 100644 -- cgit v1.2.3 From 8540066c2705c6d2a0595d468a6ab041e4fc72fa Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Tue, 19 Apr 2011 23:01:25 +0000 Subject: Revision created by MOE tool push_codebase. MOE_MIGRATION= git-svn-id: http://leveldb.googlecode.com/svn/trunk@22 62dab493-f737-651d-591e-8d6aee1b9529 --- AUTHORS | 8 - LICENSE | 27 - Makefile | 134 --- README | 51 -- TODO | 14 - db/builder.cc | 99 --- db/builder.h | 36 - db/corruption_test.cc | 378 --------- db/db_bench.cc | 635 --------------- db/db_impl.cc | 1345 ------------------------------- db/db_impl.h | 207 ----- db/db_iter.cc | 397 --------- db/db_iter.h | 26 - db/db_test.cc | 1211 ---------------------------- db/dbformat.cc | 152 ---- db/dbformat.h | 204 ----- db/dbformat_test.cc | 127 --- db/filename.cc | 154 ---- db/filename.h | 92 --- db/filename_test.cc | 156 ---- db/log_format.h | 35 - db/log_reader.cc | 176 ---- db/log_reader.h | 75 -- db/log_test.cc | 361 --------- db/log_writer.cc | 102 --- db/log_writer.h | 48 -- db/memtable.cc | 109 --- db/memtable.h | 69 -- db/repair.cc | 396 --------- db/skiplist.h | 378 --------- db/skiplist_test.cc | 378 --------- db/snapshot.h | 66 -- db/table_cache.cc | 95 --- db/table_cache.h | 50 -- db/version_edit.cc | 301 ------- db/version_edit.h | 124 --- db/version_edit_test.cc | 50 -- db/version_set.cc | 1120 ------------------------- db/version_set.h | 332 -------- db/write_batch.cc | 164 ---- db/write_batch_internal.h | 73 -- db/write_batch_test.cc | 110 --- doc/doc.css | 89 -- doc/impl.html | 228 ------ doc/index.html | 509 ------------ doc/log_format.txt | 75 -- doc/table_format.txt | 61 -- include/leveldb/cache.h | 99 --- include/leveldb/comparator.h | 61 -- include/leveldb/db.h | 142 ---- include/leveldb/env.h | 290 ------- include/leveldb/iterator.h | 95 --- include/leveldb/options.h | 208 ----- include/leveldb/slice.h | 104 --- include/leveldb/status.h | 86 -- include/leveldb/table.h | 69 -- include/leveldb/table_builder.h | 86 -- include/leveldb/write_batch.h | 49 -- leveldb.gyp | 327 -------- leveldb/AUTHORS | 8 + leveldb/LICENSE | 27 + leveldb/Makefile | 129 +++ leveldb/README | 51 ++ leveldb/TODO | 14 + leveldb/db/builder.cc | 90 +++ leveldb/db/builder.h | 36 + leveldb/db/corruption_test.cc | 354 ++++++++ leveldb/db/db_bench.cc | 613 ++++++++++++++ leveldb/db/db_impl.cc | 1188 +++++++++++++++++++++++++++ leveldb/db/db_impl.h | 184 +++++ leveldb/db/db_iter.cc | 298 +++++++ leveldb/db/db_iter.h | 26 + leveldb/db/db_test.cc | 1030 +++++++++++++++++++++++ leveldb/db/dbformat.cc | 87 ++ leveldb/db/dbformat.h | 155 ++++ leveldb/db/dbformat_test.cc | 112 +++ leveldb/db/filename.cc | 135 ++++ leveldb/db/filename.h | 80 ++ leveldb/db/filename_test.cc | 122 +++ leveldb/db/log_format.h | 35 + leveldb/db/log_reader.cc | 176 ++++ leveldb/db/log_reader.h | 75 ++ leveldb/db/log_test.cc | 361 +++++++++ leveldb/db/log_writer.cc | 102 +++ leveldb/db/log_writer.h | 48 ++ leveldb/db/memtable.cc | 109 +++ leveldb/db/memtable.h | 69 ++ leveldb/db/repair.cc | 380 +++++++++ leveldb/db/skiplist.h | 378 
+++++++++ leveldb/db/skiplist_test.cc | 378 +++++++++ leveldb/db/snapshot.h | 66 ++ leveldb/db/table_cache.cc | 95 +++ leveldb/db/table_cache.h | 50 ++ leveldb/db/version_edit.cc | 268 ++++++ leveldb/db/version_edit.h | 106 +++ leveldb/db/version_edit_test.cc | 46 ++ leveldb/db/version_set.cc | 1027 +++++++++++++++++++++++ leveldb/db/version_set.h | 308 +++++++ leveldb/db/write_batch.cc | 148 ++++ leveldb/db/write_batch_internal.h | 69 ++ leveldb/db/write_batch_test.cc | 87 ++ leveldb/doc/doc.css | 89 ++ leveldb/doc/impl.html | 217 +++++ leveldb/doc/index.html | 498 ++++++++++++ leveldb/doc/log_format.txt | 75 ++ leveldb/doc/table_format.txt | 61 ++ leveldb/include/leveldb/cache.h | 99 +++ leveldb/include/leveldb/comparator.h | 61 ++ leveldb/include/leveldb/db.h | 142 ++++ leveldb/include/leveldb/env.h | 290 +++++++ leveldb/include/leveldb/iterator.h | 95 +++ leveldb/include/leveldb/options.h | 198 +++++ leveldb/include/leveldb/slice.h | 104 +++ leveldb/include/leveldb/status.h | 86 ++ leveldb/include/leveldb/table.h | 69 ++ leveldb/include/leveldb/table_builder.h | 86 ++ leveldb/include/leveldb/write_batch.h | 49 ++ leveldb/leveldb.gyp | 315 ++++++++ leveldb/port/README | 10 + leveldb/port/port.h | 21 + leveldb/port/port_android.cc | 64 ++ leveldb/port/port_android.h | 150 ++++ leveldb/port/port_chromium.cc | 80 ++ leveldb/port/port_chromium.h | 97 +++ leveldb/port/port_example.h | 115 +++ leveldb/port/port_posix.cc | 50 ++ leveldb/port/port_posix.h | 94 +++ leveldb/port/win/stdint.h | 24 + leveldb/table/block.cc | 263 ++++++ leveldb/table/block.h | 43 + leveldb/table/block_builder.cc | 109 +++ leveldb/table/block_builder.h | 57 ++ leveldb/table/format.cc | 131 +++ leveldb/table/format.h | 103 +++ leveldb/table/iterator.cc | 68 ++ leveldb/table/iterator_wrapper.h | 64 ++ leveldb/table/merger.cc | 197 +++++ leveldb/table/merger.h | 26 + leveldb/table/table.cc | 175 ++++ leveldb/table/table_builder.cc | 227 ++++++ leveldb/table/table_test.cc | 841 +++++++++++++++++++ leveldb/table/two_level_iterator.cc | 182 +++++ leveldb/table/two_level_iterator.h | 34 + leveldb/util/arena.cc | 68 ++ leveldb/util/arena.h | 68 ++ leveldb/util/arena_test.cc | 68 ++ leveldb/util/cache.cc | 253 ++++++ leveldb/util/cache_test.cc | 169 ++++ leveldb/util/coding.cc | 194 +++++ leveldb/util/coding.h | 104 +++ leveldb/util/coding_test.cc | 173 ++++ leveldb/util/comparator.cc | 72 ++ leveldb/util/crc32c.cc | 332 ++++++++ leveldb/util/crc32c.h | 45 ++ leveldb/util/crc32c_test.cc | 72 ++ leveldb/util/env.cc | 77 ++ leveldb/util/env_chromium.cc | 603 ++++++++++++++ leveldb/util/env_posix.cc | 599 ++++++++++++++ leveldb/util/env_test.cc | 102 +++ leveldb/util/hash.cc | 45 ++ leveldb/util/hash.h | 19 + leveldb/util/histogram.cc | 128 +++ leveldb/util/histogram.h | 41 + leveldb/util/logging.cc | 81 ++ leveldb/util/logging.h | 47 ++ leveldb/util/mutexlock.h | 39 + leveldb/util/options.cc | 28 + leveldb/util/random.h | 59 ++ leveldb/util/status.cc | 59 ++ leveldb/util/testharness.cc | 65 ++ leveldb/util/testharness.h | 129 +++ leveldb/util/testutil.cc | 51 ++ leveldb/util/testutil.h | 53 ++ port/README | 10 - port/port.h | 21 - port/port_android.cc | 64 -- port/port_android.h | 158 ---- port/port_chromium.cc | 80 -- port/port_chromium.h | 104 --- port/port_example.h | 120 --- port/port_posix.cc | 50 -- port/port_posix.h | 99 --- port/sha1_portable.cc | 298 ------- port/sha1_portable.h | 25 - port/sha1_test.cc | 39 - port/win/stdint.h | 24 - table/block.cc | 261 ------ table/block.h | 43 - table/block_builder.cc | 109 --- 
table/block_builder.h | 57 -- table/format.cc | 131 --- table/format.h | 103 --- table/iterator.cc | 68 -- table/iterator_wrapper.h | 64 -- table/merger.cc | 197 ----- table/merger.h | 26 - table/table.cc | 175 ---- table/table_builder.cc | 227 ------ table/table_test.cc | 841 ------------------- table/two_level_iterator.cc | 182 ----- table/two_level_iterator.h | 34 - util/arena.cc | 68 -- util/arena.h | 68 -- util/arena_test.cc | 68 -- util/cache.cc | 253 ------ util/cache_test.cc | 169 ---- util/coding.cc | 194 ----- util/coding.h | 104 --- util/coding_test.cc | 173 ---- util/comparator.cc | 72 -- util/crc32c.cc | 332 -------- util/crc32c.h | 45 -- util/crc32c_test.cc | 72 -- util/env.cc | 77 -- util/env_chromium.cc | 603 -------------- util/env_posix.cc | 599 -------------- util/env_test.cc | 102 --- util/hash.cc | 45 -- util/hash.h | 19 - util/histogram.cc | 128 --- util/histogram.h | 41 - util/logging.cc | 81 -- util/logging.h | 47 -- util/mutexlock.h | 39 - util/options.cc | 29 - util/random.h | 59 -- util/status.cc | 59 -- util/testharness.cc | 65 -- util/testharness.h | 129 --- util/testutil.cc | 51 -- util/testutil.h | 53 -- 231 files changed, 18722 insertions(+), 20097 deletions(-) delete mode 100644 AUTHORS delete mode 100644 LICENSE delete mode 100644 Makefile delete mode 100644 README delete mode 100644 TODO delete mode 100644 db/builder.cc delete mode 100644 db/builder.h delete mode 100644 db/corruption_test.cc delete mode 100644 db/db_bench.cc delete mode 100644 db/db_impl.cc delete mode 100644 db/db_impl.h delete mode 100644 db/db_iter.cc delete mode 100644 db/db_iter.h delete mode 100644 db/db_test.cc delete mode 100644 db/dbformat.cc delete mode 100644 db/dbformat.h delete mode 100644 db/dbformat_test.cc delete mode 100644 db/filename.cc delete mode 100644 db/filename.h delete mode 100644 db/filename_test.cc delete mode 100644 db/log_format.h delete mode 100644 db/log_reader.cc delete mode 100644 db/log_reader.h delete mode 100644 db/log_test.cc delete mode 100644 db/log_writer.cc delete mode 100644 db/log_writer.h delete mode 100644 db/memtable.cc delete mode 100644 db/memtable.h delete mode 100644 db/repair.cc delete mode 100644 db/skiplist.h delete mode 100644 db/skiplist_test.cc delete mode 100644 db/snapshot.h delete mode 100644 db/table_cache.cc delete mode 100644 db/table_cache.h delete mode 100644 db/version_edit.cc delete mode 100644 db/version_edit.h delete mode 100644 db/version_edit_test.cc delete mode 100644 db/version_set.cc delete mode 100644 db/version_set.h delete mode 100644 db/write_batch.cc delete mode 100644 db/write_batch_internal.h delete mode 100644 db/write_batch_test.cc delete mode 100644 doc/doc.css delete mode 100644 doc/impl.html delete mode 100644 doc/index.html delete mode 100644 doc/log_format.txt delete mode 100644 doc/table_format.txt delete mode 100644 include/leveldb/cache.h delete mode 100644 include/leveldb/comparator.h delete mode 100644 include/leveldb/db.h delete mode 100644 include/leveldb/env.h delete mode 100644 include/leveldb/iterator.h delete mode 100644 include/leveldb/options.h delete mode 100644 include/leveldb/slice.h delete mode 100644 include/leveldb/status.h delete mode 100644 include/leveldb/table.h delete mode 100644 include/leveldb/table_builder.h delete mode 100644 include/leveldb/write_batch.h delete mode 100644 leveldb.gyp create mode 100644 leveldb/AUTHORS create mode 100644 leveldb/LICENSE create mode 100644 leveldb/Makefile create mode 100644 leveldb/README create mode 100644 leveldb/TODO create mode 100644 
leveldb/db/builder.cc create mode 100644 leveldb/db/builder.h create mode 100644 leveldb/db/corruption_test.cc create mode 100644 leveldb/db/db_bench.cc create mode 100644 leveldb/db/db_impl.cc create mode 100644 leveldb/db/db_impl.h create mode 100644 leveldb/db/db_iter.cc create mode 100644 leveldb/db/db_iter.h create mode 100644 leveldb/db/db_test.cc create mode 100644 leveldb/db/dbformat.cc create mode 100644 leveldb/db/dbformat.h create mode 100644 leveldb/db/dbformat_test.cc create mode 100644 leveldb/db/filename.cc create mode 100644 leveldb/db/filename.h create mode 100644 leveldb/db/filename_test.cc create mode 100644 leveldb/db/log_format.h create mode 100644 leveldb/db/log_reader.cc create mode 100644 leveldb/db/log_reader.h create mode 100644 leveldb/db/log_test.cc create mode 100644 leveldb/db/log_writer.cc create mode 100644 leveldb/db/log_writer.h create mode 100644 leveldb/db/memtable.cc create mode 100644 leveldb/db/memtable.h create mode 100644 leveldb/db/repair.cc create mode 100644 leveldb/db/skiplist.h create mode 100644 leveldb/db/skiplist_test.cc create mode 100644 leveldb/db/snapshot.h create mode 100644 leveldb/db/table_cache.cc create mode 100644 leveldb/db/table_cache.h create mode 100644 leveldb/db/version_edit.cc create mode 100644 leveldb/db/version_edit.h create mode 100644 leveldb/db/version_edit_test.cc create mode 100644 leveldb/db/version_set.cc create mode 100644 leveldb/db/version_set.h create mode 100644 leveldb/db/write_batch.cc create mode 100644 leveldb/db/write_batch_internal.h create mode 100644 leveldb/db/write_batch_test.cc create mode 100644 leveldb/doc/doc.css create mode 100644 leveldb/doc/impl.html create mode 100644 leveldb/doc/index.html create mode 100644 leveldb/doc/log_format.txt create mode 100644 leveldb/doc/table_format.txt create mode 100644 leveldb/include/leveldb/cache.h create mode 100644 leveldb/include/leveldb/comparator.h create mode 100644 leveldb/include/leveldb/db.h create mode 100644 leveldb/include/leveldb/env.h create mode 100644 leveldb/include/leveldb/iterator.h create mode 100644 leveldb/include/leveldb/options.h create mode 100644 leveldb/include/leveldb/slice.h create mode 100644 leveldb/include/leveldb/status.h create mode 100644 leveldb/include/leveldb/table.h create mode 100644 leveldb/include/leveldb/table_builder.h create mode 100644 leveldb/include/leveldb/write_batch.h create mode 100644 leveldb/leveldb.gyp create mode 100644 leveldb/port/README create mode 100644 leveldb/port/port.h create mode 100644 leveldb/port/port_android.cc create mode 100644 leveldb/port/port_android.h create mode 100644 leveldb/port/port_chromium.cc create mode 100644 leveldb/port/port_chromium.h create mode 100644 leveldb/port/port_example.h create mode 100644 leveldb/port/port_posix.cc create mode 100644 leveldb/port/port_posix.h create mode 100644 leveldb/port/win/stdint.h create mode 100644 leveldb/table/block.cc create mode 100644 leveldb/table/block.h create mode 100644 leveldb/table/block_builder.cc create mode 100644 leveldb/table/block_builder.h create mode 100644 leveldb/table/format.cc create mode 100644 leveldb/table/format.h create mode 100644 leveldb/table/iterator.cc create mode 100644 leveldb/table/iterator_wrapper.h create mode 100644 leveldb/table/merger.cc create mode 100644 leveldb/table/merger.h create mode 100644 leveldb/table/table.cc create mode 100644 leveldb/table/table_builder.cc create mode 100644 leveldb/table/table_test.cc create mode 100644 leveldb/table/two_level_iterator.cc create mode 100644 
leveldb/table/two_level_iterator.h create mode 100644 leveldb/util/arena.cc create mode 100644 leveldb/util/arena.h create mode 100644 leveldb/util/arena_test.cc create mode 100644 leveldb/util/cache.cc create mode 100644 leveldb/util/cache_test.cc create mode 100644 leveldb/util/coding.cc create mode 100644 leveldb/util/coding.h create mode 100644 leveldb/util/coding_test.cc create mode 100644 leveldb/util/comparator.cc create mode 100644 leveldb/util/crc32c.cc create mode 100644 leveldb/util/crc32c.h create mode 100644 leveldb/util/crc32c_test.cc create mode 100644 leveldb/util/env.cc create mode 100644 leveldb/util/env_chromium.cc create mode 100644 leveldb/util/env_posix.cc create mode 100644 leveldb/util/env_test.cc create mode 100644 leveldb/util/hash.cc create mode 100644 leveldb/util/hash.h create mode 100644 leveldb/util/histogram.cc create mode 100644 leveldb/util/histogram.h create mode 100644 leveldb/util/logging.cc create mode 100644 leveldb/util/logging.h create mode 100644 leveldb/util/mutexlock.h create mode 100644 leveldb/util/options.cc create mode 100644 leveldb/util/random.h create mode 100644 leveldb/util/status.cc create mode 100644 leveldb/util/testharness.cc create mode 100644 leveldb/util/testharness.h create mode 100644 leveldb/util/testutil.cc create mode 100644 leveldb/util/testutil.h delete mode 100644 port/README delete mode 100644 port/port.h delete mode 100644 port/port_android.cc delete mode 100644 port/port_android.h delete mode 100644 port/port_chromium.cc delete mode 100644 port/port_chromium.h delete mode 100644 port/port_example.h delete mode 100644 port/port_posix.cc delete mode 100644 port/port_posix.h delete mode 100644 port/sha1_portable.cc delete mode 100644 port/sha1_portable.h delete mode 100644 port/sha1_test.cc delete mode 100644 port/win/stdint.h delete mode 100644 table/block.cc delete mode 100644 table/block.h delete mode 100644 table/block_builder.cc delete mode 100644 table/block_builder.h delete mode 100644 table/format.cc delete mode 100644 table/format.h delete mode 100644 table/iterator.cc delete mode 100644 table/iterator_wrapper.h delete mode 100644 table/merger.cc delete mode 100644 table/merger.h delete mode 100644 table/table.cc delete mode 100644 table/table_builder.cc delete mode 100644 table/table_test.cc delete mode 100644 table/two_level_iterator.cc delete mode 100644 table/two_level_iterator.h delete mode 100644 util/arena.cc delete mode 100644 util/arena.h delete mode 100644 util/arena_test.cc delete mode 100644 util/cache.cc delete mode 100644 util/cache_test.cc delete mode 100644 util/coding.cc delete mode 100644 util/coding.h delete mode 100644 util/coding_test.cc delete mode 100644 util/comparator.cc delete mode 100644 util/crc32c.cc delete mode 100644 util/crc32c.h delete mode 100644 util/crc32c_test.cc delete mode 100644 util/env.cc delete mode 100644 util/env_chromium.cc delete mode 100644 util/env_posix.cc delete mode 100644 util/env_test.cc delete mode 100644 util/hash.cc delete mode 100644 util/hash.h delete mode 100644 util/histogram.cc delete mode 100644 util/histogram.h delete mode 100644 util/logging.cc delete mode 100644 util/logging.h delete mode 100644 util/mutexlock.h delete mode 100644 util/options.cc delete mode 100644 util/random.h delete mode 100644 util/status.cc delete mode 100644 util/testharness.cc delete mode 100644 util/testharness.h delete mode 100644 util/testutil.cc delete mode 100644 util/testutil.h diff --git a/AUTHORS b/AUTHORS deleted file mode 100644 index 27a9407..0000000 --- a/AUTHORS 
+++ /dev/null @@ -1,8 +0,0 @@ -# Names should be added to this file like so: -# Name or Organization <email address> - -Google Inc. - -# Initial version authors: -Jeffrey Dean <jeff@google.com> -Sanjay Ghemawat <sanjay@google.com> diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 8e80208..0000000 --- a/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2011 The LevelDB Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile deleted file mode 100644 index 7569701..0000000 --- a/Makefile +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2011 The LevelDB Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. See the AUTHORS file for names of contributors. - -CC = g++ - -# Uncomment one of the following to switch between debug and opt mode -#OPT = -O2 -DNDEBUG -OPT = -g2 - -CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I.
-I./include -std=c++0x $(OPT) - -LDFLAGS=-lpthread - -LIBOBJECTS = \ - ./db/builder.o \ - ./db/db_impl.o \ - ./db/db_iter.o \ - ./db/filename.o \ - ./db/dbformat.o \ - ./db/log_reader.o \ - ./db/log_writer.o \ - ./db/memtable.o \ - ./db/repair.o \ - ./db/table_cache.o \ - ./db/version_edit.o \ - ./db/version_set.o \ - ./db/write_batch.o \ - ./port/port_posix.o \ - ./port/sha1_portable.o \ - ./table/block.o \ - ./table/block_builder.o \ - ./table/format.o \ - ./table/iterator.o \ - ./table/merger.o \ - ./table/table.o \ - ./table/table_builder.o \ - ./table/two_level_iterator.o \ - ./util/arena.o \ - ./util/cache.o \ - ./util/coding.o \ - ./util/comparator.o \ - ./util/crc32c.o \ - ./util/env.o \ - ./util/env_posix.o \ - ./util/hash.o \ - ./util/histogram.o \ - ./util/logging.o \ - ./util/options.o \ - ./util/status.o - -TESTUTIL = ./util/testutil.o -TESTHARNESS = ./util/testharness.o $(TESTUTIL) - -TESTS = \ - arena_test \ - cache_test \ - coding_test \ - corruption_test \ - crc32c_test \ - db_test \ - dbformat_test \ - env_test \ - filename_test \ - log_test \ - sha1_test \ - skiplist_test \ - table_test \ - version_edit_test \ - write_batch_test - -PROGRAMS = db_bench $(TESTS) - -all: $(PROGRAMS) - -check: $(TESTS) - for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done - -clean: - rm -f $(PROGRAMS) */*.o - -db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) - $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ - -arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -sha1_test: port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -.cc.o: - $(CC) $(CFLAGS) $< -o $@ - -# 
TODO(gabor): dependencies for .o files -# TODO(gabor): Build library diff --git a/README b/README deleted file mode 100644 index c97e43c..0000000 --- a/README +++ /dev/null @@ -1,51 +0,0 @@ -leveldb: A key-value store -Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) - -The code under this directory implements a system for maintaining a -persistent key/value store. - -See doc/index.html for more explanation. -See doc/db_layout.txt for a brief overview of the implementation. - -The public interface is in include/*.h. Callers should not include or -rely on the details of any other header files in this package. Those -internal APIs may be changed without warning. - -Guide to header files: - -include/db.h - Main interface to the DB: Start here - -include/options.h - Control over the behavior of an entire database, and also - control over the behavior of individual reads and writes. - -include/comparator.h - Abstraction for user-specified comparison function. If you want - just bytewise comparison of keys, you can use the default comparator, - but clients can write their own comparator implementations if they - want custom ordering (e.g. to handle different character - encodings, etc.) - -include/iterator.h - Interface for iterating over data. You can get an iterator - from a DB object. - -include/write_batch.h - Interface for atomically applying multiple updates to a database. - -include/slice.h - A simple module for maintaining a pointer and a length into some - other byte array. - -include/status.h - Status is returned from many of the public interfaces and is used - to report success and various kinds of errors. - -include/env.h - Abstraction of the OS environment. A posix implementation of - this interface is in util/env_posix.cc - -include/table.h -include/table_builder.h - Lower-level modules that most clients probably won't use directly diff --git a/TODO b/TODO deleted file mode 100644 index 2f848b8..0000000 --- a/TODO +++ /dev/null @@ -1,14 +0,0 @@ -ss -- Stats - -db -- Maybe implement DB::BulkDeleteForRange(start_key, end_key) - that would blow away files whose ranges are entirely contained - within [start_key..end_key]? For Chrome, deletion of obsolete - object stores, etc. can be done in the background anyway, so - probably not that important. - -api changes? -- Efficient large value reading and writing - -Faster Get implementation diff --git a/db/builder.cc b/db/builder.cc deleted file mode 100644 index 6c8e6b8..0000000 --- a/db/builder.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "db/builder.h" - -#include "db/filename.h" -#include "db/dbformat.h" -#include "db/table_cache.h" -#include "db/version_edit.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" - -namespace leveldb { - -Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta, - VersionEdit* edit) { - Status s; - meta->file_size = 0; - iter->SeekToFirst(); - - std::string fname = TableFileName(dbname, meta->number); - if (iter->Valid()) { - WritableFile* file; - s = env->NewWritableFile(fname, &file); - if (!s.ok()) { - return s; - } - - TableBuilder* builder = new TableBuilder(options, file); - meta->smallest.DecodeFrom(iter->key()); - for (; iter->Valid(); iter->Next()) { - Slice key = iter->key(); - meta->largest.DecodeFrom(key); - if (ExtractValueType(key) == kTypeLargeValueRef) { - if (iter->value().size() != LargeValueRef::ByteSize()) { - s = Status::Corruption("invalid indirect reference hash value (L0)"); - break; - } - edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()), - meta->number, - iter->key()); - } - builder->Add(key, iter->value()); - } - - // Finish and check for builder errors - if (s.ok()) { - s = builder->Finish(); - if (s.ok()) { - meta->file_size = builder->FileSize(); - assert(meta->file_size > 0); - } - } else { - builder->Abandon(); - } - delete builder; - - // Finish and check for file errors - if (s.ok()) { - s = file->Sync(); - } - if (s.ok()) { - s = file->Close(); - } - delete file; - file = NULL; - - if (s.ok()) { - // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), - meta->number, - meta->file_size); - s = it->status(); - delete it; - } - } - - // Check for input iterator errors - if (!iter->status().ok()) { - s = iter->status(); - } - - if (s.ok() && meta->file_size > 0) { - edit->AddFile(0, meta->number, meta->file_size, - meta->smallest, meta->largest); - } else { - env->DeleteFile(fname); - } - return s; -} - -} diff --git a/db/builder.h b/db/builder.h deleted file mode 100644 index 4efcb04..0000000 --- a/db/builder.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_BUILDER_H_ -#define STORAGE_LEVELDB_DB_BUILDER_H_ - -#include "leveldb/status.h" - -namespace leveldb { - -struct Options; -struct FileMetaData; - -class Env; -class Iterator; -class TableCache; -class VersionEdit; - -// Build a Table file from the contents of *iter. The generated file -// will be named according to meta->number. On success, the rest of -// *meta will be filled with metadata about the generated table, and -// large value refs and the added file information will be added to -// *edit. If no data is present in *iter, meta->file_size will be set -// to zero, and no Table file will be produced. -extern Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta, - VersionEdit* edit); - -} - -#endif // STORAGE_LEVELDB_DB_BUILDER_H_ diff --git a/db/corruption_test.cc b/db/corruption_test.cc deleted file mode 100644 index 63d8d8b..0000000 --- a/db/corruption_test.cc +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "leveldb/db.h"
-
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include "leveldb/cache.h"
-#include "leveldb/env.h"
-#include "leveldb/table.h"
-#include "leveldb/write_batch.h"
-#include "db/db_impl.h"
-#include "db/filename.h"
-#include "db/log_format.h"
-#include "db/version_set.h"
-#include "util/logging.h"
-#include "util/testharness.h"
-#include "util/testutil.h"
-
-namespace leveldb {
-
-static const int kValueSize = 1000;
-
-class CorruptionTest {
- public:
-  test::ErrorEnv env_;
-  Random rnd_;
-  std::string dbname_;
-  Cache* tiny_cache_;
-  Options options_;
-  DB* db_;
-
-  CorruptionTest() : rnd_(test::RandomSeed()) {
-    tiny_cache_ = NewLRUCache(100);
-    options_.env = &env_;
-    dbname_ = test::TmpDir() + "/db_test";
-    DestroyDB(dbname_, options_);
-
-    db_ = NULL;
-    options_.create_if_missing = true;
-    Reopen();
-    options_.create_if_missing = false;
-  }
-
-  ~CorruptionTest() {
-    delete db_;
-    DestroyDB(dbname_, Options());
-    delete tiny_cache_;
-  }
-
-  Status TryReopen(Options* options = NULL) {
-    delete db_;
-    db_ = NULL;
-    Options opt = (options ? *options : options_);
-    opt.env = &env_;
-    opt.block_cache = tiny_cache_;
-    return DB::Open(opt, dbname_, &db_);
-  }
-
-  void Reopen(Options* options = NULL) {
-    ASSERT_OK(TryReopen(options));
-  }
-
-  void RepairDB() {
-    delete db_;
-    db_ = NULL;
-    ASSERT_OK(::leveldb::RepairDB(dbname_, options_));
-  }
-
-  void Build(int n) {
-    std::string key_space, value_space;
-    WriteBatch batch;
-    for (int i = 0; i < n; i++) {
-      //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
-      Slice key = Key(i, &key_space);
-      batch.Clear();
-      batch.Put(key, Value(i, &value_space));
-      ASSERT_OK(db_->Write(WriteOptions(), &batch));
-    }
-  }
-
-  void Check(int min_expected, int max_expected) {
-    int next_expected = 0;
-    int missed = 0;
-    int bad_keys = 0;
-    int bad_values = 0;
-    int correct = 0;
-    std::string value_space;
-    Iterator* iter = db_->NewIterator(ReadOptions());
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      uint64_t key;
-      Slice in(iter->key());
-      if (!ConsumeDecimalNumber(&in, &key) ||
-          !in.empty() ||
-          key < next_expected) {
-        bad_keys++;
-        continue;
-      }
-      missed += (key - next_expected);
-      next_expected = key + 1;
-      if (iter->value() != Value(key, &value_space)) {
-        bad_values++;
-      } else {
-        correct++;
-      }
-    }
-    delete iter;
-
-    fprintf(stderr,
-            "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
-            min_expected, max_expected, correct, bad_keys, bad_values, missed);
-    ASSERT_LE(min_expected, correct);
-    ASSERT_GE(max_expected, correct);
-  }
-
-  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
-    // Pick file to corrupt
-    std::vector<std::string> filenames;
-    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
-    uint64_t number;
-    LargeValueRef large_ref;
-    FileType type;
-    std::vector<std::string> candidates;
-    for (int i = 0; i < filenames.size(); i++) {
-      if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
-          type == filetype) {
-        candidates.push_back(dbname_ + "/" + filenames[i]);
-      }
-    }
-    ASSERT_TRUE(!candidates.empty()) << filetype;
-    std::string fname = candidates[rnd_.Uniform(candidates.size())];
-
-    struct stat sbuf;
-    if (stat(fname.c_str(), &sbuf) != 0) {
-      const char* msg = strerror(errno);
-      ASSERT_TRUE(false) << fname << ": " << msg;
-    }
-
-    if (offset < 0) {
-      // Relative to end of file; make it absolute
-      if (-offset > sbuf.st_size) {
-
offset = 0; - } else { - offset = sbuf.st_size + offset; - } - } - if (offset > sbuf.st_size) { - offset = sbuf.st_size; - } - if (offset + bytes_to_corrupt > sbuf.st_size) { - bytes_to_corrupt = sbuf.st_size - offset; - } - - // Do it - std::string contents; - Status s = ReadFileToString(Env::Default(), fname, &contents); - ASSERT_TRUE(s.ok()) << s.ToString(); - for (int i = 0; i < bytes_to_corrupt; i++) { - contents[i + offset] ^= 0x80; - } - s = WriteStringToFile(Env::Default(), contents, fname); - ASSERT_TRUE(s.ok()) << s.ToString(); - } - - int Property(const std::string& name) { - std::string property; - int result; - if (db_->GetProperty(name, &property) && - sscanf(property.c_str(), "%d", &result) == 1) { - return result; - } else { - return -1; - } - } - - // Return the ith key - Slice Key(int i, std::string* storage) { - char buf[100]; - snprintf(buf, sizeof(buf), "%016d", i); - storage->assign(buf, strlen(buf)); - return Slice(*storage); - } - - // Return the value to associate with the specified key - Slice Value(int k, std::string* storage) { - Random r(k); - return test::RandomString(&r, kValueSize, storage); - } -}; - -TEST(CorruptionTest, Recovery) { - Build(100); - Check(100, 100); - Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record - Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block - Reopen(); - - // The 64 records in the first two log blocks are completely lost. - Check(36, 36); -} - -TEST(CorruptionTest, RecoverWriteError) { - env_.writable_file_error_ = true; - Status s = TryReopen(); - ASSERT_TRUE(!s.ok()); -} - -TEST(CorruptionTest, NewFileErrorDuringWrite) { - // Do enough writing to force minor compaction - env_.writable_file_error_ = true; - const int num = 3 + (Options().write_buffer_size / kValueSize); - std::string value_storage; - Status s; - for (int i = 0; s.ok() && i < num; i++) { - WriteBatch batch; - batch.Put("a", Value(100, &value_storage)); - s = db_->Write(WriteOptions(), &batch); - } - ASSERT_TRUE(!s.ok()); - ASSERT_GE(env_.num_writable_file_errors_, 1); - env_.writable_file_error_ = false; - Reopen(); -} - -TEST(CorruptionTest, TableFile) { - Build(100); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - dbi->TEST_CompactRange(1, "", "~"); - - Corrupt(kTableFile, 100, 1); - Check(99, 99); -} - -TEST(CorruptionTest, TableFileIndexData) { - Build(10000); // Enough to build multiple Tables - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - dbi->TEST_CompactRange(1, "", "~"); - - Corrupt(kTableFile, -2000, 500); - Reopen(); - Check(5000, 9999); -} - -TEST(CorruptionTest, MissingDescriptor) { - Build(1000); - RepairDB(); - Reopen(); - Check(1000, 1000); -} - -TEST(CorruptionTest, SequenceNumberRecovery) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); - RepairDB(); - Reopen(); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v5", v); - // Write something. If sequence number was not recovered properly, - // it will be hidden by an earlier write. 
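For context, internal keys order entries for the same user key by descending sequence number, and a read returns the newest entry visible to it. A minimal sketch of the failure this test guards against, using hypothetical sequence numbers:

  // If RepairDB() were to reset the last sequence number to 0, the next
  // Put would be stamped below the recovered entries:
  //   ("foo", seq=5) -> "v5"   recovered from the table file
  //   ("foo", seq=1) -> "v6"   new write with a wrongly low sequence
  // Get("foo") returns the entry with the highest visible sequence, so it
  // would keep answering "v5" and the new write would appear to be lost.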
- ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v6", v); - Reopen(); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v6", v); -} - -TEST(CorruptionTest, LargeValueRecovery) { - Options options; - options.large_value_threshold = 10000; - Reopen(&options); - - Random rnd(301); - std::string big; - ASSERT_OK(db_->Put(WriteOptions(), - "foo", test::RandomString(&rnd, 100000, &big))); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ(big, v); - - RepairDB(); - Reopen(); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ(big, v); - - Reopen(); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ(big, v); -} - -TEST(CorruptionTest, CorruptedDescriptor) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - - Corrupt(kDescriptorFile, 0, 1000); - Status s = TryReopen(); - ASSERT_TRUE(!s.ok()); - - RepairDB(); - Reopen(); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("hello", v); -} - -TEST(CorruptionTest, CompactionInputError) { - Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); - - Corrupt(kTableFile, 100, 1); - Check(9, 9); - - // Force compactions by writing lots of values - Build(10000); - Check(10000, 10000); - dbi->TEST_CompactRange(0, "", "~"); - ASSERT_EQ(0, Property("leveldb.num-files-at-level0")); -} - -TEST(CorruptionTest, CompactionInputErrorParanoid) { - Options options; - options.paranoid_checks = true; - options.write_buffer_size = 1048576; - Reopen(&options); - - Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); - - Corrupt(kTableFile, 100, 1); - Check(9, 9); - - // Write must eventually fail because of corrupted table - Status s; - std::string tmp1, tmp2; - for (int i = 0; i < 10000 && s.ok(); i++) { - s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); - } - ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; -} - -TEST(CorruptionTest, UnrelatedKeys) { - Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - Corrupt(kTableFile, 100, 1); - - std::string tmp1, tmp2; - ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); - ASSERT_EQ(Value(1000, &tmp2).ToString(), v); - dbi->TEST_CompactMemTable(); - ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); - ASSERT_EQ(Value(1000, &tmp2).ToString(), v); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/db/db_bench.cc b/db/db_bench.cc deleted file mode 100644 index 849ebfa..0000000 --- a/db/db_bench.cc +++ /dev/null @@ -1,635 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
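The benchmark driver that follows is a standalone binary. Given the flag parsing in its main() further below, a typical invocation might look like this (illustrative only; the actual binary name depends on the build files):

  ./db_bench --num=1000000 --value_size=100 --histogram=1 --benchmarks=fillseq,readrandom,compact

Each comma-separated benchmark name selects one of the Write/Read/Compact routines documented next.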
-
-#include <sys/types.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "db/db_impl.h"
-#include "db/version_set.h"
-#include "leveldb/cache.h"
-#include "leveldb/db.h"
-#include "leveldb/env.h"
-#include "leveldb/write_batch.h"
-#include "port/port.h"
-#include "util/crc32c.h"
-#include "util/histogram.h"
-#include "util/random.h"
-#include "util/testutil.h"
-
-// Comma-separated list of operations to run in the specified order
-//   Actual benchmarks:
-//      fillseq     -- write N values in sequential key order in async mode
-//      fillrandom  -- write N values in random key order in async mode
-//      overwrite   -- overwrite N values in random key order in async mode
-//      fillsync    -- write N/100 values in random key order in sync mode
-//      fill100K    -- write N/1000 100K values in random order in async mode
-//      readseq     -- read N values sequentially
-//      readreverse -- read N values in reverse order
-//      readrandom  -- read N values in random order
-//      crc32c      -- repeated crc32c of 4K of data
-//      sha1        -- repeated SHA1 computation over 4K of data
-//   Meta operations:
-//      compact     -- Compact the entire DB
-//      stats       -- Print DB stats
-//      heapprofile -- Dump a heap profile (if supported by this port)
-static const char* FLAGS_benchmarks =
-    "fillseq,"
-    "fillsync,"
-    "fillrandom,"
-    "overwrite,"
-    "readrandom,"
-    "readrandom,"  // Extra run to allow previous compactions to quiesce
-    "readseq,"
-    "readreverse,"
-    "compact,"
-    "readrandom,"
-    "readseq,"
-    "readreverse,"
-    "fill100K,"
-    "crc32c,"
-    "sha1,"
-    "snappycomp,"
-    "snappyuncomp,"
-    ;
-
-// Number of key/values to place in database
-static int FLAGS_num = 1000000;
-
-// Size of each value
-static int FLAGS_value_size = 100;
-
-// Arrange to generate values that shrink to this fraction of
-// their original size after compression
-static double FLAGS_compression_ratio = 0.5;
-
-// Print histogram of operation timings
-static bool FLAGS_histogram = false;
-
-// Number of bytes to buffer in memtable before compacting
-// (initialized to default value by "main")
-static int FLAGS_write_buffer_size = 0;
-
-// Number of bytes to use as a cache of uncompressed data.
-// Negative means use default settings.
-static int FLAGS_cache_size = -1;
-
-namespace leveldb {
-
-// Helper for quickly generating random data.
-namespace {
-class RandomGenerator {
- private:
-  std::string data_;
-  int pos_;
-
- public:
-  RandomGenerator() {
-    // We use a limited amount of data over and over again and ensure
-    // that it is larger than the compression window (32KB), and also
-    // large enough to serve all typical value sizes we want to write.
-    Random rnd(301);
-    std::string piece;
-    while (data_.size() < 1048576) {
-      // Add a short fragment that is as compressible as specified
-      // by FLAGS_compression_ratio.
- test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); - data_.append(piece); - } - pos_ = 0; - } - - Slice Generate(int len) { - if (pos_ + len > data_.size()) { - pos_ = 0; - assert(len < data_.size()); - } - pos_ += len; - return Slice(data_.data() + pos_ - len, len); - } -}; - -static Slice TrimSpace(Slice s) { - int start = 0; - while (start < s.size() && isspace(s[start])) { - start++; - } - int limit = s.size(); - while (limit > start && isspace(s[limit-1])) { - limit--; - } - return Slice(s.data() + start, limit - start); -} - -} - -class Benchmark { - private: - Cache* cache_; - DB* db_; - int num_; - int heap_counter_; - double start_; - double last_op_finish_; - int64_t bytes_; - std::string message_; - std::string post_message_; - Histogram hist_; - RandomGenerator gen_; - Random rand_; - - // State kept for progress messages - int done_; - int next_report_; // When to report next - - void PrintHeader() { - const int kKeySize = 16; - PrintEnvironment(); - fprintf(stdout, "Keys: %d bytes each\n", kKeySize); - fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", - FLAGS_value_size, - static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); - fprintf(stdout, "Entries: %d\n", num_); - fprintf(stdout, "RawSize: %.1f MB (estimated)\n", - ((static_cast(kKeySize + FLAGS_value_size) * num_) - / 1048576.0)); - fprintf(stdout, "FileSize: %.1f MB (estimated)\n", - (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) - / 1048576.0)); - PrintWarnings(); - fprintf(stdout, "------------------------------------------------\n"); - } - - void PrintWarnings() { -#if defined(__GNUC__) && !defined(__OPTIMIZE__) - fprintf(stdout, - "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" - ); -#endif -#ifndef NDEBUG - fprintf(stdout, - "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); -#endif - - // See if snappy is working by attempting to compress a compressible string - const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; - std::string compressed; - if (!port::Snappy_Compress(text, sizeof(text), &compressed)) { - fprintf(stdout, "WARNING: Snappy compression is not enabled\n"); - } else if (compressed.size() >= sizeof(text)) { - fprintf(stdout, "WARNING: Snappy compression is not effective\n"); - } - } - - void PrintEnvironment() { - fprintf(stderr, "LevelDB: version %d.%d\n", - kMajorVersion, kMinorVersion); - -#if defined(__linux) - time_t now = time(NULL); - fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline - - FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); - if (cpuinfo != NULL) { - char line[1000]; - int num_cpus = 0; - std::string cpu_type; - std::string cache_size; - while (fgets(line, sizeof(line), cpuinfo) != NULL) { - const char* sep = strchr(line, ':'); - if (sep == NULL) { - continue; - } - Slice key = TrimSpace(Slice(line, sep - 1 - line)); - Slice val = TrimSpace(Slice(sep + 1)); - if (key == "model name") { - ++num_cpus; - cpu_type = val.ToString(); - } else if (key == "cache size") { - cache_size = val.ToString(); - } - } - fclose(cpuinfo); - fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); - fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); - } -#endif - } - - void Start() { - start_ = Env::Default()->NowMicros() * 1e-6; - bytes_ = 0; - message_.clear(); - last_op_finish_ = start_; - hist_.Clear(); - done_ = 0; - next_report_ = 100; - } - - void FinishedSingleOp() { - if (FLAGS_histogram) { - double now = Env::Default()->NowMicros() * 1e-6; - double 
micros = (now - last_op_finish_) * 1e6;
-      hist_.Add(micros);
-      if (micros > 20000) {
-        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
-        fflush(stderr);
-      }
-      last_op_finish_ = now;
-    }
-
-    done_++;
-    if (done_ >= next_report_) {
-      if      (next_report_ < 1000)   next_report_ += 100;
-      else if (next_report_ < 5000)   next_report_ += 500;
-      else if (next_report_ < 10000)  next_report_ += 1000;
-      else if (next_report_ < 50000)  next_report_ += 5000;
-      else if (next_report_ < 100000) next_report_ += 10000;
-      else if (next_report_ < 500000) next_report_ += 50000;
-      else                            next_report_ += 100000;
-      fprintf(stderr, "... finished %d ops%30s\r", done_, "");
-      fflush(stderr);
-    }
-  }
-
-  void Stop(const Slice& name) {
-    double finish = Env::Default()->NowMicros() * 1e-6;
-
-    // Pretend at least one op was done in case we are running a benchmark
-    // that does not call FinishedSingleOp().
-    if (done_ < 1) done_ = 1;
-
-    if (bytes_ > 0) {
-      char rate[100];
-      snprintf(rate, sizeof(rate), "%6.1f MB/s",
-               (bytes_ / 1048576.0) / (finish - start_));
-      if (!message_.empty()) {
-        message_ = std::string(rate) + " " + message_;
-      } else {
-        message_ = rate;
-      }
-    }
-
-    fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
-            name.ToString().c_str(),
-            (finish - start_) * 1e6 / done_,
-            (message_.empty() ? "" : " "),
-            message_.c_str());
-    if (FLAGS_histogram) {
-      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
-    }
-    fflush(stdout);
-
-    if (!post_message_.empty()) {
-      fprintf(stdout, "\n%s\n", post_message_.c_str());
-      post_message_.clear();
-    }
-  }
-
- public:
-  enum Order {
-    SEQUENTIAL,
-    RANDOM
-  };
-  enum DBState {
-    FRESH,
-    EXISTING
-  };
-
-  Benchmark()
-  : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
-    db_(NULL),
-    num_(FLAGS_num),
-    heap_counter_(0),
-    bytes_(0),
-    rand_(301) {
-    std::vector<std::string> files;
-    Env::Default()->GetChildren("/tmp/dbbench", &files);
-    for (int i = 0; i < files.size(); i++) {
-      if (Slice(files[i]).starts_with("heap-")) {
-        Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
-      }
-    }
-    DestroyDB("/tmp/dbbench", Options());
-  }
-
-  ~Benchmark() {
-    delete db_;
-    delete cache_;
-  }
-
-  void Run() {
-    PrintHeader();
-    Open();
-
-    const char* benchmarks = FLAGS_benchmarks;
-    while (benchmarks != NULL) {
-      const char* sep = strchr(benchmarks, ',');
-      Slice name;
-      if (sep == NULL) {
-        name = benchmarks;
-        benchmarks = NULL;
-      } else {
-        name = Slice(benchmarks, sep - benchmarks);
-        benchmarks = sep + 1;
-      }
-
-      Start();
-
-      WriteOptions write_options;
-      bool known = true;
-      if (name == Slice("fillseq")) {
-        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
-      } else if (name == Slice("fillbatch")) {
-        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000);
-      } else if (name == Slice("fillrandom")) {
-        Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size, 1);
-      } else if (name == Slice("overwrite")) {
-        Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1);
-      } else if (name == Slice("fillsync")) {
-        write_options.sync = true;
-        Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1);
-      } else if (name == Slice("fill100K")) {
-        Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1);
-      } else if (name == Slice("readseq")) {
-        ReadSequential();
-      } else if (name == Slice("readreverse")) {
-        ReadReverse();
-      } else if (name == Slice("readrandom")) {
-        ReadRandom();
-      } else if (name == Slice("readrandomsmall")) {
-        int n = num_;
-        num_ /= 1000;
-        ReadRandom();
-        num_ = n;
-      }
else if (name == Slice("compact")) { - Compact(); - } else if (name == Slice("crc32c")) { - Crc32c(4096, "(4K per op)"); - } else if (name == Slice("sha1")) { - SHA1(4096, "(4K per op)"); - } else if (name == Slice("snappycomp")) { - SnappyCompress(); - } else if (name == Slice("snappyuncomp")) { - SnappyUncompress(); - } else if (name == Slice("heapprofile")) { - HeapProfile(); - } else if (name == Slice("stats")) { - PrintStats(); - } else { - known = false; - if (name != Slice()) { // No error message for empty name - fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); - } - } - if (known) { - Stop(name); - } - } - } - - private: - void Crc32c(int size, const char* label) { - // Checksum about 500MB of data total - std::string data(size, 'x'); - int64_t bytes = 0; - uint32_t crc = 0; - while (bytes < 500 * 1048576) { - crc = crc32c::Value(data.data(), size); - FinishedSingleOp(); - bytes += size; - } - // Print so result is not dead - fprintf(stderr, "... crc=0x%x\r", static_cast(crc)); - - bytes_ = bytes; - message_ = label; - } - - void SHA1(int size, const char* label) { - // SHA1 about 100MB of data total - std::string data(size, 'x'); - int64_t bytes = 0; - char sha1[20]; - while (bytes < 100 * 1048576) { - port::SHA1_Hash(data.data(), size, sha1); - FinishedSingleOp(); - bytes += size; - } - - // Print so result is not dead - fprintf(stderr, "... sha1=%02x...\r", static_cast(sha1[0])); - - bytes_ = bytes; - message_ = label; - } - - void SnappyCompress() { - Slice input = gen_.Generate(Options().block_size); - int64_t bytes = 0; - int64_t produced = 0; - bool ok = true; - std::string compressed; - while (ok && bytes < 1024 * 1048576) { // Compress 1G - ok = port::Snappy_Compress(input.data(), input.size(), &compressed); - produced += compressed.size(); - bytes += input.size(); - FinishedSingleOp(); - } - - if (!ok) { - message_ = "(snappy failure)"; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "(output: %.1f%%)", - (produced * 100.0) / bytes); - message_ = buf; - bytes_ = bytes; - } - } - - void SnappyUncompress() { - Slice input = gen_.Generate(Options().block_size); - std::string compressed; - bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed); - int64_t bytes = 0; - std::string uncompressed; - while (ok && bytes < 1024 * 1048576) { // Compress 1G - ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), - &uncompressed); - bytes += uncompressed.size(); - FinishedSingleOp(); - } - - if (!ok) { - message_ = "(snappy failure)"; - } else { - bytes_ = bytes; - } - } - - void Open() { - assert(db_ == NULL); - Options options; - options.create_if_missing = true; - options.block_cache = cache_; - options.write_buffer_size = FLAGS_write_buffer_size; - Status s = DB::Open(options, "/tmp/dbbench", &db_); - if (!s.ok()) { - fprintf(stderr, "open error: %s\n", s.ToString().c_str()); - exit(1); - } - } - - void Write(const WriteOptions& options, Order order, DBState state, - int num_entries, int value_size, int entries_per_batch) { - if (state == FRESH) { - delete db_; - db_ = NULL; - DestroyDB("/tmp/dbbench", Options()); - Open(); - Start(); // Do not count time taken to destroy/open - } - - if (num_entries != num_) { - char msg[100]; - snprintf(msg, sizeof(msg), "(%d ops)", num_entries); - message_ = msg; - } - - WriteBatch batch; - Status s; - std::string val; - for (int i = 0; i < num_entries; i += entries_per_batch) { - batch.Clear(); - for (int j = 0; j < entries_per_batch; j++) { - const int k = (order == SEQUENTIAL) ? 
i+j : (rand_.Next() % FLAGS_num); - char key[100]; - snprintf(key, sizeof(key), "%016d", k); - batch.Put(key, gen_.Generate(value_size)); - bytes_ += value_size + strlen(key); - FinishedSingleOp(); - } - s = db_->Write(options, &batch); - if (!s.ok()) { - fprintf(stderr, "put error: %s\n", s.ToString().c_str()); - exit(1); - } - } - } - - void ReadSequential() { - Iterator* iter = db_->NewIterator(ReadOptions()); - int i = 0; - for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - } - - void ReadReverse() { - Iterator* iter = db_->NewIterator(ReadOptions()); - int i = 0; - for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - } - - void ReadRandom() { - ReadOptions options; - std::string value; - for (int i = 0; i < num_; i++) { - char key[100]; - const int k = rand_.Next() % FLAGS_num; - snprintf(key, sizeof(key), "%016d", k); - db_->Get(options, key, &value); - FinishedSingleOp(); - } - } - - void Compact() { - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - int max_level_with_files = 1; - for (int level = 1; level < config::kNumLevels; level++) { - std::string property; - char name[100]; - snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); - if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) { - max_level_with_files = level; - } - } - for (int level = 0; level < max_level_with_files; level++) { - dbi->TEST_CompactRange(level, "", "~"); - } - } - - void PrintStats() { - std::string stats; - if (!db_->GetProperty("leveldb.stats", &stats)) { - message_ = "(failed)"; - } else { - post_message_ = stats; - } - } - - static void WriteToFile(void* arg, const char* buf, int n) { - reinterpret_cast(arg)->Append(Slice(buf, n)); - } - - void HeapProfile() { - char fname[100]; - snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_); - WritableFile* file; - Status s = Env::Default()->NewWritableFile(fname, &file); - if (!s.ok()) { - message_ = s.ToString(); - return; - } - bool ok = port::GetHeapProfile(WriteToFile, file); - delete file; - if (!ok) { - message_ = "not supported"; - Env::Default()->DeleteFile(fname); - } - } -}; - -} - -int main(int argc, char** argv) { - FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; - for (int i = 1; i < argc; i++) { - double d; - int n; - char junk; - if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { - FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); - } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { - FLAGS_compression_ratio = d; - } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && - (n == 0 || n == 1)) { - FLAGS_histogram = n; - } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { - FLAGS_num = n; - } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { - FLAGS_value_size = n; - } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { - FLAGS_write_buffer_size = n; - } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { - FLAGS_cache_size = n; - } else { - fprintf(stderr, "Invalid flag '%s'\n", argv[i]); - exit(1); - } - } - - leveldb::Benchmark benchmark; - benchmark.Run(); - return 0; -} diff --git a/db/db_impl.cc b/db/db_impl.cc deleted file mode 100644 index d012236..0000000 --- a/db/db_impl.cc +++ /dev/null @@ -1,1345 +0,0 @@ 
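db_impl.cc below is the heart of the patch: recovery, the write path (including large-value extraction), and background compaction. Nearly every method relies on a single mutex_ guarding shared state, usually through the MutexLock RAII helper from util/mutexlock.h; a condensed sketch of that idiom as it appears throughout:

  {
    MutexLock l(&mutex_);  // constructor acquires mutex_
    // ... read or modify shared state: versions_, mem_, imm_, snapshots_ ...
  }                        // destructor releases mutex_, even on early return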
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "db/db_impl.h"
-
-#include <algorithm>
-#include <set>
-#include <string>
-#include <stdint.h>
-#include <stdio.h>
-#include <vector>
-#include "db/builder.h"
-#include "db/db_iter.h"
-#include "db/dbformat.h"
-#include "db/filename.h"
-#include "db/log_reader.h"
-#include "db/log_writer.h"
-#include "db/memtable.h"
-#include "db/table_cache.h"
-#include "db/version_set.h"
-#include "db/write_batch_internal.h"
-#include "leveldb/db.h"
-#include "leveldb/env.h"
-#include "leveldb/status.h"
-#include "leveldb/table.h"
-#include "leveldb/table_builder.h"
-#include "port/port.h"
-#include "table/block.h"
-#include "table/merger.h"
-#include "table/two_level_iterator.h"
-#include "util/coding.h"
-#include "util/logging.h"
-#include "util/mutexlock.h"
-
-namespace leveldb {
-
-struct DBImpl::CompactionState {
-  Compaction* const compaction;
-
-  // Sequence numbers < smallest_snapshot are not significant since we
-  // will never have to service a snapshot below smallest_snapshot.
-  // Therefore if we have seen a sequence number S <= smallest_snapshot,
-  // we can drop all entries for the same key with sequence numbers < S.
-  SequenceNumber smallest_snapshot;
-
-  // Files produced by compaction
-  struct Output {
-    uint64_t number;
-    uint64_t file_size;
-    InternalKey smallest, largest;
-  };
-  std::vector<Output> outputs;
-
-  // State kept for output being generated
-  WritableFile* outfile;
-  TableBuilder* builder;
-
-  uint64_t total_bytes;
-
-  Output* current_output() { return &outputs[outputs.size()-1]; }
-
-  explicit CompactionState(Compaction* c)
-      : compaction(c),
-        outfile(NULL),
-        builder(NULL),
-        total_bytes(0) {
-  }
-};
-
-namespace {
-class NullWritableFile : public WritableFile {
- public:
-  virtual Status Append(const Slice& data) { return Status::OK(); }
-  virtual Status Close() { return Status::OK(); }
-  virtual Status Flush() { return Status::OK(); }
-  virtual Status Sync() { return Status::OK(); }
-};
-}
-
-// Fix user-supplied options to be reasonable
-template <class T, class V>
-static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
-  if (*ptr > maxvalue) *ptr = maxvalue;
-  if (*ptr < minvalue) *ptr = minvalue;
-}
-Options SanitizeOptions(const std::string& dbname,
-                        const InternalKeyComparator* icmp,
-                        const Options& src) {
-  Options result = src;
-  result.comparator = icmp;
-  ClipToRange(&result.max_open_files, 20, 50000);
-  ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
-  ClipToRange(&result.large_value_threshold, 16<<10, 1<<30);
-  ClipToRange(&result.block_size, 1<<10, 4<<20);
-  if (result.info_log == NULL) {
-    // Open a log file in the same directory as the db
-    src.env->CreateDir(dbname);  // In case it does not exist
-    src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
-    Status s = src.env->NewWritableFile(InfoLogFileName(dbname),
-                                        &result.info_log);
-    if (!s.ok()) {
-      // No place suitable for logging
-      result.info_log = new NullWritableFile;
-    }
-  }
-  if (result.block_cache == NULL) {
-    result.block_cache = NewLRUCache(8 << 20);
-  }
-  return result;
-}
-
-DBImpl::DBImpl(const Options& options, const std::string& dbname)
-    : env_(options.env),
-      internal_comparator_(options.comparator),
-      options_(SanitizeOptions(dbname, &internal_comparator_, options)),
-      owns_info_log_(options_.info_log != options.info_log),
-      owns_cache_(options_.block_cache != options.block_cache),
-      dbname_(dbname),
-
db_lock_(NULL), - shutting_down_(NULL), - bg_cv_(&mutex_), - compacting_cv_(&mutex_), - mem_(new MemTable(internal_comparator_)), - imm_(NULL), - logfile_(NULL), - log_(NULL), - bg_compaction_scheduled_(false), - compacting_(false) { - has_imm_.Release_Store(NULL); - - // Reserve ten files or so for other uses and give the rest to TableCache. - const int table_cache_size = options.max_open_files - 10; - table_cache_ = new TableCache(dbname_, &options_, table_cache_size); - - versions_ = new VersionSet(dbname_, &options_, table_cache_, - &internal_comparator_); -} - -DBImpl::~DBImpl() { - // Wait for background work to finish - mutex_.Lock(); - shutting_down_.Release_Store(this); // Any non-NULL value is ok - if (bg_compaction_scheduled_) { - while (bg_compaction_scheduled_) { - bg_cv_.Wait(); - } - } - mutex_.Unlock(); - - if (db_lock_ != NULL) { - env_->UnlockFile(db_lock_); - } - - delete versions_; - delete mem_; - delete imm_; - delete log_; - delete logfile_; - delete table_cache_; - - if (owns_info_log_) { - delete options_.info_log; - } - if (owns_cache_) { - delete options_.block_cache; - } -} - -Status DBImpl::NewDB() { - VersionEdit new_db; - new_db.SetComparatorName(user_comparator()->Name()); - new_db.SetLogNumber(0); - new_db.SetNextFile(2); - new_db.SetLastSequence(0); - - const std::string manifest = DescriptorFileName(dbname_, 1); - WritableFile* file; - Status s = env_->NewWritableFile(manifest, &file); - if (!s.ok()) { - return s; - } - { - log::Writer log(file); - std::string record; - new_db.EncodeTo(&record); - s = log.AddRecord(record); - if (s.ok()) { - s = file->Close(); - } - } - delete file; - if (s.ok()) { - // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(env_, dbname_, 1); - } else { - env_->DeleteFile(manifest); - } - return s; -} - -void DBImpl::MaybeIgnoreError(Status* s) const { - if (s->ok() || options_.paranoid_checks) { - // No change needed - } else { - Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str()); - *s = Status::OK(); - } -} - -void DBImpl::DeleteObsoleteFiles() { - // Make a set of all of the live files - std::set live = pending_outputs_; - versions_->AddLiveFiles(&live); - - versions_->CleanupLargeValueRefs(live); - - std::vector filenames; - env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose - uint64_t number; - LargeValueRef large_ref; - FileType type; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &large_ref, &type)) { - bool keep = true; - switch (type) { - case kLogFile: - keep = ((number == versions_->LogNumber()) || - (number == versions_->PrevLogNumber())); - break; - case kDescriptorFile: - // Keep my manifest file, and any newer incarnations' - // (in case there is a race that allows other incarnations) - keep = (number >= versions_->ManifestFileNumber()); - break; - case kTableFile: - keep = (live.find(number) != live.end()); - break; - case kTempFile: - // Any temp files that are currently being written to must - // be recorded in pending_outputs_, which is inserted into "live" - keep = (live.find(number) != live.end()); - break; - case kLargeValueFile: - keep = versions_->LargeValueIsLive(large_ref); - break; - case kCurrentFile: - case kDBLockFile: - case kInfoLogFile: - keep = true; - break; - } - - if (!keep) { - if (type == kTableFile) { - table_cache_->Evict(number); - } - Log(env_, options_.info_log, "Delete type=%d #%lld\n", - int(type), - static_cast(number)); - env_->DeleteFile(dbname_ + "/" + filenames[i]); 
-      }
-    }
-  }
-}
-
-Status DBImpl::Recover(VersionEdit* edit) {
-  mutex_.AssertHeld();
-
-  // Ignore error from CreateDir since the creation of the DB is
-  // committed only when the descriptor is created, and this directory
-  // may already exist from a previous failed creation attempt.
-  env_->CreateDir(dbname_);
-  assert(db_lock_ == NULL);
-  Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
-  if (!s.ok()) {
-    return s;
-  }
-
-  if (!env_->FileExists(CurrentFileName(dbname_))) {
-    if (options_.create_if_missing) {
-      s = NewDB();
-      if (!s.ok()) {
-        return s;
-      }
-    } else {
-      return Status::InvalidArgument(
-          dbname_, "does not exist (create_if_missing is false)");
-    }
-  } else {
-    if (options_.error_if_exists) {
-      return Status::InvalidArgument(
-          dbname_, "exists (error_if_exists is true)");
-    }
-  }
-
-  s = versions_->Recover();
-  if (s.ok()) {
-    // Recover from the log files named in the descriptor
-    SequenceNumber max_sequence(0);
-    if (versions_->PrevLogNumber() != 0) {  // log#==0 means no prev log
-      s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence);
-    }
-    if (s.ok() && versions_->LogNumber() != 0) {  // log#==0 for initial state
-      s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence);
-    }
-    if (s.ok()) {
-      if (versions_->LastSequence() < max_sequence) {
-        versions_->SetLastSequence(max_sequence);
-      }
-    }
-  }
-
-  return s;
-}
-
-Status DBImpl::RecoverLogFile(uint64_t log_number,
-                              VersionEdit* edit,
-                              SequenceNumber* max_sequence) {
-  struct LogReporter : public log::Reader::Reporter {
-    Env* env;
-    WritableFile* info_log;
-    const char* fname;
-    Status* status;  // NULL if options_.paranoid_checks==false
-    virtual void Corruption(size_t bytes, const Status& s) {
-      Log(env, info_log, "%s%s: dropping %d bytes; %s",
-          (this->status == NULL ? "(ignoring error) " : ""),
-          fname, static_cast<int>(bytes), s.ToString().c_str());
-      if (this->status != NULL && this->status->ok()) *this->status = s;
-    }
-  };
-
-  mutex_.AssertHeld();
-
-  // Open the log file
-  std::string fname = LogFileName(dbname_, log_number);
-  SequentialFile* file;
-  Status status = env_->NewSequentialFile(fname, &file);
-  if (!status.ok()) {
-    MaybeIgnoreError(&status);
-    return status;
-  }
-
-  // Create the log reader.
-  LogReporter reporter;
-  reporter.env = env_;
-  reporter.info_log = options_.info_log;
-  reporter.fname = fname.c_str();
-  reporter.status = (options_.paranoid_checks ? &status : NULL);
-  // We intentionally make log::Reader do checksumming even if
-  // paranoid_checks==false so that corruptions cause entire commits
-  // to be skipped instead of propagating bad information (like overly
-  // large sequence numbers).
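The 12-byte minimum enforced just below corresponds to the WriteBatch header; a sketch of the wire layout as defined by db/write_batch.cc in this patch:

  // WriteBatch contents (one log record per batch):
  //   sequence: fixed64    first sequence number in the batch
  //   count:    fixed32    number of tagged records that follow
  //   data:     count records, each tagged kTypeValue/kTypeDeletion/...
  // A record shorter than 8 + 4 = 12 bytes cannot hold even an empty
  // batch, so it is reported as corruption rather than replayed.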
- log::Reader reader(file, &reporter, true/*checksum*/); - Log(env_, options_.info_log, "Recovering log #%llu", - (unsigned long long) log_number); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - MemTable* mem = NULL; - while (reader.ReadRecord(&record, &scratch) && - status.ok()) { - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - continue; - } - WriteBatchInternal::SetContents(&batch, record); - - if (mem == NULL) { - mem = new MemTable(internal_comparator_); - } - status = WriteBatchInternal::InsertInto(&batch, mem); - MaybeIgnoreError(&status); - if (!status.ok()) { - break; - } - const SequenceNumber last_seq = - WriteBatchInternal::Sequence(&batch) + - WriteBatchInternal::Count(&batch) - 1; - if (last_seq > *max_sequence) { - *max_sequence = last_seq; - } - - if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { - status = WriteLevel0Table(mem, edit); - if (!status.ok()) { - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. - break; - } - delete mem; - mem = NULL; - } - } - - if (status.ok() && mem != NULL) { - status = WriteLevel0Table(mem, edit); - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. - } - - delete mem; - delete file; - return status; -} - -Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { - mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); - FileMetaData meta; - meta.number = versions_->NewFileNumber(); - pending_outputs_.insert(meta.number); - Iterator* iter = mem->NewIterator(); - Log(env_, options_.info_log, "Level-0 table #%llu: started", - (unsigned long long) meta.number); - - Status s; - { - mutex_.Unlock(); - s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit); - mutex_.Lock(); - } - - Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s", - (unsigned long long) meta.number, - (unsigned long long) meta.file_size, - s.ToString().c_str()); - delete iter; - pending_outputs_.erase(meta.number); - - CompactionStats stats; - stats.micros = env_->NowMicros() - start_micros; - stats.bytes_written = meta.file_size; - stats_[0].Add(stats); - return s; -} - -Status DBImpl::CompactMemTable() { - mutex_.AssertHeld(); - assert(imm_ != NULL); - assert(compacting_); - - // Save the contents of the memtable as a new Table - VersionEdit edit; - Status s = WriteLevel0Table(imm_, &edit); - - // Replace immutable memtable with the generated Table - if (s.ok()) { - edit.SetPrevLogNumber(0); - s = versions_->LogAndApply(&edit, imm_); - } - - if (s.ok()) { - // Commit to the new state - imm_ = NULL; - has_imm_.Release_Store(NULL); - DeleteObsoleteFiles(); - } - - compacting_cv_.SignalAll(); // Wake up waiter even if there was an error - return s; -} - -void DBImpl::TEST_CompactRange( - int level, - const std::string& begin, - const std::string& end) { - MutexLock l(&mutex_); - while (compacting_) { - compacting_cv_.Wait(); - } - Compaction* c = versions_->CompactRange( - level, - InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek), - InternalKey(end, 0, static_cast(0))); - - if (c != NULL) { - CompactionState* compact = new CompactionState(c); - DoCompactionWork(compact); // Ignore error in test compaction - CleanupCompaction(compact); - } - - // Start any background compaction that may have been delayed by this thread - MaybeScheduleCompaction(); -} - 
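The tests seen earlier drive this hook with begin="" and end="~"; since '~' (0x7E) sorts after the 16-digit numeric keys those tests generate, the range covers their entire key space. A typical test-only driver, mirroring corruption_test.cc:

  DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
  dbi->TEST_CompactMemTable();         // flush the memtable to level 0
  dbi->TEST_CompactRange(0, "", "~");  // push level-0 files into level 1
  dbi->TEST_CompactRange(1, "", "~");  // and level-1 files into level 2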
-Status DBImpl::TEST_CompactMemTable() { - MutexLock l(&mutex_); - Status s = MakeRoomForWrite(true /* force compaction */); - if (s.ok()) { - // Wait until the compaction completes - while (imm_ != NULL && bg_error_.ok()) { - compacting_cv_.Wait(); - } - if (imm_ != NULL) { - s = bg_error_; - } - } - return s; -} - -void DBImpl::MaybeScheduleCompaction() { - mutex_.AssertHeld(); - if (bg_compaction_scheduled_) { - // Already scheduled - } else if (compacting_) { - // Some other thread is running a compaction. Do not conflict with it. - } else if (shutting_down_.Acquire_Load()) { - // DB is being deleted; no more background compactions - } else if (imm_ == NULL && !versions_->NeedsCompaction()) { - // No work to be done - } else { - bg_compaction_scheduled_ = true; - env_->Schedule(&DBImpl::BGWork, this); - } -} - -void DBImpl::BGWork(void* db) { - reinterpret_cast(db)->BackgroundCall(); -} - -void DBImpl::BackgroundCall() { - MutexLock l(&mutex_); - assert(bg_compaction_scheduled_); - if (!shutting_down_.Acquire_Load() && - !compacting_) { - BackgroundCompaction(); - } - bg_compaction_scheduled_ = false; - bg_cv_.SignalAll(); - - // Previous compaction may have produced too many files in a level, - // so reschedule another compaction if needed. - MaybeScheduleCompaction(); -} - -void DBImpl::BackgroundCompaction() { - mutex_.AssertHeld(); - assert(!compacting_); - - if (imm_ != NULL) { - compacting_ = true; - CompactMemTable(); - compacting_ = false; - compacting_cv_.SignalAll(); - return; - } - - Compaction* c = versions_->PickCompaction(); - if (c == NULL) { - // Nothing to do - return; - } - - Status status; - if (c->IsTrivialMove()) { - // Move file to next level - assert(c->num_input_files(0) == 1); - FileMetaData* f = c->input(0, 0); - c->edit()->DeleteFile(c->level(), f->number); - c->edit()->AddFile(c->level() + 1, f->number, f->file_size, - f->smallest, f->largest); - status = versions_->LogAndApply(c->edit(), NULL); - Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n", - static_cast(f->number), - c->level() + 1, - static_cast(f->file_size), - status.ToString().c_str()); - } else { - CompactionState* compact = new CompactionState(c); - status = DoCompactionWork(compact); - CleanupCompaction(compact); - } - delete c; - - if (status.ok()) { - // Done - } else if (shutting_down_.Acquire_Load()) { - // Ignore compaction errors found during shutting down - } else { - Log(env_, options_.info_log, - "Compaction error: %s", status.ToString().c_str()); - if (options_.paranoid_checks && bg_error_.ok()) { - bg_error_ = status; - } - } -} - -void DBImpl::CleanupCompaction(CompactionState* compact) { - mutex_.AssertHeld(); - if (compact->builder != NULL) { - // May happen if we get a shutdown call in the middle of compaction - compact->builder->Abandon(); - delete compact->builder; - } else { - assert(compact->outfile == NULL); - } - delete compact->outfile; - for (int i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - pending_outputs_.erase(out.number); - } - delete compact; -} - -Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { - assert(compact != NULL); - assert(compact->builder == NULL); - uint64_t file_number; - { - mutex_.Lock(); - file_number = versions_->NewFileNumber(); - pending_outputs_.insert(file_number); - CompactionState::Output out; - out.number = file_number; - out.smallest.Clear(); - out.largest.Clear(); - compact->outputs.push_back(out); - mutex_.Unlock(); - } - - // Make the output 
file - std::string fname = TableFileName(dbname_, file_number); - Status s = env_->NewWritableFile(fname, &compact->outfile); - if (s.ok()) { - compact->builder = new TableBuilder(options_, compact->outfile); - } - return s; -} - -Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, - Iterator* input) { - assert(compact != NULL); - assert(compact->outfile != NULL); - assert(compact->builder != NULL); - - const uint64_t output_number = compact->current_output()->number; - assert(output_number != 0); - - // Check for iterator errors - Status s = input->status(); - const uint64_t current_entries = compact->builder->NumEntries(); - if (s.ok()) { - s = compact->builder->Finish(); - } else { - compact->builder->Abandon(); - } - const uint64_t current_bytes = compact->builder->FileSize(); - compact->current_output()->file_size = current_bytes; - compact->total_bytes += current_bytes; - delete compact->builder; - compact->builder = NULL; - - // Finish and check for file errors - if (s.ok()) { - s = compact->outfile->Sync(); - } - if (s.ok()) { - s = compact->outfile->Close(); - } - delete compact->outfile; - compact->outfile = NULL; - - if (s.ok() && current_entries > 0) { - // Verify that the table is usable - Iterator* iter = table_cache_->NewIterator(ReadOptions(), - output_number, - current_bytes); - s = iter->status(); - delete iter; - if (s.ok()) { - Log(env_, options_.info_log, - "Generated table #%llu: %lld keys, %lld bytes", - (unsigned long long) output_number, - (unsigned long long) current_entries, - (unsigned long long) current_bytes); - } - } - return s; -} - - -Status DBImpl::InstallCompactionResults(CompactionState* compact) { - mutex_.AssertHeld(); - Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->level() + 1, - static_cast(compact->total_bytes)); - - // Add compaction outputs - compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); - for (int i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - compact->compaction->edit()->AddFile( - level + 1, - out.number, out.file_size, out.smallest, out.largest); - pending_outputs_.erase(out.number); - } - compact->outputs.clear(); - - Status s = versions_->LogAndApply(compact->compaction->edit(), NULL); - if (s.ok()) { - compact->compaction->ReleaseInputs(); - DeleteObsoleteFiles(); - } else { - // Discard any files we may have created during this failed compaction - for (int i = 0; i < compact->outputs.size(); i++) { - env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number)); - } - } - return s; -} - -Status DBImpl::DoCompactionWork(CompactionState* compact) { - const uint64_t start_micros = env_->NowMicros(); - int64_t imm_micros = 0; // Micros spent doing imm_ compactions - - Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->level() + 1); - - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); - assert(compact->builder == NULL); - assert(compact->outfile == NULL); - if (snapshots_.empty()) { - compact->smallest_snapshot = versions_->LastSequence(); - } else { - compact->smallest_snapshot = snapshots_.oldest()->number_; - } - - // Release mutex while we're actually 
doing the compaction work
-  compacting_ = true;
-  mutex_.Unlock();
-
-  Iterator* input = versions_->MakeInputIterator(compact->compaction);
-  input->SeekToFirst();
-  Status status;
-  ParsedInternalKey ikey;
-  std::string current_user_key;
-  bool has_current_user_key = false;
-  SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
-  for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
-    // Prioritize immutable compaction work
-    if (has_imm_.NoBarrier_Load() != NULL) {
-      const uint64_t imm_start = env_->NowMicros();
-      mutex_.Lock();
-      if (imm_ != NULL) {
-        CompactMemTable();
-        compacting_cv_.SignalAll();  // Wakeup MakeRoomForWrite() if necessary
-      }
-      mutex_.Unlock();
-      imm_micros += (env_->NowMicros() - imm_start);
-    }
-
-    Slice key = input->key();
-    InternalKey tmp_internal_key;
-    tmp_internal_key.DecodeFrom(key);
-    if (compact->compaction->ShouldStopBefore(tmp_internal_key) &&
-        compact->builder != NULL) {
-      status = FinishCompactionOutputFile(compact, input);
-      if (!status.ok()) {
-        break;
-      }
-    }
-
-    // Handle key/value, add to state, etc.
-    bool drop = false;
-    if (!ParseInternalKey(key, &ikey)) {
-      // Do not hide error keys
-      current_user_key.clear();
-      has_current_user_key = false;
-      last_sequence_for_key = kMaxSequenceNumber;
-    } else {
-      if (!has_current_user_key ||
-          user_comparator()->Compare(ikey.user_key,
-                                     Slice(current_user_key)) != 0) {
-        // First occurrence of this user key
-        current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
-        has_current_user_key = true;
-        last_sequence_for_key = kMaxSequenceNumber;
-      }
-
-      if (last_sequence_for_key <= compact->smallest_snapshot) {
-        // Hidden by a newer entry for same user key
-        drop = true;    // (A)
-      } else if (ikey.type == kTypeDeletion &&
-                 ikey.sequence <= compact->smallest_snapshot &&
-                 compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
-        // For this user key:
-        // (1) there is no data in higher levels
-        // (2) data in lower levels will have larger sequence numbers
-        // (3) data in layers that are being compacted here and have
-        //     smaller sequence numbers will be dropped in the next
-        //     few iterations of this loop (by rule (A) above).
-        // Therefore this deletion marker is obsolete and can be dropped.
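A concrete trace of rule (A) and the deletion rule above, assuming smallest_snapshot == 90 and input entries for one user key arriving newest first:

  //   ("k", seq=100, kTypeValue)     kept: 100 > smallest_snapshot
  //   ("k", seq=80,  kTypeDeletion)  kept unless IsBaseLevelForKey("k"),
  //                                  in which case the marker is obsolete
  //   ("k", seq=70,  kTypeValue)     dropped by (A): the previous entry's
  //                                  sequence (80) is <= smallest_snapshot,
  //                                  so no snapshot can still observe seq 70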
- drop = true; - } - - last_sequence_for_key = ikey.sequence; - } -#if 0 - Log(env_, options_.info_log, - " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " - "%d smallest_snapshot: %d", - ikey.user_key.ToString().c_str(), - (int)ikey.sequence, ikey.type, kTypeLargeValueRef, drop, - compact->compaction->IsBaseLevelForKey(ikey.user_key), - (int)last_sequence_for_key, (int)compact->smallest_snapshot); -#endif - - if (!drop) { - // Open output file if necessary - if (compact->builder == NULL) { - status = OpenCompactionOutputFile(compact); - if (!status.ok()) { - break; - } - } - if (compact->builder->NumEntries() == 0) { - compact->current_output()->smallest.DecodeFrom(key); - } - compact->current_output()->largest.DecodeFrom(key); - - if (ikey.type == kTypeLargeValueRef) { - if (input->value().size() != LargeValueRef::ByteSize()) { - if (options_.paranoid_checks) { - status = Status::Corruption("invalid large value ref"); - break; - } else { - Log(env_, options_.info_log, - "compaction found invalid large value ref"); - } - } else { - compact->compaction->edit()->AddLargeValueRef( - LargeValueRef::FromRef(input->value()), - compact->current_output()->number, - input->key()); - compact->builder->Add(key, input->value()); - } - } else { - compact->builder->Add(key, input->value()); - } - - // Close output file if it is big enough - if (compact->builder->FileSize() >= - compact->compaction->MaxOutputFileSize()) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - } - - input->Next(); - } - - if (status.ok() && shutting_down_.Acquire_Load()) { - status = Status::IOError("Deleting DB during compaction"); - } - if (status.ok() && compact->builder != NULL) { - status = FinishCompactionOutputFile(compact, input); - } - if (status.ok()) { - status = input->status(); - } - delete input; - input = NULL; - - CompactionStats stats; - stats.micros = env_->NowMicros() - start_micros - imm_micros; - for (int which = 0; which < 2; which++) { - for (int i = 0; i < compact->compaction->num_input_files(which); i++) { - stats.bytes_read += compact->compaction->input(which, i)->file_size; - } - } - for (int i = 0; i < compact->outputs.size(); i++) { - stats.bytes_written += compact->outputs[i].file_size; - } - - mutex_.Lock(); - stats_[compact->compaction->level() + 1].Add(stats); - - if (status.ok()) { - status = InstallCompactionResults(compact); - } - compacting_ = false; - compacting_cv_.SignalAll(); - return status; -} - -Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, - SequenceNumber* latest_snapshot) { - mutex_.Lock(); - *latest_snapshot = versions_->LastSequence(); - - // Collect together all needed child iterators - std::vector list; - list.push_back(mem_->NewIterator()); - if (imm_ != NULL) { - list.push_back(imm_->NewIterator()); - } - versions_->current()->AddIterators(options, &list); - Iterator* internal_iter = - NewMergingIterator(&internal_comparator_, &list[0], list.size()); - versions_->current()->Ref(); - internal_iter->RegisterCleanup(&DBImpl::Unref, this, versions_->current()); - - mutex_.Unlock(); - return internal_iter; -} - -Iterator* DBImpl::TEST_NewInternalIterator() { - SequenceNumber ignored; - return NewInternalIterator(ReadOptions(), &ignored); -} - -int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { - MutexLock l(&mutex_); - return versions_->MaxNextLevelOverlappingBytes(); -} - -Status DBImpl::Get(const ReadOptions& options, - const Slice& key, - std::string* value) { - // TODO(opt): faster implementation 
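As the TODO concedes, this first cut of Get() pays for a merging iterator over the memtable, the immutable memtable, and every overlapping table on each lookup. A more direct path would probe those structures newest-to-oldest and stop at the first hit; a hypothetical sketch (the per-structure Get() helpers assumed here do not exist in this patch):

  // if (mem_->Get(key, value, &s)) return s;                  // memtable
  // if (imm_ != NULL && imm_->Get(key, value, &s)) return s;  // imm. memtable
  // return current_version->Get(options, key, value);         // table files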
- Iterator* iter = NewIterator(options); - iter->Seek(key); - bool found = false; - if (iter->Valid() && user_comparator()->Compare(key, iter->key()) == 0) { - Slice v = iter->value(); - value->assign(v.data(), v.size()); - found = true; - } - // Non-OK iterator status trumps everything else - Status result = iter->status(); - if (result.ok() && !found) { - result = Status::NotFound(Slice()); // Use an empty error message for speed - } - delete iter; - return result; -} - -Iterator* DBImpl::NewIterator(const ReadOptions& options) { - SequenceNumber latest_snapshot; - Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); - SequenceNumber sequence = - (options.snapshot ? options.snapshot->number_ : latest_snapshot); - return NewDBIterator(&dbname_, env_, - user_comparator(), internal_iter, sequence); -} - -void DBImpl::Unref(void* arg1, void* arg2) { - DBImpl* impl = reinterpret_cast(arg1); - Version* v = reinterpret_cast(arg2); - MutexLock l(&impl->mutex_); - v->Unref(); -} - -const Snapshot* DBImpl::GetSnapshot() { - MutexLock l(&mutex_); - return snapshots_.New(versions_->LastSequence()); -} - -void DBImpl::ReleaseSnapshot(const Snapshot* s) { - MutexLock l(&mutex_); - snapshots_.Delete(s); -} - -// Convenience methods -Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) { - return DB::Put(o, key, val); -} - -Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { - return DB::Delete(options, key); -} - -Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { - Status status; - - WriteBatch* final = NULL; - { - MutexLock l(&mutex_); - status = MakeRoomForWrite(false); // May temporarily release lock and wait - - uint64_t last_sequence = versions_->LastSequence(); - if (status.ok()) { - status = HandleLargeValues(last_sequence + 1, updates, &final); - } - if (status.ok()) { - WriteBatchInternal::SetSequence(final, last_sequence + 1); - last_sequence += WriteBatchInternal::Count(final); - versions_->SetLastSequence(last_sequence); - - // Add to log and apply to memtable - status = log_->AddRecord(WriteBatchInternal::Contents(final)); - if (status.ok() && options.sync) { - status = logfile_->Sync(); - } - if (status.ok()) { - status = WriteBatchInternal::InsertInto(final, mem_); - } - } - - if (options.post_write_snapshot != NULL) { - *options.post_write_snapshot = - status.ok() ? snapshots_.New(last_sequence) : NULL; - } - } - if (final != updates) { - delete final; - } - - return status; -} - -Status DBImpl::MakeRoomForWrite(bool force) { - mutex_.AssertHeld(); - Status s; - while (true) { - if (!bg_error_.ok()) { - // Yield previous error - s = bg_error_; - break; - } else if (!force && - (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { - // There is room in current memtable - break; - } else if (imm_ != NULL) { - // We have filled up the current memtable, but the previous - // one is still being compacted, so we wait. 
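In outline, each pass through the loop in MakeRoomForWrite() resolves to one of four cases (summarizing the branches above and below):

  //   bg_error_ set          -> return the deferred background error
  //   memtable under limit   -> done; the write can proceed
  //   imm_ still compacting  -> block on compacting_cv_ and retry
  //   otherwise              -> open a new log file, move mem_ to imm_,
  //                             install a fresh memtable, and schedule
  //                             a background compaction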
- compacting_cv_.Wait(); - } else { - // Attempt to switch to a new memtable and trigger compaction of old - assert(versions_->PrevLogNumber() == 0); - uint64_t new_log_number = versions_->NewFileNumber(); - WritableFile* lfile = NULL; - s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); - if (!s.ok()) { - break; - } - VersionEdit edit; - edit.SetPrevLogNumber(versions_->LogNumber()); - edit.SetLogNumber(new_log_number); - s = versions_->LogAndApply(&edit, NULL); - if (!s.ok()) { - delete lfile; - env_->DeleteFile(LogFileName(dbname_, new_log_number)); - break; - } - delete log_; - delete logfile_; - logfile_ = lfile; - log_ = new log::Writer(lfile); - imm_ = mem_; - has_imm_.Release_Store(imm_); - mem_ = new MemTable(internal_comparator_); - force = false; // Do not force another compaction if have room - MaybeScheduleCompaction(); - } - } - return s; -} - -bool DBImpl::HasLargeValues(const WriteBatch& batch) const { - if (WriteBatchInternal::ByteSize(&batch) >= options_.large_value_threshold) { - for (WriteBatchInternal::Iterator it(batch); !it.Done(); it.Next()) { - if (it.op() == kTypeValue && - it.value().size() >= options_.large_value_threshold) { - return true; - } - } - } - return false; -} - -// Given "raw_value", determines the appropriate compression format to use -// and stores the data that should be written to the large value file in -// "*file_bytes", and sets "*ref" to the appropriate large value reference. -// May use "*scratch" as backing store for "*file_bytes". -void DBImpl::MaybeCompressLargeValue( - const Slice& raw_value, - Slice* file_bytes, - std::string* scratch, - LargeValueRef* ref) { - switch (options_.compression) { - case kSnappyCompression: { - if (port::Snappy_Compress(raw_value.data(), raw_value.size(), scratch) && - (scratch->size() < (raw_value.size() / 8) * 7)) { - *file_bytes = *scratch; - *ref = LargeValueRef::Make(raw_value, kSnappyCompression); - return; - } - - // Less than 12.5% compression: just leave as uncompressed data - break; - } - case kNoCompression: - // Use default code outside of switch - break; - } - // Store as uncompressed data - *file_bytes = raw_value; - *ref = LargeValueRef::Make(raw_value, kNoCompression); -} - -Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq, - WriteBatch* updates, - WriteBatch** final) { - if (!HasLargeValues(*updates)) { - // Fast path: no large values found - *final = updates; - } else { - // Copy *updates to a new WriteBatch, replacing the references to - *final = new WriteBatch; - SequenceNumber seq = assigned_seq; - for (WriteBatchInternal::Iterator it(*updates); !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeValue: - if (it.value().size() < options_.large_value_threshold) { - (*final)->Put(it.key(), it.value()); - } else { - std::string scratch; - Slice file_bytes; - LargeValueRef large_ref; - MaybeCompressLargeValue( - it.value(), &file_bytes, &scratch, &large_ref); - InternalKey ikey(it.key(), seq, kTypeLargeValueRef); - if (versions_->RegisterLargeValueRef( - large_ref, versions_->LogNumber(), ikey)) { - // TODO(opt): avoid holding the lock here (but be careful about - // another thread doing a Write and switching logs or - // having us get a different "assigned_seq" value). 
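For orientation, the remainder of HandleLargeValues() (continuing below) turns each oversized value into an indirect reference, roughly in three steps:

  //   1. MaybeCompressLargeValue(): Snappy-compress, kept only when it
  //      saves more than 12.5% (the 7/8 test above)
  //   2. write the bytes to a temp file, then rename it to the name
  //      given by LargeValueFileName(dbname_, large_ref)
  //   3. replace the value in the rewritten batch with a fixed-size
  //      reference via WriteBatchInternal::PutLargeValueRef()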
- - uint64_t tmp_number = versions_->NewFileNumber(); - pending_outputs_.insert(tmp_number); - std::string tmp = TempFileName(dbname_, tmp_number); - WritableFile* file; - Status s = env_->NewWritableFile(tmp, &file); - if (!s.ok()) { - return s; // Caller will delete *final - } - - file->Append(file_bytes); - - s = file->Close(); - delete file; - - if (s.ok()) { - const std::string fname = - LargeValueFileName(dbname_, large_ref); - s = env_->RenameFile(tmp, fname); - } else { - Log(env_, options_.info_log, "Write large value: %s", - s.ToString().c_str()); - } - pending_outputs_.erase(tmp_number); - - if (!s.ok()) { - env_->DeleteFile(tmp); // Cleanup; intentionally ignoring error - return s; // Caller will delete *final - } - } - - // Put an indirect reference in the write batch in place - // of large value - WriteBatchInternal::PutLargeValueRef(*final, it.key(), large_ref); - } - break; - case kTypeLargeValueRef: - return Status::Corruption("Corrupted write batch"); - break; - case kTypeDeletion: - (*final)->Delete(it.key()); - break; - } - seq = seq + 1; - } - } - return Status::OK(); -} - -bool DBImpl::GetProperty(const Slice& property, std::string* value) { - value->clear(); - - MutexLock l(&mutex_); - Slice in = property; - Slice prefix("leveldb."); - if (!in.starts_with(prefix)) return false; - in.remove_prefix(prefix.size()); - - if (in.starts_with("num-files-at-level")) { - in.remove_prefix(strlen("num-files-at-level")); - uint64_t level; - bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); - if (!ok || level < 0 || level >= config::kNumLevels) { - return false; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "%d", versions_->NumLevelFiles(level)); - *value = buf; - return true; - } - } else if (in == "stats") { - char buf[200]; - snprintf(buf, sizeof(buf), - " Compactions\n" - "Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n" - "--------------------------------------------------\n" - ); - value->append(buf); - for (int level = 0; level < config::kNumLevels; level++) { - int files = versions_->NumLevelFiles(level); - if (stats_[level].micros > 0 || files > 0) { - snprintf( - buf, sizeof(buf), - "%3d %8d %8.0f %9.0f %8.0f %9.0f\n", - level, - files, - versions_->NumLevelBytes(level) / 1048576.0, - stats_[level].micros / 1e6, - stats_[level].bytes_read / 1048576.0, - stats_[level].bytes_written / 1048576.0); - value->append(buf); - } - } - return true; - } - - return false; -} - -void DBImpl::GetApproximateSizes( - const Range* range, int n, - uint64_t* sizes) { - // TODO(opt): better implementation - Version* v; - { - MutexLock l(&mutex_); - versions_->current()->Ref(); - v = versions_->current(); - } - - for (int i = 0; i < n; i++) { - // Convert user_key into a corresponding internal key. - InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); - uint64_t start = versions_->ApproximateOffsetOf(v, k1); - uint64_t limit = versions_->ApproximateOffsetOf(v, k2); - sizes[i] = (limit >= start ? 
limit - start : 0); - } - - { - MutexLock l(&mutex_); - v->Unref(); - } -} - -// Default implementations of convenience methods that subclasses of DB -// can call if they wish -Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { - WriteBatch batch; - batch.Put(key, value); - return Write(opt, &batch); -} - -Status DB::Delete(const WriteOptions& opt, const Slice& key) { - WriteBatch batch; - batch.Delete(key); - return Write(opt, &batch); -} - -DB::~DB() { } - -Status DB::Open(const Options& options, const std::string& dbname, - DB** dbptr) { - *dbptr = NULL; - - DBImpl* impl = new DBImpl(options, dbname); - impl->mutex_.Lock(); - VersionEdit edit; - Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists - if (s.ok()) { - uint64_t new_log_number = impl->versions_->NewFileNumber(); - WritableFile* lfile; - s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), - &lfile); - if (s.ok()) { - edit.SetLogNumber(new_log_number); - impl->logfile_ = lfile; - impl->log_ = new log::Writer(lfile); - s = impl->versions_->LogAndApply(&edit, NULL); - } - if (s.ok()) { - impl->DeleteObsoleteFiles(); - } - } - impl->mutex_.Unlock(); - if (s.ok()) { - *dbptr = impl; - } else { - delete impl; - } - return s; -} - -Status DestroyDB(const std::string& dbname, const Options& options) { - Env* env = options.env; - std::vector filenames; - // Ignore error in case directory does not exist - env->GetChildren(dbname, &filenames); - if (filenames.empty()) { - return Status::OK(); - } - - FileLock* lock; - Status result = env->LockFile(LockFileName(dbname), &lock); - if (result.ok()) { - uint64_t number; - LargeValueRef large_ref; - FileType type; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &large_ref, &type)) { - Status del = env->DeleteFile(dbname + "/" + filenames[i]); - if (result.ok() && !del.ok()) { - result = del; - } - } - } - env->UnlockFile(lock); // Ignore error since state is already gone - env->DeleteFile(LockFileName(dbname)); - env->DeleteDir(dbname); // Ignore error in case dir contains other files - } - return result; -} - -} diff --git a/db/db_impl.h b/db/db_impl.h deleted file mode 100644 index 1f685f0..0000000 --- a/db/db_impl.h +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
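The DB::Open / DestroyDB pair above completes the public lifecycle of a database directory. A minimal client sketch using only the API shown in this diff; the path and assertions are illustrative, not part of the original source:

#include <cassert>
#include <string>
#include "leveldb/db.h"

void ExampleOpenPutGet() {
  leveldb::Options options;
  options.create_if_missing = true;   // honored by Recover() inside DB::Open
  leveldb::DB* db = NULL;
  leveldb::Status s = leveldb::DB::Open(options, "/tmp/leveldb_example", &db);
  assert(s.ok());
  assert(db->Put(leveldb::WriteOptions(), "k", "v").ok());
  std::string value;
  s = db->Get(leveldb::ReadOptions(), "k", &value);   // NotFound when absent
  assert(s.ok() && value == "v");
  delete db;                                          // close before destroying
  leveldb::DestroyDB("/tmp/leveldb_example", leveldb::Options());
}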
- -#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_ -#define STORAGE_LEVELDB_DB_DB_IMPL_H_ - -#include -#include "db/dbformat.h" -#include "db/log_writer.h" -#include "db/snapshot.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "port/port.h" - -namespace leveldb { - -class MemTable; -class TableCache; -class Version; -class VersionEdit; -class VersionSet; - -class DBImpl : public DB { - public: - DBImpl(const Options& options, const std::string& dbname); - virtual ~DBImpl(); - - // Implementations of the DB interface - virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); - virtual Status Delete(const WriteOptions&, const Slice& key); - virtual Status Write(const WriteOptions& options, WriteBatch* updates); - virtual Status Get(const ReadOptions& options, - const Slice& key, - std::string* value); - virtual Iterator* NewIterator(const ReadOptions&); - virtual const Snapshot* GetSnapshot(); - virtual void ReleaseSnapshot(const Snapshot* snapshot); - virtual bool GetProperty(const Slice& property, std::string* value); - virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); - - // Extra methods (for testing) that are not in the public DB interface - - // Compact any files in the named level that overlap [begin,end] - void TEST_CompactRange( - int level, - const std::string& begin, - const std::string& end); - - // Force current memtable contents to be compacted. - Status TEST_CompactMemTable(); - - // Return an internal iterator over the current state of the database. - // The keys of this iterator are internal keys (see format.h). - // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator(); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes(); - - private: - friend class DB; - - Iterator* NewInternalIterator(const ReadOptions&, - SequenceNumber* latest_snapshot); - - Status NewDB(); - - // Recover the descriptor from persistent storage. May do a significant - // amount of work to recover recently logged updates. Any changes to - // be made to the descriptor are added to *edit. - Status Recover(VersionEdit* edit); - - void MaybeIgnoreError(Status* s) const; - - // Delete any unneeded files and stale in-memory entries. - void DeleteObsoleteFiles(); - - // Called when an iterator over a particular version of the - // descriptor goes away. - static void Unref(void* arg1, void* arg2); - - // Compact the in-memory write buffer to disk. Switches to a new - // log-file/memtable and writes a new descriptor iff successful. - Status CompactMemTable(); - - Status RecoverLogFile(uint64_t log_number, - VersionEdit* edit, - SequenceNumber* max_sequence); - - Status WriteLevel0Table(MemTable* mem, VersionEdit* edit); - - Status MakeRoomForWrite(bool force /* compact even if there is room? */); - bool HasLargeValues(const WriteBatch& batch) const; - - // Process data in "*updates" and return a status. "assigned_seq" - // is the sequence number assigned to the first mod in "*updates". - // If no large values are encountered, "*final" is set to "updates". - // If large values were encountered, registers the references of the - // large values with the VersionSet, writes the large values to - // files (if appropriate), and allocates a new WriteBatch with the - // large values replaced with indirect references and stores a - // pointer to the new WriteBatch in *final. 
If *final != updates on - // return, then the client should delete *final when no longer - // needed. Returns OK on success, and an appropriate error - // otherwise. - Status HandleLargeValues(SequenceNumber assigned_seq, - WriteBatch* updates, - WriteBatch** final); - - // Helper routine for HandleLargeValues - void MaybeCompressLargeValue( - const Slice& raw_value, - Slice* file_bytes, - std::string* scratch, - LargeValueRef* ref); - - struct CompactionState; - - void MaybeScheduleCompaction(); - static void BGWork(void* db); - void BackgroundCall(); - void BackgroundCompaction(); - void CleanupCompaction(CompactionState* compact); - Status DoCompactionWork(CompactionState* compact); - - Status OpenCompactionOutputFile(CompactionState* compact); - Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); - Status InstallCompactionResults(CompactionState* compact); - - // Constant after construction - Env* const env_; - const InternalKeyComparator internal_comparator_; - const Options options_; // options_.comparator == &internal_comparator_ - bool owns_info_log_; - bool owns_cache_; - const std::string dbname_; - - // table_cache_ provides its own synchronization - TableCache* table_cache_; - - // Lock over the persistent DB state. Non-NULL iff successfully acquired. - FileLock* db_lock_; - - // State below is protected by mutex_ - port::Mutex mutex_; - port::AtomicPointer shutting_down_; - port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_ - port::CondVar compacting_cv_; // Signalled when !compacting_ - MemTable* mem_; - MemTable* imm_; // Memtable being compacted - port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_ - WritableFile* logfile_; - log::Writer* log_; - SnapshotList snapshots_; - - // Set of table files to protect from deletion because they are - // part of ongoing compactions. - std::set pending_outputs_; - - // Has a background compaction been scheduled or is running? - bool bg_compaction_scheduled_; - - // Is there a compaction running? - bool compacting_; - - VersionSet* versions_; - - // Have we encountered a background error in paranoid mode? - Status bg_error_; - - // Per level compaction stats. stats_[level] stores the stats for - // compactions that produced data for the specified "level". - struct CompactionStats { - int64_t micros; - int64_t bytes_read; - int64_t bytes_written; - - CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { } - - void Add(const CompactionStats& c) { - this->micros += c.micros; - this->bytes_read += c.bytes_read; - this->bytes_written += c.bytes_written; - } - }; - CompactionStats stats_[config::kNumLevels]; - - // No copying allowed - DBImpl(const DBImpl&); - void operator=(const DBImpl&); - - const Comparator* user_comparator() const { - return internal_comparator_.user_comparator(); - } -}; - -// Sanitize db options. The caller should delete result.info_log if -// it is not equal to src.info_log. -extern Options SanitizeOptions(const std::string& db, - const InternalKeyComparator* icmp, - const Options& src); - -} - -#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_ diff --git a/db/db_iter.cc b/db/db_iter.cc deleted file mode 100644 index 31c2a38..0000000 --- a/db/db_iter.cc +++ /dev/null @@ -1,397 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
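The HandleLargeValues contract spelled out in db_impl.h above leaves ownership of *final with the caller. A sketch of the calling pattern; impl, seq, and batch are illustrative stand-ins for caller state:

WriteBatch* final = NULL;
Status s = impl->HandleLargeValues(seq, batch, &final);  // may set final = batch
if (s.ok()) {
  // ... log and apply *final, exactly as DBImpl::Write() does ...
}
if (final != batch) {
  delete final;  // a rewritten batch was allocated on our behalf
}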
- -#include "db/db_iter.h" - -#include "db/filename.h" -#include "db/dbformat.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "port/port.h" -#include "util/logging.h" -#include "util/mutexlock.h" - -namespace leveldb { - -#if 0 -static void DumpInternalIter(Iterator* iter) { - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ParsedInternalKey k; - if (!ParseInternalKey(iter->key(), &k)) { - fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); - } else { - fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); - } - } -} -#endif - -namespace { - -// Memtables and sstables that make the DB representation contain -// (userkey,seq,type) => uservalue entries. DBIter -// combines multiple entries for the same userkey found in the DB -// representation into a single entry while accounting for sequence -// numbers, deletion markers, overwrites, etc. -class DBIter: public Iterator { - public: - // Which direction is the iterator currently moving? - // (1) When moving forward, the internal iterator is positioned at - // the exact entry that yields this->key(), this->value() - // (2) When moving backwards, the internal iterator is positioned - // just before all entries whose user key == this->key(). - enum Direction { - kForward, - kReverse - }; - - DBIter(const std::string* dbname, Env* env, - const Comparator* cmp, Iterator* iter, SequenceNumber s) - : dbname_(dbname), - env_(env), - user_comparator_(cmp), - iter_(iter), - sequence_(s), - large_(NULL), - direction_(kForward), - valid_(false) { - } - virtual ~DBIter() { - delete iter_; - delete large_; - } - virtual bool Valid() const { return valid_; } - virtual Slice key() const { - assert(valid_); - return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_; - } - virtual Slice value() const { - assert(valid_); - Slice raw_value = (direction_ == kForward) ? 
iter_->value() : saved_value_; - if (large_ == NULL) { - return raw_value; - } else { - MutexLock l(&large_->mutex); - if (!large_->produced) { - ReadIndirectValue(raw_value); - } - return large_->value; - } - } - virtual Status status() const { - if (status_.ok()) { - if (large_ != NULL && !large_->status.ok()) return large_->status; - return iter_->status(); - } else { - return status_; - } - } - - virtual void Next(); - virtual void Prev(); - virtual void Seek(const Slice& target); - virtual void SeekToFirst(); - virtual void SeekToLast(); - - private: - struct Large { - port::Mutex mutex; - std::string value; - bool produced; - Status status; - }; - - void FindNextUserEntry(bool skipping, std::string* skip); - void FindPrevUserEntry(); - bool ParseKey(ParsedInternalKey* key); - void ReadIndirectValue(Slice ref) const; - - inline void SaveKey(const Slice& k, std::string* dst) { - dst->assign(k.data(), k.size()); - } - - inline void ForgetLargeValue() { - if (large_ != NULL) { - delete large_; - large_ = NULL; - } - } - - inline void ClearSavedValue() { - if (saved_value_.capacity() > 1048576) { - std::string empty; - swap(empty, saved_value_); - } else { - saved_value_.clear(); - } - } - - const std::string* const dbname_; - Env* const env_; - const Comparator* const user_comparator_; - Iterator* const iter_; - SequenceNumber const sequence_; - - Status status_; - std::string saved_key_; // == current key when direction_==kReverse - std::string saved_value_; // == current raw value when direction_==kReverse - Large* large_; // Non-NULL if value is an indirect reference - Direction direction_; - bool valid_; - - // No copying allowed - DBIter(const DBIter&); - void operator=(const DBIter&); -}; - -inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { - if (!ParseInternalKey(iter_->key(), ikey)) { - status_ = Status::Corruption("corrupted internal key in DBIter"); - return false; - } else { - return true; - } -} - -void DBIter::Next() { - assert(valid_); - ForgetLargeValue(); - - if (direction_ == kReverse) { // Switch directions? - direction_ = kForward; - // iter_ is pointing just before the entries for this->key(), - // so advance into the range of entries for this->key() and then - // use the normal skipping code below. - if (!iter_->Valid()) { - iter_->SeekToFirst(); - } else { - iter_->Next(); - } - if (!iter_->Valid()) { - valid_ = false; - saved_key_.clear(); - return; - } - } - - // Temporarily use saved_key_ as storage for key to skip. - std::string* skip = &saved_key_; - SaveKey(ExtractUserKey(iter_->key()), skip); - FindNextUserEntry(true, skip); -} - -void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { - // Loop until we hit an acceptable entry to yield - assert(iter_->Valid()); - assert(direction_ == kForward); - assert(large_ == NULL); - do { - ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - switch (ikey.type) { - case kTypeDeletion: - // Arrange to skip all upcoming entries for this key since - // they are hidden by this deletion. 
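// (Worked example, not in the diff: with sequence_ == 10 and internal entries,
// newest first, foo@8:DELETION, foo@5:VALUE("v1"), bar@4:VALUE("v2"), the
// deletion saves *skip = "foo"; foo@5 is then hidden by the Compare(...) <= 0
// test below, and the iterator yields bar -> "v2".)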
- SaveKey(ikey.user_key, skip); - skipping = true; - break; - case kTypeValue: - case kTypeLargeValueRef: - if (skipping && - user_comparator_->Compare(ikey.user_key, *skip) <= 0) { - // Entry hidden - } else { - valid_ = true; - saved_key_.clear(); - if (ikey.type == kTypeLargeValueRef) { - large_ = new Large; - large_->produced = false; - } - return; - } - break; - } - } - iter_->Next(); - } while (iter_->Valid()); - saved_key_.clear(); - valid_ = false; -} - -void DBIter::Prev() { - assert(valid_); - ForgetLargeValue(); - - if (direction_ == kForward) { // Switch directions? - // iter_ is pointing at the current entry. Scan backwards until - // the key changes so we can use the normal reverse scanning code. - assert(iter_->Valid()); // Otherwise valid_ would have been false - SaveKey(ExtractUserKey(iter_->key()), &saved_key_); - while (true) { - iter_->Prev(); - if (!iter_->Valid()) { - valid_ = false; - saved_key_.clear(); - ClearSavedValue(); - return; - } - if (user_comparator_->Compare(ExtractUserKey(iter_->key()), - saved_key_) < 0) { - break; - } - } - direction_ = kReverse; - } - - FindPrevUserEntry(); -} - -void DBIter::FindPrevUserEntry() { - assert(direction_ == kReverse); - assert(large_ == NULL); - - ValueType value_type = kTypeDeletion; - if (iter_->Valid()) { - SaveKey(ExtractUserKey(iter_->key()), &saved_key_); - do { - ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - if ((value_type != kTypeDeletion) && - user_comparator_->Compare(ikey.user_key, saved_key_) < 0) { - // We encountered a non-deleted value in entries for previous keys, - break; - } - value_type = ikey.type; - if (value_type == kTypeDeletion) { - ClearSavedValue(); - } else { - Slice raw_value = iter_->value(); - if (saved_value_.capacity() > raw_value.size() + 1048576) { - std::string empty; - swap(empty, saved_value_); - } - saved_value_.assign(raw_value.data(), raw_value.size()); - } - } - iter_->Prev(); - } while (iter_->Valid()); - } - - if (value_type == kTypeDeletion) { - // End - valid_ = false; - saved_key_.clear(); - ClearSavedValue(); - direction_ = kForward; - } else { - valid_ = true; - if (value_type == kTypeLargeValueRef) { - large_ = new Large; - large_->produced = false; - } - } -} - -void DBIter::Seek(const Slice& target) { - direction_ = kForward; - ForgetLargeValue(); - ClearSavedValue(); - saved_key_.clear(); - AppendInternalKey( - &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); - iter_->Seek(saved_key_); - if (iter_->Valid()) { - FindNextUserEntry(false, &saved_key_ /* temporary storage */); - } else { - valid_ = false; - } -} - -void DBIter::SeekToFirst() { - direction_ = kForward; - ForgetLargeValue(); - ClearSavedValue(); - iter_->SeekToFirst(); - if (iter_->Valid()) { - FindNextUserEntry(false, &saved_key_ /* temporary storage */); - } else { - valid_ = false; - } -} - -void DBIter::SeekToLast() { - direction_ = kReverse; - ForgetLargeValue(); - ClearSavedValue(); - iter_->SeekToLast(); - FindPrevUserEntry(); -} - -void DBIter::ReadIndirectValue(Slice ref) const { - assert(!large_->produced); - large_->produced = true; - LargeValueRef large_ref; - if (ref.size() != LargeValueRef::ByteSize()) { - large_->status = Status::Corruption("malformed large value reference"); - return; - } - memcpy(large_ref.data, ref.data(), LargeValueRef::ByteSize()); - std::string fname = LargeValueFileName(*dbname_, large_ref); - RandomAccessFile* file; - Status s = env_->NewRandomAccessFile(fname, &file); - uint64_t file_size = 0; - if (s.ok()) { - s 
= env_->GetFileSize(fname, &file_size); - } - if (s.ok()) { - uint64_t value_size = large_ref.ValueSize(); - large_->value.resize(value_size); - Slice result; - s = file->Read(0, file_size, &result, - const_cast(large_->value.data())); - if (s.ok()) { - if (result.size() == file_size) { - switch (large_ref.compression_type()) { - case kNoCompression: { - if (result.data() != large_->value.data()) { - large_->value.assign(result.data(), result.size()); - } - break; - } - case kSnappyCompression: { - std::string uncompressed; - if (port::Snappy_Uncompress(result.data(), result.size(), - &uncompressed) && - uncompressed.size() == large_ref.ValueSize()) { - swap(uncompressed, large_->value); - } else { - s = Status::Corruption( - "Unable to read entire compressed large value file"); - } - } - } - } else { - s = Status::Corruption("Unable to read entire large value file"); - } - } - delete file; // Ignore errors on closing - } - if (!s.ok()) { - large_->value.clear(); - large_->status = s; - } -} - -} // anonymous namespace - -Iterator* NewDBIterator( - const std::string* dbname, - Env* env, - const Comparator* user_key_comparator, - Iterator* internal_iter, - const SequenceNumber& sequence) { - return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence); -} - -} diff --git a/db/db_iter.h b/db/db_iter.h deleted file mode 100644 index 195f3d3..0000000 --- a/db/db_iter.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_ -#define STORAGE_LEVELDB_DB_DB_ITER_H_ - -#include -#include "leveldb/db.h" -#include "db/dbformat.h" - -namespace leveldb { - -// Return a new iterator that converts internal keys (yielded by -// "*internal_iter") that were live at the specified "sequence" number -// into appropriate user keys. -extern Iterator* NewDBIterator( - const std::string* dbname, - Env* env, - const Comparator* user_key_comparator, - Iterator* internal_iter, - const SequenceNumber& sequence); - -} - -#endif // STORAGE_LEVELDB_DB_DB_ITER_H_ diff --git a/db/db_test.cc b/db/db_test.cc deleted file mode 100644 index 04de331..0000000 --- a/db/db_test.cc +++ /dev/null @@ -1,1211 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
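NewDBIterator, declared in db_iter.h above, is the seam between internal and user-visible iteration. A usage sketch; dbname, env, internal_iter, and seq stand in for caller state:

Iterator* it = NewDBIterator(&dbname, env, BytewiseComparator(),
                             internal_iter, seq);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
  // it->key() is a plain user key; deleted and shadowed entries are skipped.
}
delete it;  // DBIter's destructor also deletes internal_iter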
- -#include "leveldb/db.h" - -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} - -class DBTest { - public: - std::string dbname_; - Env* env_; - DB* db_; - - Options last_options_; - - DBTest() : env_(Env::Default()) { - dbname_ = test::TmpDir() + "/db_test"; - DestroyDB(dbname_, Options()); - db_ = NULL; - Reopen(); - } - - ~DBTest() { - delete db_; - DestroyDB(dbname_, Options()); - } - - DBImpl* dbfull() { - return reinterpret_cast(db_); - } - - void Reopen(Options* options = NULL) { - ASSERT_OK(TryReopen(options)); - } - - void DestroyAndReopen(Options* options = NULL) { - delete db_; - db_ = NULL; - DestroyDB(dbname_, Options()); - ASSERT_OK(TryReopen(options)); - } - - Status TryReopen(Options* options) { - delete db_; - db_ = NULL; - Options opts; - if (options != NULL) { - opts = *options; - } else { - opts.create_if_missing = true; - } - last_options_ = opts; - - return DB::Open(opts, dbname_, &db_); - } - - Status Put(const std::string& k, const std::string& v) { - return db_->Put(WriteOptions(), k, v); - } - - Status Delete(const std::string& k) { - return db_->Delete(WriteOptions(), k); - } - - std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { - ReadOptions options; - options.snapshot = snapshot; - std::string result; - Status s = db_->Get(options, k, &result); - if (s.IsNotFound()) { - result = "NOT_FOUND"; - } else if (!s.ok()) { - result = s.ToString(); - } - return result; - } - - std::string AllEntriesFor(const Slice& user_key) { - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); - iter->Seek(target.Encode()); - std::string result; - if (!iter->status().ok()) { - result = iter->status().ToString(); - } else { - result = "[ "; - bool first = true; - while (iter->Valid()) { - ParsedInternalKey ikey; - if (!ParseInternalKey(iter->key(), &ikey)) { - result += "CORRUPTED"; - } else { - if (last_options_.comparator->Compare( - ikey.user_key, user_key) != 0) { - break; - } - if (!first) { - result += ", "; - } - first = false; - switch (ikey.type) { - case kTypeValue: - result += iter->value().ToString(); - break; - case kTypeLargeValueRef: - result += "LARGEVALUE(" + EscapeString(iter->value()) + ")"; - break; - case kTypeDeletion: - result += "DEL"; - break; - } - } - iter->Next(); - } - if (!first) { - result += " "; - } - result += "]"; - } - delete iter; - return result; - } - - int NumTableFilesAtLevel(int level) { - std::string property; - ASSERT_TRUE( - db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level), - &property)); - return atoi(property.c_str()); - } - - uint64_t Size(const Slice& start, const Slice& limit) { - Range r(start, limit); - uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); - return size; - } - - std::set LargeValueFiles() const { - // Return the set of large value files that exist in the database - std::vector filenames; - env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose - uint64_t number; - LargeValueRef large_ref; - FileType type; - std::set live; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &large_ref, &type) && - type 
== kLargeValueFile) { - fprintf(stderr, " live: %s\n", - LargeValueRefToFilenameString(large_ref).c_str()); - live.insert(large_ref); - } - } - fprintf(stderr, "Found %d live large value files\n", (int)live.size()); - return live; - } - - void Compact(const Slice& start, const Slice& limit) { - dbfull()->TEST_CompactMemTable(); - int max_level_with_files = 1; - for (int level = 1; level < config::kNumLevels; level++) { - if (NumTableFilesAtLevel(level) > 0) { - max_level_with_files = level; - } - } - for (int level = 0; level < max_level_with_files; level++) { - dbfull()->TEST_CompactRange(level, "", "~"); - } - } - - void DumpFileCounts(const char* label) { - fprintf(stderr, "---\n%s:\n", label); - fprintf(stderr, "maxoverlap: %lld\n", - static_cast( - dbfull()->TEST_MaxNextLevelOverlappingBytes())); - for (int level = 0; level < config::kNumLevels; level++) { - int num = NumTableFilesAtLevel(level); - if (num > 0) { - fprintf(stderr, " level %3d : %d files\n", level, num); - } - } - } - - std::string IterStatus(Iterator* iter) { - std::string result; - if (iter->Valid()) { - result = iter->key().ToString() + "->" + iter->value().ToString(); - } else { - result = "(invalid)"; - } - return result; - } -}; - -TEST(DBTest, Empty) { - ASSERT_TRUE(db_ != NULL); - ASSERT_EQ("NOT_FOUND", Get("foo")); -} - -TEST(DBTest, ReadWrite) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - ASSERT_EQ("v3", Get("foo")); - ASSERT_EQ("v2", Get("bar")); -} - -TEST(DBTest, PutDeleteGet) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - ASSERT_OK(db_->Delete(WriteOptions(), "foo")); - ASSERT_EQ("NOT_FOUND", Get("foo")); -} - -TEST(DBTest, IterEmpty) { - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("foo"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterSingle) { - ASSERT_OK(Put("a", "va")); - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterMulti) { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", "vb")); - ASSERT_OK(Put("c", "vc")); - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - 
ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("ax"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Seek("z"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - // Switch from reverse to forward - iter->SeekToLast(); - iter->Prev(); - iter->Prev(); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - // Switch from forward to reverse - iter->SeekToFirst(); - iter->Next(); - iter->Next(); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - // Make sure iter stays at snapshot - ASSERT_OK(Put("a", "va2")); - ASSERT_OK(Put("a2", "va3")); - ASSERT_OK(Put("b", "vb2")); - ASSERT_OK(Put("c", "vc2")); - ASSERT_OK(Delete("b")); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterSmallAndLargeMix) { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", std::string(100000, 'b'))); - ASSERT_OK(Put("c", "vc")); - ASSERT_OK(Put("d", std::string(100000, 'd'))); - ASSERT_OK(Put("e", std::string(100000, 'e'))); - - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, Recover) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("baz", "v5")); - - Reopen(); - ASSERT_EQ("v1", Get("foo")); - - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v5", Get("baz")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - - Reopen(); - ASSERT_EQ("v3", Get("foo")); - ASSERT_OK(Put("foo", "v4")); - ASSERT_EQ("v4", Get("foo")); - ASSERT_EQ("v2", Get("bar")); - ASSERT_EQ("v5", Get("baz")); -} - -TEST(DBTest, RecoveryWithEmptyLog) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("foo", "v2")); - Reopen(); - Reopen(); - ASSERT_OK(Put("foo", "v3")); - Reopen(); - ASSERT_EQ("v3", Get("foo")); -} - -static std::string Key(int i) { - char buf[100]; - snprintf(buf, 
sizeof(buf), "key%06d", i); - return std::string(buf); -} - -TEST(DBTest, MinorCompactionsHappen) { - Options options; - options.write_buffer_size = 10000; - Reopen(&options); - - const int N = 500; - - int starting_num_tables = NumTableFilesAtLevel(0); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); - } - int ending_num_tables = NumTableFilesAtLevel(0); - ASSERT_GT(ending_num_tables, starting_num_tables); - - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); - } - - Reopen(); - - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); - } -} - -TEST(DBTest, RecoverWithLargeLog) { - { - Options options; - options.large_value_threshold = 1048576; - Reopen(&options); - ASSERT_OK(Put("big1", std::string(200000, '1'))); - ASSERT_OK(Put("big2", std::string(200000, '2'))); - ASSERT_OK(Put("small3", std::string(10, '3'))); - ASSERT_OK(Put("small4", std::string(10, '4'))); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - } - - // Make sure that if we re-open with a small write buffer size that - // we flush table files in the middle of a large log file. - Options options; - options.write_buffer_size = 100000; - options.large_value_threshold = 1048576; - Reopen(&options); - ASSERT_EQ(NumTableFilesAtLevel(0), 3); - ASSERT_EQ(std::string(200000, '1'), Get("big1")); - ASSERT_EQ(std::string(200000, '2'), Get("big2")); - ASSERT_EQ(std::string(10, '3'), Get("small3")); - ASSERT_EQ(std::string(10, '4'), Get("small4")); - ASSERT_GT(NumTableFilesAtLevel(0), 1); -} - -TEST(DBTest, CompactionsGenerateMultipleFiles) { - Options options; - options.write_buffer_size = 100000000; // Large write buffer - options.large_value_threshold = 1048576; - Reopen(&options); - - Random rnd(301); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - std::vector values; - for (int i = 0; i < 80; i++) { - values.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put(Key(i), values[i])); - } - - // Reopening moves updates to level-0 - Reopen(&options); - dbfull()->TEST_CompactRange(0, "", Key(100000)); - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 1); - for (int i = 0; i < 80; i++) { - ASSERT_EQ(Get(Key(i)), values[i]); - } -} - -TEST(DBTest, SparseMerge) { - Options options; - options.compression = kNoCompression; - Reopen(&options); - - // Suppose there is: - // small amount of data with prefix A - // large amount of data with prefix B - // small amount of data with prefix C - // and that recent updates have made small changes to all three prefixes. - // Check that we do not do a compaction that merges all of B in one shot. - const std::string value(1000, 'x'); - Put("A", "va"); - // Write approximately 100MB of "B" values - for (int i = 0; i < 100000; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(key, value); - } - Put("C", "vc"); - Compact("", "z"); - - // Make sparse update - Put("A", "va2"); - Put("B100", "bvalue2"); - Put("C", "vc2"); - dbfull()->TEST_CompactMemTable(); - - // Compactions should not cause us to create a situation where - // a file overlaps too much data at the next level. 
- ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(0, "", "z"); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); -} - -static bool Between(uint64_t val, uint64_t low, uint64_t high) { - bool result = (val >= low) && (val <= high); - if (!result) { - fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", - (unsigned long long)(val), - (unsigned long long)(low), - (unsigned long long)(high)); - } - return result; -} - -TEST(DBTest, ApproximateSizes) { - for (int test = 0; test < 2; test++) { - // test==0: default large_value_threshold - // test==1: 1 MB large_value_threshold - Options options; - options.large_value_threshold = (test == 0) ? 65536 : 1048576; - options.write_buffer_size = 100000000; // Large write buffer - options.compression = kNoCompression; - DestroyAndReopen(); - - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - Reopen(&options); - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - const int N = 80; - Random rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); - } - if (test == 1) { - // 0 because GetApproximateSizes() does not account for memtable space for - // non-large values - ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); - } else { - ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000)); - ASSERT_TRUE(Between(Size(Key(20), Key(30)), - 100000*10, 100000*10 + 10000)); - } - - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); - - for (int compact_start = 0; compact_start < N; compact_start += 10) { - for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); - ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), - 100000 * (i+1), 100000 * (i+1) + 10000)); - ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), - 100000 * 10, 100000 * 10 + 10000)); - } - ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); - ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); - - dbfull()->TEST_CompactRange(0, - Key(compact_start), - Key(compact_start + 9)); - } - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 0); - } - } -} - -TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { - Options options; - options.large_value_threshold = 65536; - options.compression = kNoCompression; - Reopen(); - - Random rnd(301); - std::string big1 = RandomString(&rnd, 100000); - ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(2), big1)); - ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(4), big1)); - ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); - ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); - - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); - - ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); - ASSERT_TRUE(Between(Size("", Key(6)), 
240000, 241000)); - ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); - ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000)); - - ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); - - dbfull()->TEST_CompactRange(0, Key(0), Key(100)); - } -} - -TEST(DBTest, IteratorPinsRef) { - Put("foo", "hello"); - - // Get iterator that will yield the current contents of the DB. - Iterator* iter = db_->NewIterator(ReadOptions()); - - // Write to force compactions - Put("foo", "newvalue1"); - for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values - } - Put("foo", "newvalue2"); - - iter->SeekToFirst(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("hello", iter->value().ToString()); - iter->Next(); - ASSERT_TRUE(!iter->Valid()); - delete iter; -} - -TEST(DBTest, Snapshot) { - Put("foo", "v1"); - const Snapshot* s1 = db_->GetSnapshot(); - Put("foo", "v2"); - const Snapshot* s2 = db_->GetSnapshot(); - Put("foo", "v3"); - const Snapshot* s3 = db_->GetSnapshot(); - - Put("foo", "v4"); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v3", Get("foo", s3)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s3); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s1); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s2); - ASSERT_EQ("v4", Get("foo")); -} - -TEST(DBTest, HiddenValuesAreRemoved) { - Random rnd(301); - std::string big = RandomString(&rnd, 50000); - Put("foo", big); - Put("pastfoo", "v"); - const Snapshot* snapshot = db_->GetSnapshot(); - Put("foo", "tiny"); - Put("pastfoo2", "v2"); // Advance sequence number one more - - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_GT(NumTableFilesAtLevel(0), 0); - - ASSERT_EQ(big, Get("foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); - db_->ReleaseSnapshot(snapshot); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); - dbfull()->TEST_CompactRange(0, "", "x"); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GE(NumTableFilesAtLevel(1), 1); - dbfull()->TEST_CompactRange(1, "", "x"); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - - ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); -} - -TEST(DBTest, DeletionMarkers1) { - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - dbfull()->TEST_CompactRange(0, "", "z"); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file - Delete("foo"); - Put("foo", "v2"); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - dbfull()->TEST_CompactRange(0, "", "z"); - // DEL eliminated, but v1 remains because we aren't compacting that level - // (DEL can be eliminated because v2 hides v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(1, "", "z"); - // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. - // (as is v1). 
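// (Worked example, not in the diff: merging the L1 file holding v2 with the
// L2 file holding v1 is a base-level compaction for "foo", so the shadowed v1
// is dropped and only [ v2 ] survives, as asserted below.)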
- ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); -} - -TEST(DBTest, DeletionMarkers2) { - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - dbfull()->TEST_CompactRange(0, "", "z"); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file - Delete("foo"); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(0, "", "z"); - // DEL kept: L2 file overlaps - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(1, "", "z"); - // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. - // (as is v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); -} - -TEST(DBTest, ComparatorCheck) { - class NewComparator : public Comparator { - public: - virtual const char* Name() const { return "leveldb.NewComparator"; } - virtual int Compare(const Slice& a, const Slice& b) const { - return BytewiseComparator()->Compare(a, b); - } - virtual void FindShortestSeparator(std::string* s, const Slice& l) const { - BytewiseComparator()->FindShortestSeparator(s, l); - } - virtual void FindShortSuccessor(std::string* key) const { - BytewiseComparator()->FindShortSuccessor(key); - } - }; - NewComparator cmp; - Options new_options; - new_options.comparator = &cmp; - Status s = TryReopen(&new_options); - ASSERT_TRUE(!s.ok()); - ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) - << s.ToString(); -} - -static bool LargeValuesOK(DBTest* db, - const std::set& expected) { - std::set actual = db->LargeValueFiles(); - if (actual.size() != expected.size()) { - fprintf(stderr, "Sets differ in size: %d vs %d\n", - (int)actual.size(), (int)expected.size()); - return false; - } - for (std::set::const_iterator it = expected.begin(); - it != expected.end(); - ++it) { - if (actual.count(*it) != 1) { - fprintf(stderr, " key '%s' not found in actual set\n", - LargeValueRefToFilenameString(*it).c_str()); - return false; - } - } - return true; -} - -TEST(DBTest, LargeValues1) { - Options options; - options.large_value_threshold = 10000; - Reopen(&options); - - Random rnd(301); - - std::string big1; - test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible - std::set expected; - - ASSERT_OK(Put("big1", big1)); - expected.insert(LargeValueRef::Make(big1, kNoCompression)); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(Delete("big1")); - ASSERT_TRUE(LargeValuesOK(this, expected)); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - // No handling of deletion markers on memtable compactions, so big1 remains - ASSERT_TRUE(LargeValuesOK(this, expected)); - - dbfull()->TEST_CompactRange(0, "", "z"); - expected.erase(LargeValueRef::Make(big1, kNoCompression)); - ASSERT_TRUE(LargeValuesOK(this, expected)); -} - -static bool SnappyCompressionSupported() { - std::string out; - Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Snappy_Compress(in.data(), in.size(), &out); -} - -TEST(DBTest, LargeValues2) { - Options options; - options.large_value_threshold = 10000; - Reopen(&options); - - Random rnd(301); - - std::string big1, big2; - test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible - test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible - std::set expected; - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(Put("big1", big1)); - expected.insert(LargeValueRef::Make(big1, kNoCompression)); - ASSERT_EQ(big1, Get("big1")); - 
ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(Put("big2", big2)); - ASSERT_EQ(big2, Get("big2")); - if (SnappyCompressionSupported()) { - expected.insert(LargeValueRef::Make(big2, kSnappyCompression)); - } else { - expected.insert(LargeValueRef::Make(big2, kNoCompression)); - } - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - dbfull()->TEST_CompactRange(0, "", "z"); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(Put("big2", big2)); - ASSERT_OK(Put("big2_b", big2)); - ASSERT_EQ(big1, Get("big1")); - ASSERT_EQ(big2, Get("big2")); - ASSERT_EQ(big2, Get("big2_b")); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(Delete("big1")); - ASSERT_EQ("NOT_FOUND", Get("big1")); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_TRUE(LargeValuesOK(this, expected)); - dbfull()->TEST_CompactRange(0, "", "z"); - expected.erase(LargeValueRef::Make(big1, kNoCompression)); - ASSERT_TRUE(LargeValuesOK(this, expected)); - dbfull()->TEST_CompactRange(1, "", "z"); - - ASSERT_OK(Delete("big2")); - ASSERT_EQ("NOT_FOUND", Get("big2")); - ASSERT_EQ(big2, Get("big2_b")); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_TRUE(LargeValuesOK(this, expected)); - dbfull()->TEST_CompactRange(0, "", "z"); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - // Make sure the large value refs survive a reload and compactions after - // the reload. - Reopen(); - ASSERT_TRUE(LargeValuesOK(this, expected)); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - dbfull()->TEST_CompactRange(0, "", "z"); - ASSERT_TRUE(LargeValuesOK(this, expected)); -} - -TEST(DBTest, LargeValues3) { - // Make sure we don't compress values if compression is disabled. - Options options; - options.large_value_threshold = 10000; - options.compression = kNoCompression; - Reopen(&options); - - Random rnd(301); - - std::string big1 = std::string(100000, 'x'); // Very compressible - std::set<LargeValueRef> expected; - - ASSERT_OK(Put("big1", big1)); - ASSERT_EQ(big1, Get("big1")); - expected.insert(LargeValueRef::Make(big1, kNoCompression)); - ASSERT_TRUE(LargeValuesOK(this, expected)); -} - - -TEST(DBTest, DBOpen_Options) { - std::string dbname = test::TmpDir() + "/db_options_test"; - DestroyDB(dbname, Options()); - - // Does not exist, and create_if_missing == false: error - DB* db = NULL; - Options opts; - opts.create_if_missing = false; - Status s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL); - ASSERT_TRUE(db == NULL); - - // Does not exist, and create_if_missing == true: OK - opts.create_if_missing = true; - s = DB::Open(opts, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != NULL); - - delete db; - db = NULL; - - // Does exist, and error_if_exists == true: error - opts.create_if_missing = false; - opts.error_if_exists = true; - s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL); - ASSERT_TRUE(db == NULL); - - // Does exist, and error_if_exists == false: OK - opts.create_if_missing = true; - opts.error_if_exists = false; - s = DB::Open(opts, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != NULL); - - delete db; - db = NULL; -} - -class ModelDB: public DB { - public: - explicit ModelDB(const Options& options): options_(options) { } - ~ModelDB() { } - virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { - return DB::Put(o, k, v); - } - virtual Status
Delete(const WriteOptions& o, const Slice& key) { - return DB::Delete(o, key); - } - virtual Status Get(const ReadOptions& options, - const Slice& key, std::string* value) { - assert(false); // Not implemented - return Status::NotFound(key); - } - virtual Iterator* NewIterator(const ReadOptions& options) { - if (options.snapshot == NULL) { - KVMap* saved = new KVMap; - *saved = map_; - return new ModelIter(saved, true); - } else { - const KVMap* snapshot_state = - reinterpret_cast<const KVMap*>(options.snapshot->number_); - return new ModelIter(snapshot_state, false); - } - } - virtual const Snapshot* GetSnapshot() { - KVMap* saved = new KVMap; - *saved = map_; - return snapshots_.New( - reinterpret_cast<SequenceNumber>(saved)); - } - - virtual void ReleaseSnapshot(const Snapshot* snapshot) { - const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_); - delete saved; - snapshots_.Delete(snapshot); - } - virtual Status Write(const WriteOptions& options, WriteBatch* batch) { - assert(options.post_write_snapshot == NULL); // Not supported - for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeValue: - map_[it.key().ToString()] = it.value().ToString(); - break; - case kTypeLargeValueRef: - assert(false); // Should not occur - break; - case kTypeDeletion: - map_.erase(it.key().ToString()); - break; - } - } - return Status::OK(); - } - - virtual bool GetProperty(const Slice& property, std::string* value) { - return false; - } - virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { - for (int i = 0; i < n; i++) { - sizes[i] = 0; - } - } - private: - typedef std::map<std::string, std::string> KVMap; - class ModelIter: public Iterator { - public: - ModelIter(const KVMap* map, bool owned) - : map_(map), owned_(owned), iter_(map_->end()) { - } - ~ModelIter() { - if (owned_) delete map_; - } - virtual bool Valid() const { return iter_ != map_->end(); } - virtual void SeekToFirst() { iter_ = map_->begin(); } - virtual void SeekToLast() { - if (map_->empty()) { - iter_ = map_->end(); - } else { - iter_ = map_->find(map_->rbegin()->first); - } - } - virtual void Seek(const Slice& k) { - iter_ = map_->lower_bound(k.ToString()); - } - virtual void Next() { ++iter_; } - virtual void Prev() { --iter_; } - virtual Slice key() const { return iter_->first; } - virtual Slice value() const { return iter_->second; } - virtual Status status() const { return Status::OK(); } - private: - const KVMap* const map_; - const bool owned_; // Do we own map_ - KVMap::const_iterator iter_; - }; - const Options options_; - KVMap map_; - SnapshotList snapshots_; -}; - -static std::string RandomKey(Random* rnd) { - int len = (rnd->OneIn(3) - ? 1 // Short sometimes to encourage collisions - : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); - return test::RandomKey(rnd, len); -} - -static bool CompareIterators(int step, - DB* model, - DB* db, - const Snapshot* model_snap, - const Snapshot* db_snap) { - ReadOptions options; - options.snapshot = model_snap; - Iterator* miter = model->NewIterator(options); - options.snapshot = db_snap; - Iterator* dbiter = db->NewIterator(options); - bool ok = true; - int count = 0; - for (miter->SeekToFirst(), dbiter->SeekToFirst(); - ok && miter->Valid() && dbiter->Valid(); - miter->Next(), dbiter->Next()) { - count++; - if (miter->key().compare(dbiter->key()) != 0) { - fprintf(stderr, "step %d: Key mismatch: '%s' vs.
'%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(dbiter->key()).c_str()); - ok = false; - break; - } - - if (miter->value().compare(dbiter->value()) != 0) { - fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(miter->value()).c_str(), - EscapeString(miter->value()).c_str()); - ok = false; - } - } - - if (ok) { - if (miter->Valid() != dbiter->Valid()) { - fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", - step, miter->Valid(), dbiter->Valid()); - ok = false; - } - } - fprintf(stderr, "%d entries compared: ok=%d\n", count, ok); - delete miter; - delete dbiter; - return ok; -} - -TEST(DBTest, Randomized) { - Random rnd(test::RandomSeed()); - ModelDB model(last_options_); - const int N = 10000; - const Snapshot* model_snap = NULL; - const Snapshot* db_snap = NULL; - std::string k, v; - for (int step = 0; step < N; step++) { - if (step % 100 == 0) { - fprintf(stderr, "Step %d of %d\n", step, N); - } - int p = rnd.Uniform(100); - if (p < 45) { // Put - k = RandomKey(&rnd); - v = RandomString(&rnd, - rnd.OneIn(20) - ? 100 + rnd.Uniform(100) - : rnd.Uniform(8)); - ASSERT_OK(model.Put(WriteOptions(), k, v)); - ASSERT_OK(db_->Put(WriteOptions(), k, v)); - - } else if (p < 90) { // Delete - k = RandomKey(&rnd); - ASSERT_OK(model.Delete(WriteOptions(), k)); - ASSERT_OK(db_->Delete(WriteOptions(), k)); - - - } else { // Multi-element batch - WriteBatch b; - const int num = rnd.Uniform(8); - for (int i = 0; i < num; i++) { - if (i == 0 || !rnd.OneIn(10)) { - k = RandomKey(&rnd); - } else { - // Periodically re-use the same key from the previous iter, so - // we have multiple entries in the write batch for the same key - } - if (rnd.OneIn(2)) { - v = RandomString(&rnd, rnd.Uniform(10)); - b.Put(k, v); - } else { - b.Delete(k); - } - } - ASSERT_OK(model.Write(WriteOptions(), &b)); - ASSERT_OK(db_->Write(WriteOptions(), &b)); - } - - if ((step % 100) == 0) { - ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); - ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); - // Save a snapshot from each DB this time that we'll use next - // time we compare things, to make sure the current state is - // preserved with the snapshot - if (model_snap != NULL) model.ReleaseSnapshot(model_snap); - if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); - - Reopen(); - ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); - - model_snap = model.GetSnapshot(); - db_snap = db_->GetSnapshot(); - } - } - if (model_snap != NULL) model.ReleaseSnapshot(model_snap); - if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/db/dbformat.cc b/db/dbformat.cc deleted file mode 100644 index 2664eb4..0000000 --- a/db/dbformat.cc +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include -#include "db/dbformat.h" -#include "port/port.h" -#include "util/coding.h" - -namespace leveldb { - -static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { - assert(seq <= kMaxSequenceNumber); - assert(t <= kValueTypeForSeek); - return (seq << 8) | t; -} - -void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { - result->append(key.user_key.data(), key.user_key.size()); - PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); -} - -std::string ParsedInternalKey::DebugString() const { - char buf[50]; - snprintf(buf, sizeof(buf), "' @ %llu : %d", - (unsigned long long) sequence, - int(type)); - std::string result = "'"; - result += user_key.ToString(); - result += buf; - return result; -} - -const char* InternalKeyComparator::Name() const { - return "leveldb.InternalKeyComparator"; -} - -int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { - // Order by: - // increasing user key (according to user-supplied comparator) - // decreasing sequence number - // decreasing type (though sequence# should be enough to disambiguate) - int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); - if (r == 0) { - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); - if (anum > bnum) { - r = -1; - } else if (anum < bnum) { - r = +1; - } - } - return r; -} - -void InternalKeyComparator::FindShortestSeparator( - std::string* start, - const Slice& limit) const { - // Attempt to shorten the user portion of the key - Slice user_start = ExtractUserKey(*start); - Slice user_limit = ExtractUserKey(limit); - std::string tmp(user_start.data(), user_start.size()); - user_comparator_->FindShortestSeparator(&tmp, user_limit); - if (user_comparator_->Compare(*start, tmp) < 0) { - // User key has become larger. Tack on the earliest possible - // number to the shortened user key. - PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); - assert(this->Compare(*start, tmp) < 0); - assert(this->Compare(tmp, limit) < 0); - start->swap(tmp); - } -} - -void InternalKeyComparator::FindShortSuccessor(std::string* key) const { - Slice user_key = ExtractUserKey(*key); - std::string tmp(user_key.data(), user_key.size()); - user_comparator_->FindShortSuccessor(&tmp); - if (user_comparator_->Compare(user_key, tmp) < 0) { - // User key has become larger. Tack on the earliest possible - // number to the shortened user key. 
- PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); - assert(this->Compare(*key, tmp) < 0); - key->swap(tmp); - } -} - -LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) { - LargeValueRef result; - port::SHA1_Hash(value.data(), value.size(), &result.data[0]); - EncodeFixed64(&result.data[20], value.size()); - result.data[28] = static_cast(ctype); - return result; -} - -std::string LargeValueRefToFilenameString(const LargeValueRef& h) { - assert(sizeof(h.data) == LargeValueRef::ByteSize()); - assert(sizeof(h.data) == 29); // So we can hardcode the array size of buf - static const char tohex[] = "0123456789abcdef"; - char buf[20*2]; - for (int i = 0; i < 20; i++) { - buf[2*i] = tohex[(h.data[i] >> 4) & 0xf]; - buf[2*i+1] = tohex[h.data[i] & 0xf]; - } - std::string result = std::string(buf, sizeof(buf)); - result += "-"; - result += NumberToString(h.ValueSize()); - result += "-"; - result += NumberToString(static_cast(h.compression_type())); - return result; -} - -static uint32_t hexvalue(char c) { - if (c >= '0' && c <= '9') { - return c - '0'; - } else if (c >= 'A' && c <= 'F') { - return 10 + c - 'A'; - } else { - assert(c >= 'a' && c <= 'f'); - return 10 + c - 'a'; - } -} - -bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) { - Slice in = s; - if (in.size() < 40) { - return false; - } - for (int i = 0; i < 20; i++) { - if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) { - return false; - } - unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]); - h->data[i] = c; - } - in.remove_prefix(40); - uint64_t value_size, ctype; - - if (ConsumeChar(&in, '-') && - ConsumeDecimalNumber(&in, &value_size) && - ConsumeChar(&in, '-') && - ConsumeDecimalNumber(&in, &ctype) && - in.empty() && - (ctype <= kSnappyCompression)) { - EncodeFixed64(&h->data[20], value_size); - h->data[28] = static_cast(ctype); - return true; - } else { - return false; - } -} - -} diff --git a/db/dbformat.h b/db/dbformat.h deleted file mode 100644 index 5f117f9..0000000 --- a/db/dbformat.h +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ -#define STORAGE_LEVELDB_DB_FORMAT_H_ - -#include -#include "leveldb/comparator.h" -#include "leveldb/db.h" -#include "leveldb/slice.h" -#include "leveldb/table_builder.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -// Grouping of constants. We may want to make some of these -// parameters set via options. -namespace config { -static const int kNumLevels = 7; -} - -class InternalKey; - -// Value types encoded as the last component of internal keys. -// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk -// data structures. -enum ValueType { - kTypeDeletion = 0x0, - kTypeValue = 0x1, - kTypeLargeValueRef = 0x2, -}; -// kValueTypeForSeek defines the ValueType that should be passed when -// constructing a ParsedInternalKey object for seeking to a particular -// sequence number (since we sort sequence numbers in decreasing order -// and the value type is embedded as the low 8 bits in the sequence -// number in internal keys, we need to use the highest-numbered -// ValueType, not the lowest). 
-static const ValueType kValueTypeForSeek = kTypeLargeValueRef; - -typedef uint64_t SequenceNumber; - -// We leave eight bits empty at the bottom so a type and sequence# -// can be packed together into 64-bits. -static const SequenceNumber kMaxSequenceNumber = - ((0x1ull << 56) - 1); - -struct ParsedInternalKey { - Slice user_key; - SequenceNumber sequence; - ValueType type; - - ParsedInternalKey() { } // Intentionally left uninitialized (for speed) - ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) - : user_key(u), sequence(seq), type(t) { } - std::string DebugString() const; -}; - -// Return the length of the encoding of "key". -inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { - return key.user_key.size() + 8; -} - -// Append the serialization of "key" to *result. -extern void AppendInternalKey(std::string* result, - const ParsedInternalKey& key); - -// Attempt to parse an internal key from "internal_key". On success, -// stores the parsed data in "*result", and returns true. -// -// On error, returns false, leaves "*result" in an undefined state. -extern bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result); - -// Returns the user key portion of an internal key. -inline Slice ExtractUserKey(const Slice& internal_key) { - assert(internal_key.size() >= 8); - return Slice(internal_key.data(), internal_key.size() - 8); -} - -inline ValueType ExtractValueType(const Slice& internal_key) { - assert(internal_key.size() >= 8); - const size_t n = internal_key.size(); - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); - unsigned char c = num & 0xff; - return static_cast(c); -} - -// A comparator for internal keys that uses a specified comparator for -// the user key portion and breaks ties by decreasing sequence number. -class InternalKeyComparator : public Comparator { - private: - const Comparator* user_comparator_; - public: - explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { } - virtual const char* Name() const; - virtual int Compare(const Slice& a, const Slice& b) const; - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const; - virtual void FindShortSuccessor(std::string* key) const; - - const Comparator* user_comparator() const { return user_comparator_; } - - int Compare(const InternalKey& a, const InternalKey& b) const; -}; - -// Modules in this directory should keep internal keys wrapped inside -// the following class instead of plain strings so that we do not -// incorrectly use string comparisons instead of an InternalKeyComparator. -class InternalKey { - private: - std::string rep_; - public: - InternalKey() { } // Leave rep_ as empty to indicate it is invalid - InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { - AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); - } - - void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } - Slice Encode() const { - assert(!rep_.empty()); - return rep_; - } - - Slice user_key() const { return ExtractUserKey(rep_); } - - void SetFrom(const ParsedInternalKey& p) { - rep_.clear(); - AppendInternalKey(&rep_, p); - } - - void Clear() { rep_.clear(); } -}; - -inline int InternalKeyComparator::Compare( - const InternalKey& a, const InternalKey& b) const { - return Compare(a.Encode(), b.Encode()); -} - -// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte -// uncompressed size, and a 1 byte CompressionType code. 
An -// encoded form of it is embedded in the filenames of large value -// files stored in the database, and the raw binary form is stored as -// the iter->value() result for values of type kTypeLargeValueRef in -// the table and log files that make up the database. -struct LargeValueRef { - char data[29]; - - // Initialize a large value ref for the given data - static LargeValueRef Make(const Slice& data, - CompressionType compression_type); - - // Initialize a large value ref from a serialized, 29-byte reference value - static LargeValueRef FromRef(const Slice& ref) { - LargeValueRef result; - assert(ref.size() == sizeof(result.data)); - memcpy(result.data, ref.data(), sizeof(result.data)); - return result; - } - - // Return the number of bytes in a LargeValueRef (not the - // number of bytes in the value referenced). - static size_t ByteSize() { return sizeof(LargeValueRef().data); } - - // Return the number of bytes in the value referenced by "*this". - uint64_t ValueSize() const { return DecodeFixed64(&data[20]); } - - CompressionType compression_type() const { - return static_cast(data[28]); - } - - bool operator==(const LargeValueRef& b) const { - return memcmp(data, b.data, sizeof(data)) == 0; - } - bool operator<(const LargeValueRef& b) const { - return memcmp(data, b.data, sizeof(data)) < 0; - } -}; - -// Convert the large value ref to a human-readable string suitable -// for embedding in a large value filename. -extern std::string LargeValueRefToFilenameString(const LargeValueRef& h); - -// Parse the large value filename string in "input" and store it in -// "*h". If successful, returns true. Otherwise returns false. -extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref); - -inline bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result) { - const size_t n = internal_key.size(); - if (n < 8) return false; - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); - unsigned char c = num & 0xff; - result->sequence = num >> 8; - result->type = static_cast(c); - result->user_key = Slice(internal_key.data(), n - 8); - return (c <= static_cast(kTypeLargeValueRef)); -} - -} - -#endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc deleted file mode 100644 index 702cbb4..0000000 --- a/db/dbformat_test.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
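The helpers declared in dbformat.h fully determine the internal key layout: the raw user key bytes followed by a fixed64 little-endian trailer packing (sequence << 8) | type. A compact round trip through those helpers, with arbitrary example values, before the tests below exercise the edge cases:

    std::string ikey;
    AppendInternalKey(&ikey, ParsedInternalKey(Slice("foo"), 100, kTypeValue));
    assert(ikey.size() == 3 + 8);                     // user key + 8-byte trailer
    assert(ExtractUserKey(Slice(ikey)) == Slice("foo"));

    ParsedInternalKey parsed;
    assert(ParseInternalKey(Slice(ikey), &parsed));   // inverts the encoding
    assert(parsed.sequence == 100);
    assert(parsed.type == kTypeValue);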
- -#include "db/dbformat.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -static std::string IKey(const std::string& user_key, - uint64_t seq, - ValueType vt) { - std::string encoded; - AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); - return encoded; -} - -static std::string Shorten(const std::string& s, const std::string& l) { - std::string result = s; - InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); - return result; -} - -static std::string ShortSuccessor(const std::string& s) { - std::string result = s; - InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); - return result; -} - -static void TestKey(const std::string& key, - uint64_t seq, - ValueType vt) { - std::string encoded = IKey(key, seq, vt); - - Slice in(encoded); - ParsedInternalKey decoded("", 0, kTypeValue); - - ASSERT_TRUE(ParseInternalKey(in, &decoded)); - ASSERT_EQ(key, decoded.user_key.ToString()); - ASSERT_EQ(seq, decoded.sequence); - ASSERT_EQ(vt, decoded.type); - - ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); -} - -class FormatTest { }; - -TEST(FormatTest, InternalKey_EncodeDecode) { - const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; - const uint64_t seq[] = { - 1, 2, 3, - (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, - (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, - (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 - }; - for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { - for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { - TestKey(keys[k], seq[s], kTypeValue); - TestKey("hello", 1, kTypeDeletion); - } - } -} - -TEST(FormatTest, InternalKeyShortSeparator) { - // When user keys are same - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 99, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 101, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeDeletion))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeLargeValueRef))); - - // When user keys are misordered - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("bar", 99, kTypeValue))); - - // When user keys are different, but correctly ordered - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - Shorten(IKey("foo", 100, kTypeValue), - IKey("hello", 200, kTypeValue))); - - // When start user key is prefix of limit user key - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foobar", 200, kTypeValue))); - - // When limit user key is prefix of start user key - ASSERT_EQ(IKey("foobar", 100, kTypeValue), - Shorten(IKey("foobar", 100, kTypeValue), - IKey("foo", 200, kTypeValue))); -} - -TEST(FormatTest, InternalKeyShortestSuccessor) { - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - ShortSuccessor(IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), - ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); -} - -TEST(FormatTest, SHA1) { - // Check that we are computing the same value as sha1. - // Note that the last two numbers are the length of the input and the - // compression type. 
- ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0", // SHA1, uncompr - LargeValueRefToFilenameString( - LargeValueRef::Make("hello", kNoCompression))); - ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, lwcompr - LargeValueRefToFilenameString( - LargeValueRef::Make("hello", kSnappyCompression))); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/db/filename.cc b/db/filename.cc deleted file mode 100644 index d21918c..0000000 --- a/db/filename.cc +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include -#include "db/filename.h" -#include "db/dbformat.h" -#include "leveldb/env.h" -#include "util/logging.h" - -namespace leveldb { - -static std::string MakeFileName(const std::string& name, uint64_t number, - const char* suffix) { - char buf[100]; - snprintf(buf, sizeof(buf), "/%06llu.%s", - static_cast(number), - suffix); - return name + buf; -} - -std::string LogFileName(const std::string& name, uint64_t number) { - assert(number > 0); - return MakeFileName(name, number, "log"); -} - -std::string TableFileName(const std::string& name, uint64_t number) { - assert(number > 0); - return MakeFileName(name, number, "sst"); -} - -std::string LargeValueFileName(const std::string& name, - const LargeValueRef& large_ref) { - std::string result = name + "/"; - result += LargeValueRefToFilenameString(large_ref); - result += ".val"; - return result; -} - -std::string DescriptorFileName(const std::string& dbname, uint64_t number) { - assert(number > 0); - char buf[100]; - snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", - static_cast(number)); - return dbname + buf; -} - -std::string CurrentFileName(const std::string& dbname) { - return dbname + "/CURRENT"; -} - -std::string LockFileName(const std::string& dbname) { - return dbname + "/LOCK"; -} - -std::string TempFileName(const std::string& dbname, uint64_t number) { - assert(number > 0); - return MakeFileName(dbname, number, "dbtmp"); -} - -std::string InfoLogFileName(const std::string& dbname) { - return dbname + "/LOG"; -} - -// Return the name of the old info log file for "dbname". 
-std::string OldInfoLogFileName(const std::string& dbname) { - return dbname + "/LOG.old"; -} - - -// Owned filenames have the form: -// dbname/CURRENT -// dbname/LOCK -// dbname/LOG -// dbname/LOG.old -// dbname/MANIFEST-[0-9]+ -// dbname/[0-9a-f]{20}-[0-9]+-[0-9]+.val -// dbname/[0-9]+.(log|sst) -bool ParseFileName(const std::string& fname, - uint64_t* number, - LargeValueRef* large_ref, - FileType* type) { - Slice rest(fname); - if (rest == "CURRENT") { - *number = 0; - *type = kCurrentFile; - } else if (rest == "LOCK") { - *number = 0; - *type = kDBLockFile; - } else if (rest == "LOG" || rest == "LOG.old") { - *number = 0; - *type = kInfoLogFile; - } else if (rest.size() >= 4 && - Slice(rest.data() + rest.size() - 4, 4) == ".val") { - LargeValueRef h; - if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4), - &h)) { - return false; - } - *large_ref = h; - *type = kLargeValueFile; - } else if (rest.starts_with("MANIFEST-")) { - rest.remove_prefix(strlen("MANIFEST-")); - uint64_t num; - if (!ConsumeDecimalNumber(&rest, &num)) { - return false; - } - if (!rest.empty()) { - return false; - } - *type = kDescriptorFile; - *number = num; - } else { - // Avoid strtoull() to keep filename format independent of the - // current locale - uint64_t num; - if (!ConsumeDecimalNumber(&rest, &num)) { - return false; - } - Slice suffix = rest; - if (suffix == Slice(".log")) { - *type = kLogFile; - } else if (suffix == Slice(".sst")) { - *type = kTableFile; - } else if (suffix == Slice(".dbtmp")) { - *type = kTempFile; - } else { - return false; - } - *number = num; - } - return true; -} - -Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number) { - // Remove leading "dbname/" and add newline to manifest file name - std::string manifest = DescriptorFileName(dbname, descriptor_number); - Slice contents = manifest; - assert(contents.starts_with(dbname + "/")); - contents.remove_prefix(dbname.size() + 1); - std::string tmp = TempFileName(dbname, descriptor_number); - Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp); - if (s.ok()) { - s = env->RenameFile(tmp, CurrentFileName(dbname)); - } - if (!s.ok()) { - env->DeleteFile(tmp); - } - return s; -} - -} diff --git a/db/filename.h b/db/filename.h deleted file mode 100644 index 81ab2fc..0000000 --- a/db/filename.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// File names used by DB code - -#ifndef STORAGE_LEVELDB_DB_FILENAME_H_ -#define STORAGE_LEVELDB_DB_FILENAME_H_ - -#include -#include -#include "leveldb/slice.h" -#include "leveldb/status.h" -#include "port/port.h" - -namespace leveldb { - -class Env; -struct LargeValueRef; - -enum FileType { - kLogFile, - kDBLockFile, - kTableFile, - kLargeValueFile, - kDescriptorFile, - kCurrentFile, - kTempFile, - kInfoLogFile, // Either the current one, or an old one -}; - -// Return the name of the log file with the specified number -// in the db named by "dbname". The result will be prefixed with -// "dbname". -extern std::string LogFileName(const std::string& dbname, uint64_t number); - -// Return the name of the sstable with the specified number -// in the db named by "dbname". The result will be prefixed with -// "dbname". 
-extern std::string TableFileName(const std::string& dbname, uint64_t number); - -// Return the name of the large value file with the specified large -// value reference in the db named by "dbname". The result will be -// prefixed with "dbname". -extern std::string LargeValueFileName(const std::string& dbname, - const LargeValueRef& large_ref); - -// Return the name of the descriptor file for the db named by -// "dbname" and the specified incarnation number. The result will be -// prefixed with "dbname". -extern std::string DescriptorFileName(const std::string& dbname, - uint64_t number); - -// Return the name of the current file. This file contains the name -// of the current manifest file. The result will be prefixed with -// "dbname". -extern std::string CurrentFileName(const std::string& dbname); - -// Return the name of the lock file for the db named by -// "dbname". The result will be prefixed with "dbname". -extern std::string LockFileName(const std::string& dbname); - -// Return the name of a temporary file owned by the db named "dbname". -// The result will be prefixed with "dbname". -extern std::string TempFileName(const std::string& dbname, uint64_t number); - -// Return the name of the info log file for "dbname". -extern std::string InfoLogFileName(const std::string& dbname); - -// Return the name of the old info log file for "dbname". -extern std::string OldInfoLogFileName(const std::string& dbname); - -// If filename is a leveldb file, store the type of the file in *type. -// If *type is kLargeValueFile, then the large value reference data -// from the filename is stored in "*large_ref. For all other types of -// files, the number encoded in the filename is stored in *number. If -// the filename was successfully parsed, returns true. Else return -// false. -extern bool ParseFileName(const std::string& filename, - uint64_t* number, - LargeValueRef* large_ref, - FileType* type); - -// Make the CURRENT file point to the descriptor file with the -// specified number. -extern Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number); - - -} - -#endif // STORAGE_LEVELDB_DB_FILENAME_H_ diff --git a/db/filename_test.cc b/db/filename_test.cc deleted file mode 100644 index 4d2a91e..0000000 --- a/db/filename_test.cc +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
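filename.h above is the complete catalogue of on-disk names, and given the "/%06llu.%s" and "/MANIFEST-%06llu" formats in filename.cc the mapping is concrete enough to spell out (database name "mydb" and the numbers are illustrative):

    assert(LogFileName("mydb", 7)        == "mydb/000007.log");
    assert(TableFileName("mydb", 7)      == "mydb/000007.sst");
    assert(DescriptorFileName("mydb", 5) == "mydb/MANIFEST-000005");
    assert(CurrentFileName("mydb")       == "mydb/CURRENT");

    // ParseFileName() inverts the scheme; it takes the bare name,
    // without the "mydb/" prefix:
    uint64_t number;
    LargeValueRef large_ref;
    FileType type;
    assert(ParseFileName("000007.log", &number, &large_ref, &type));
    assert(number == 7 && type == kLogFile);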
- -#include "db/filename.h" - -#include "db/dbformat.h" -#include "port/port.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -class FileNameTest { }; - -TEST(FileNameTest, Parse) { - Slice db; - FileType type; - uint64_t number; - LargeValueRef large_ref; - - // Successful parses - static struct { - const char* fname; - uint64_t number; - const char* large_ref; - FileType type; - } cases[] = { - { "100.log", 100, "", kLogFile }, - { "0.log", 0, "", kLogFile }, - { "0.sst", 0, "", kTableFile }, - { "CURRENT", 0, "", kCurrentFile }, - { "LOCK", 0, "", kDBLockFile }, - { "MANIFEST-2", 2, "", kDescriptorFile }, - { "MANIFEST-7", 7, "", kDescriptorFile }, - { "LOG", 0, "", kInfoLogFile }, - { "LOG.old", 0, "", kInfoLogFile }, - { "18446744073709551615.log", 18446744073709551615ull, "", - kLogFile }, - { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0, - "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile }, - { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0, - "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0", - kLargeValueFile }, - }; - for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { - std::string f = cases[i].fname; - ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f; - ASSERT_EQ(cases[i].type, type) << f; - if (type == kLargeValueFile) { - ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref)) - << f; - } else { - ASSERT_EQ(cases[i].number, number) << f; - } - } - - // Errors - static const char* errors[] = { - "", - "foo", - "foo-dx-100.log", - ".log", - "", - "manifest", - "CURREN", - "CURRENTX", - "MANIFES", - "MANIFEST", - "MANIFEST-", - "XMANIFEST-3", - "MANIFEST-3x", - "LOC", - "LOCKx", - "LO", - "LOGx", - "18446744073709551616.log", - "184467440737095516150.log", - "100", - "100.", - "100.lop", - "100.val", - ".val", - "123456789012345678901234567890123456789-12340.val", - "1234567890123456789012345678901234567-123-0.val", - "12345678901234567890123456789012345678902-100-1-.val", - // Overflow on value size - "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val", - // '03.val' is a bad compression type - "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" }; - for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { - std::string f = errors[i]; - ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f; - }; -} - -TEST(FileNameTest, Construction) { - uint64_t number; - FileType type; - LargeValueRef large_ref; - std::string fname; - - fname = CurrentFileName("foo"); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); - ASSERT_EQ(0, number); - ASSERT_EQ(kCurrentFile, type); - - fname = LockFileName("foo"); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); - ASSERT_EQ(0, number); - ASSERT_EQ(kDBLockFile, type); - - fname = LogFileName("foo", 192); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); - ASSERT_EQ(192, number); - ASSERT_EQ(kLogFile, type); - - fname = TableFileName("bar", 200); - ASSERT_EQ("bar/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); - ASSERT_EQ(200, number); - ASSERT_EQ(kTableFile, type); - - fname = DescriptorFileName("bar", 100); - ASSERT_EQ("bar/", std::string(fname.data(), 4)); - 
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); - ASSERT_EQ(100, number); - ASSERT_EQ(kDescriptorFile, type); - - fname = TempFileName("tmp", 999); - ASSERT_EQ("tmp/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); - ASSERT_EQ(999, number); - ASSERT_EQ(kTempFile, type); - - for (int i = 0; i <= kSnappyCompression; i++) { - CompressionType ctype = static_cast(i); - std::string value = "abcdef"; - LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype); - fname = LargeValueFileName("tmp", real_large_ref); - ASSERT_EQ("tmp/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); - ASSERT_TRUE(real_large_ref == large_ref); - ASSERT_EQ(kLargeValueFile, type); - ASSERT_EQ(large_ref.compression_type(), ctype); - } -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/db/log_format.h b/db/log_format.h deleted file mode 100644 index 137cd4a..0000000 --- a/db/log_format.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Log format information shared by reader and writer. -// See ../doc/log_format.txt for more detail. - -#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ -#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ - -namespace leveldb { -namespace log { - -enum RecordType { - // Zero is reserved for preallocated files - kZeroType = 0, - - kFullType = 1, - - // For fragments - kFirstType = 2, - kMiddleType = 3, - kLastType = 4, -}; -static const int kMaxRecordType = kLastType; - -static const int kBlockSize = 32768; - -// Header is checksum (4 bytes), type (1 byte), length (2 bytes). -static const int kHeaderSize = 4 + 1 + 2; - -} -} - -#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_ diff --git a/db/log_reader.cc b/db/log_reader.cc deleted file mode 100644 index 75e1d28..0000000 --- a/db/log_reader.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
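log_format.h above pins the physical log layout: 32768-byte blocks, each record prefixed by a 7-byte header, and records larger than the space left in a block split into kFirstType/kMiddleType/kLastType fragments. Note the field order the reader and writer below actually use is checksum, then length, then type (the header comment in log_format.h lists type before length, but the code stores length at bytes 4-5 and type at byte 6). A sketch of decoding one header, using DecodeFixed32 from util/coding.h and crc32c::Unmask from util/crc32c.h (DecodeHeader is a hypothetical helper; 'p' must point at kHeaderSize valid bytes):

    // Decode the 7-byte record header at 'p'.
    void DecodeHeader(const char* p,
                      uint32_t* crc, uint32_t* length, unsigned int* type) {
      *crc = crc32c::Unmask(DecodeFixed32(p));                // bytes 0..3
      *length = (static_cast<uint32_t>(p[4]) & 0xff) |
                ((static_cast<uint32_t>(p[5]) & 0xff) << 8);  // bytes 4..5
      *type = static_cast<unsigned char>(p[6]);               // byte 6
      // The payload occupies p[kHeaderSize .. kHeaderSize + *length).
    }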
- -#include "db/log_reader.h" - -#include -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { -namespace log { - -Reader::Reporter::~Reporter() { -} - -Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum) - : file_(file), - reporter_(reporter), - checksum_(checksum), - backing_store_(new char[kBlockSize]), - buffer_(), - eof_(false) { -} - -Reader::~Reader() { - delete[] backing_store_; -} - -bool Reader::ReadRecord(Slice* record, std::string* scratch) { - scratch->clear(); - record->clear(); - bool in_fragmented_record = false; - - Slice fragment; - while (true) { - switch (ReadPhysicalRecord(&fragment)) { - case kFullType: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); - } - scratch->clear(); - *record = fragment; - return true; - - case kFirstType: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); - } - scratch->assign(fragment.data(), fragment.size()); - in_fragmented_record = true; - break; - - case kMiddleType: - if (!in_fragmented_record) { - ReportDrop(fragment.size(), "missing start of fragmented record"); - } else { - scratch->append(fragment.data(), fragment.size()); - } - break; - - case kLastType: - if (!in_fragmented_record) { - ReportDrop(fragment.size(), "missing start of fragmented record"); - } else { - scratch->append(fragment.data(), fragment.size()); - *record = Slice(*scratch); - return true; - } - break; - - case kEof: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); - scratch->clear(); - } - return false; - - case kBadRecord: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "error in middle of record"); - in_fragmented_record = false; - scratch->clear(); - } - break; - - default: - ReportDrop( - (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), - "unknown record type"); - in_fragmented_record = false; - scratch->clear(); - break; - } - } - return false; -} - -void Reader::ReportDrop(size_t bytes, const char* reason) { - if (reporter_ != NULL) { - reporter_->Corruption(bytes, Status::Corruption(reason)); - } -} - -unsigned int Reader::ReadPhysicalRecord(Slice* result) { - while (true) { - if (buffer_.size() < kHeaderSize) { - if (!eof_) { - // Last read was a full read, so this is a trailer to skip - buffer_.clear(); - Status status = file_->Read(kBlockSize, &buffer_, backing_store_); - if (!status.ok()) { - if (reporter_ != NULL) { - reporter_->Corruption(kBlockSize, status); - } - buffer_.clear(); - eof_ = true; - return kEof; - } else if (buffer_.size() < kBlockSize) { - eof_ = true; - } - continue; - } else if (buffer_.size() == 0) { - // End of file - return kEof; - } else { - ReportDrop(buffer_.size(), "truncated record at end of file"); - buffer_.clear(); - return kEof; - } - } - - // Parse the header - const char* header = buffer_.data(); - const uint32_t a = static_cast(header[4]) & 0xff; - const uint32_t b = static_cast(header[5]) & 0xff; - const unsigned int type = header[6]; - const uint32_t length = a | (b << 8); - if (kHeaderSize + length > buffer_.size()) { - ReportDrop(buffer_.size(), "bad record length"); - buffer_.clear(); - return kBadRecord; - } - - // Check crc - if (checksum_) { - if (type == kZeroType && length == 0) { - // Skip zero length record without reporting any drops since - // such records are produced by the mmap based writing code in - // env_posix.cc that preallocates file regions. 
- buffer_.clear(); - return kBadRecord; - } - - uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); - uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); - if (actual_crc != expected_crc) { - // Drop the rest of the buffer since "length" itself may have - // been corrupted and if we trust it, we could find some - // fragment of a real log record that just happens to look - // like a valid log record. - ReportDrop(buffer_.size(), "checksum mismatch"); - buffer_.clear(); - return kBadRecord; - } - } - - buffer_.remove_prefix(kHeaderSize + length); - *result = Slice(header + kHeaderSize, length); - return type; - } -} - -} -} diff --git a/db/log_reader.h b/db/log_reader.h deleted file mode 100644 index baf1475..0000000 --- a/db/log_reader.h +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ -#define STORAGE_LEVELDB_DB_LOG_READER_H_ - -#include "db/log_format.h" -#include "leveldb/slice.h" -#include "leveldb/status.h" - -namespace leveldb { - -class SequentialFile; - -namespace log { - -class Reader { - public: - // Interface for reporting errors. - class Reporter { - public: - virtual ~Reporter(); - - // Some corruption was detected. "size" is the approximate number - // of bytes dropped due to the corruption. - virtual void Corruption(size_t bytes, const Status& status) = 0; - }; - - // Create a reader that will return log records from "*file". - // "*file" must remain live while this Reader is in use. - // - // If "reporter" is non-NULL, it is notified whenever some data is - // dropped due to a detected corruption. "*reporter" must remain - // live while this Reader is in use. - // - // If "checksum" is true, verify checksums if available. - Reader(SequentialFile* file, Reporter* reporter, bool checksum); - - ~Reader(); - - // Read the next record into *record. Returns true if read - // successfully, false if we hit end of the input. May use - // "*scratch" as temporary storage. The contents filled in *record - // will only be valid until the next mutating operation on this - // reader or the next mutation to *scratch. - bool ReadRecord(Slice* record, std::string* scratch); - - private: - SequentialFile* const file_; - Reporter* const reporter_; - bool const checksum_; - char* const backing_store_; - Slice buffer_; - bool eof_; // Last Read() indicated EOF by returning < kBlockSize - - // Extend record types with the following special values - enum { - kEof = kMaxRecordType + 1, - kBadRecord = kMaxRecordType + 2 - }; - - // Return type, or one of the preceding special values - unsigned int ReadPhysicalRecord(Slice* result); - void ReportDrop(size_t bytes, const char* reason); - - // No copying allowed - Reader(const Reader&); - void operator=(const Reader&); -}; - -} -} - -#endif // STORAGE_LEVELDB_DB_LOG_READER_H_ diff --git a/db/log_test.cc b/db/log_test.cc deleted file mode 100644 index 025a5ff..0000000 --- a/db/log_test.cc +++ /dev/null @@ -1,361 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
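log_reader.h above is the entire read-side surface: construct a Reader over a SequentialFile and pull records until ReadRecord() returns false. A minimal consumer, assuming an Env* env and a file name fname exist in scope (error handling beyond the open elided):

    SequentialFile* file;
    Status s = env->NewSequentialFile(fname, &file);
    if (s.ok()) {
      log::Reader reader(file, NULL, true /*verify checksums*/);
      Slice record;
      std::string scratch;
      while (reader.ReadRecord(&record, &scratch)) {
        // 'record' is valid only until the next ReadRecord() or a
        // mutation of 'scratch'; copy it out if it must live longer.
      }
      delete file;
    }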
- -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { -namespace log { - -// Construct a string of the specified length made out of the supplied -// partial string. -static std::string BigString(const std::string& partial_string, size_t n) { - std::string result; - while (result.size() < n) { - result.append(partial_string); - } - result.resize(n); - return result; -} - -// Construct a string from a number -static std::string NumberString(int n) { - char buf[50]; - snprintf(buf, sizeof(buf), "%d.", n); - return std::string(buf); -} - -// Return a skewed potentially long string -static std::string RandomSkewedString(int i, Random* rnd) { - return BigString(NumberString(i), rnd->Skewed(17)); -} - -class LogTest { - private: - class StringDest : public WritableFile { - public: - std::string contents_; - - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } - virtual Status Append(const Slice& slice) { - contents_.append(slice.data(), slice.size()); - return Status::OK(); - } - }; - - class StringSource : public SequentialFile { - public: - Slice contents_; - bool force_error_; - bool returned_partial_; - StringSource() : force_error_(false), returned_partial_(false) { } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; - ASSERT_EQ(kBlockSize, n); - - if (force_error_) { - force_error_ = false; - returned_partial_ = true; - return Status::Corruption("read error"); - } - - if (contents_.size() < n) { - n = contents_.size(); - returned_partial_ = true; - } - *result = Slice(contents_.data(), n); - contents_.remove_prefix(n); - return Status::OK(); - } - }; - - class ReportCollector : public Reader::Reporter { - public: - size_t dropped_bytes_; - std::string message_; - - ReportCollector() : dropped_bytes_(0) { } - virtual void Corruption(size_t bytes, const Status& status) { - dropped_bytes_ += bytes; - message_.append(status.ToString()); - } - }; - - StringDest dest_; - StringSource source_; - ReportCollector report_; - bool reading_; - Writer writer_; - Reader reader_; - - public: - LogTest() : reading_(false), - writer_(&dest_), - reader_(&source_, &report_, true/*checksum*/) { - } - - void Write(const std::string& msg) { - ASSERT_TRUE(!reading_) << "Write() after starting to read"; - writer_.AddRecord(Slice(msg)); - } - - size_t WrittenBytes() const { - return dest_.contents_.size(); - } - - std::string Read() { - if (!reading_) { - reading_ = true; - source_.contents_ = Slice(dest_.contents_); - } - std::string scratch; - Slice record; - if (reader_.ReadRecord(&record, &scratch)) { - return record.ToString(); - } else { - return "EOF"; - } - } - - void IncrementByte(int offset, int delta) { - dest_.contents_[offset] += delta; - } - - void SetByte(int offset, char new_byte) { - dest_.contents_[offset] = new_byte; - } - - void ShrinkSize(int bytes) { - dest_.contents_.resize(dest_.contents_.size() - bytes); - } - - void FixChecksum(int header_offset, int len) { - // Compute crc of type/len/data - uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len); - crc = crc32c::Mask(crc); - EncodeFixed32(&dest_.contents_[header_offset], crc); - } - - void ForceError() { - source_.force_error_ = true; - } - - size_t DroppedBytes() const { - return 
report_.dropped_bytes_; - } - - // Returns OK iff recorded error message contains "msg" - std::string MatchError(const std::string& msg) const { - if (report_.message_.find(msg) == std::string::npos) { - return report_.message_; - } else { - return "OK"; - } - } -}; - -TEST(LogTest, Empty) { - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, ReadWrite) { - Write("foo"); - Write("bar"); - Write(""); - Write("xxxx"); - ASSERT_EQ("foo", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("xxxx", Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ("EOF", Read()); // Make sure reads at eof work -} - -TEST(LogTest, ManyBlocks) { - for (int i = 0; i < 100000; i++) { - Write(NumberString(i)); - } - for (int i = 0; i < 100000; i++) { - ASSERT_EQ(NumberString(i), Read()); - } - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, Fragmentation) { - Write("small"); - Write(BigString("medium", 50000)); - Write(BigString("large", 100000)); - ASSERT_EQ("small", Read()); - ASSERT_EQ(BigString("medium", 50000), Read()); - ASSERT_EQ(BigString("large", 100000), Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, MarginalTrailer) { - // Make a trailer that is exactly the same length as an empty record. - const int n = kBlockSize - 2*kHeaderSize; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes()); - Write(""); - Write("bar"); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, ShortTrailer) { - const int n = kBlockSize - 2*kHeaderSize + 4; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); - Write(""); - Write("bar"); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, AlignedEof) { - const int n = kBlockSize - 2*kHeaderSize + 4; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, RandomRead) { - const int N = 500; - Random write_rnd(301); - for (int i = 0; i < N; i++) { - Write(RandomSkewedString(i, &write_rnd)); - } - Random read_rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); - } - ASSERT_EQ("EOF", Read()); -} - -// Tests of all the error paths in log_reader.cc follow: - -TEST(LogTest, ReadError) { - Write("foo"); - ForceError(); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kBlockSize, DroppedBytes()); - ASSERT_EQ("OK", MatchError("read error")); -} - -TEST(LogTest, BadRecordType) { - Write("foo"); - // Type is stored in header[6] - IncrementByte(6, 100); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("unknown record type")); -} - -TEST(LogTest, TruncatedTrailingRecord) { - Write("foo"); - ShrinkSize(4); // Drop all payload as well as a header byte - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kHeaderSize - 1, DroppedBytes()); - ASSERT_EQ("OK", MatchError("truncated record at end of file")); -} - -TEST(LogTest, BadLength) { - Write("foo"); - ShrinkSize(1); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kHeaderSize + 2, DroppedBytes()); - ASSERT_EQ("OK", MatchError("bad record length")); -} - -TEST(LogTest, ChecksumMismatch) { - Write("foo"); - IncrementByte(0, 10); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(10, DroppedBytes()); - ASSERT_EQ("OK", MatchError("checksum mismatch")); -} - -TEST(LogTest, UnexpectedMiddleType) { - 
Write("foo"); - SetByte(6, kMiddleType); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("missing start")); -} - -TEST(LogTest, UnexpectedLastType) { - Write("foo"); - SetByte(6, kLastType); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("missing start")); -} - -TEST(LogTest, UnexpectedFullType) { - Write("foo"); - Write("bar"); - SetByte(6, kFirstType); - FixChecksum(0, 3); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("partial record without end")); -} - -TEST(LogTest, UnexpectedFirstType) { - Write("foo"); - Write(BigString("bar", 100000)); - SetByte(6, kFirstType); - FixChecksum(0, 3); - ASSERT_EQ(BigString("bar", 100000), Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("partial record without end")); -} - -TEST(LogTest, ErrorJoinsRecords) { - // Consider two fragmented records: - // first(R1) last(R1) first(R2) last(R2) - // where the middle two fragments disappear. We do not want - // first(R1),last(R2) to get joined and returned as a valid record. - - // Write records that span two blocks - Write(BigString("foo", kBlockSize)); - Write(BigString("bar", kBlockSize)); - Write("correct"); - - // Wipe the middle block - for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) { - SetByte(offset, 'x'); - } - - ASSERT_EQ("correct", Read()); - ASSERT_EQ("EOF", Read()); - const int dropped = DroppedBytes(); - ASSERT_LE(dropped, 2*kBlockSize + 100); - ASSERT_GE(dropped, 2*kBlockSize); -} - -} -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/db/log_writer.cc b/db/log_writer.cc deleted file mode 100644 index 18ca37a..0000000 --- a/db/log_writer.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/log_writer.h" - -#include -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { -namespace log { - -Writer::Writer(WritableFile* dest) - : dest_(dest), - block_offset_(0) { - for (int i = 0; i <= kMaxRecordType; i++) { - char t = static_cast(i); - type_crc_[i] = crc32c::Value(&t, 1); - } -} - -Writer::~Writer() { -} - -Status Writer::AddRecord(const Slice& slice) { - const char* ptr = slice.data(); - size_t left = slice.size(); - - // Fragment the record if necessary and emit it. Note that if slice - // is empty, we still want to iterate once to emit a single - // zero-length record - Status s; - do { - const int leftover = kBlockSize - block_offset_; - assert(leftover >= 0); - if (leftover < kHeaderSize) { - // Switch to a new block - if (leftover > 0) { - // Fill the trailer (literal below relies on kHeaderSize being 7) - assert(kHeaderSize == 7); - dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); - } - block_offset_ = 0; - } - - // Invariant: we never leave < kHeaderSize bytes in a block. - const int avail = kBlockSize - block_offset_ - kHeaderSize; - assert(avail >= 0); - - const size_t fragment_length = (left < avail) ? 
left : avail; - - RecordType type; - const bool begin = (ptr == slice.data()); - const bool end = (left == fragment_length); - if (begin && end) { - type = kFullType; - } else if (begin) { - type = kFirstType; - } else if (end) { - type = kLastType; - } else { - type = kMiddleType; - } - - s = EmitPhysicalRecord(type, ptr, fragment_length); - ptr += fragment_length; - left -= fragment_length; - } while (s.ok() && left > 0); - return s; -} - -Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { - assert(n <= 0xffff); // Must fit in two bytes - assert(block_offset_ + kHeaderSize + n <= kBlockSize); - - // Format the header - char buf[kHeaderSize]; - buf[4] = static_cast(n & 0xff); - buf[5] = static_cast(n >> 8); - buf[6] = static_cast(t); - - // Compute the crc of the record type and the payload. - uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); - crc = crc32c::Mask(crc); // Adjust for storage - EncodeFixed32(buf, crc); - - // Write the header and the payload - Status s = dest_->Append(Slice(buf, kHeaderSize)); - if (s.ok()) { - s = dest_->Append(Slice(ptr, n)); - if (s.ok()) { - s = dest_->Flush(); - } - } - block_offset_ += kHeaderSize + n; - return s; -} - -} -} diff --git a/db/log_writer.h b/db/log_writer.h deleted file mode 100644 index d3cf27d..0000000 --- a/db/log_writer.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_ -#define STORAGE_LEVELDB_DB_LOG_WRITER_H_ - -#include -#include "db/log_format.h" -#include "leveldb/slice.h" -#include "leveldb/status.h" - -namespace leveldb { - -class WritableFile; - -namespace log { - -class Writer { - public: - // Create a writer that will append data to "*dest". - // "*dest" must be initially empty. - // "*dest" must remain live while this Writer is in use. - explicit Writer(WritableFile* dest); - ~Writer(); - - Status AddRecord(const Slice& slice); - - private: - WritableFile* dest_; - int block_offset_; // Current offset in block - - // crc32c values for all supported record types. These are - // pre-computed to reduce the overhead of computing the crc of the - // record type stored in the header. - uint32_t type_crc_[kMaxRecordType + 1]; - - Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); - - // No copying allowed - Writer(const Writer&); - void operator=(const Writer&); -}; - -} -} - -#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_ diff --git a/db/memtable.cc b/db/memtable.cc deleted file mode 100644 index a3b618a..0000000 --- a/db/memtable.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
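The write side above is symmetric to the reader: one Writer per WritableFile, one AddRecord() per logical record, with fragmentation across blocks and the per-record crc32c handled internally (the crc is masked before storage so that a crc computed over data that itself contains stored crcs stays well behaved). A minimal producer, again assuming an Env* env and a file name fname:

    WritableFile* file;
    Status s = env->NewWritableFile(fname, &file);
    if (s.ok()) {
      log::Writer writer(file);                   // dest must start empty
      s = writer.AddRecord(Slice("hello, log"));  // fragments as needed
      if (s.ok()) s = file->Close();
      delete file;
    }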
- -#include "db/memtable.h" -#include "db/dbformat.h" -#include "leveldb/comparator.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "util/coding.h" - -namespace leveldb { - -static Slice GetLengthPrefixedSlice(const char* data) { - uint32_t len; - const char* p = data; - p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted - return Slice(p, len); -} - -MemTable::MemTable(const InternalKeyComparator& cmp) - : comparator_(cmp), - table_(comparator_, &arena_) { -} - -MemTable::~MemTable() { -} - -size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); } - -int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) - const { - // Internal keys are encoded as length-prefixed strings. - Slice a = GetLengthPrefixedSlice(aptr); - Slice b = GetLengthPrefixedSlice(bptr); - return comparator.Compare(a, b); -} - -// Encode a suitable internal key target for "target" and return it. -// Uses *scratch as scratch space, and the returned pointer will point -// into this scratch space. -static const char* EncodeKey(std::string* scratch, const Slice& target) { - scratch->clear(); - PutVarint32(scratch, target.size()); - scratch->append(target.data(), target.size()); - return scratch->data(); -} - -class MemTableIterator: public Iterator { - public: - explicit MemTableIterator(MemTable::Table* table) { - iter_ = new MemTable::Table::Iterator(table); - } - virtual ~MemTableIterator() { delete iter_; } - - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } - virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); } - virtual Slice value() const { - Slice key_slice = GetLengthPrefixedSlice(iter_->key()); - return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); - } - - virtual Status status() const { return Status::OK(); } - - private: - MemTable::Table::Iterator* iter_; - std::string tmp_; // For passing to EncodeKey - - // No copying allowed - MemTableIterator(const MemTableIterator&); - void operator=(const MemTableIterator&); -}; - -Iterator* MemTable::NewIterator() { - return new MemTableIterator(&table_); -} - -void MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, - const Slice& value) { - // Format of an entry is concatenation of: - // key_size : varint32 of internal_key.size() - // key bytes : char[internal_key.size()] - // value_size : varint32 of value.size() - // value bytes : char[value.size()] - size_t key_size = key.size(); - size_t val_size = value.size(); - size_t internal_key_size = key_size + 8; - const size_t encoded_len = - VarintLength(internal_key_size) + internal_key_size + - VarintLength(val_size) + val_size; - char* buf = arena_.Allocate(encoded_len); - char* p = EncodeVarint32(buf, internal_key_size); - memcpy(p, key.data(), key_size); - p += key_size; - EncodeFixed64(p, (s << 8) | type); - p += 8; - p = EncodeVarint32(p, val_size); - memcpy(p, value.data(), val_size); - assert((p + val_size) - buf == encoded_len); - table_.Insert(buf); -} - -} diff --git a/db/memtable.h b/db/memtable.h deleted file mode 100644 index 45b3342..0000000 --- a/db/memtable.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
-#define STORAGE_LEVELDB_DB_MEMTABLE_H_
-
-#include <string>
-#include "leveldb/db.h"
-#include "db/dbformat.h"
-#include "db/skiplist.h"
-#include "util/arena.h"
-
-namespace leveldb {
-
-class InternalKeyComparator;
-class Mutex;
-class MemTableIterator;
-
-class MemTable {
- public:
-  explicit MemTable(const InternalKeyComparator& comparator);
-  ~MemTable();
-
-  // Returns an estimate of the number of bytes of data in use by this
-  // data structure.
-  //
-  // REQUIRES: external synchronization to prevent simultaneous
-  // operations on the same MemTable.
-  size_t ApproximateMemoryUsage();
-
-  // Return an iterator that yields the contents of the memtable.
-  //
-  // The caller must ensure that the underlying MemTable remains live
-  // while the returned iterator is live.  The keys returned by this
-  // iterator are internal keys encoded by AppendInternalKey in the
-  // db/format.{h,cc} module.
-  Iterator* NewIterator();
-
-  // Add an entry into memtable that maps key to value at the
-  // specified sequence number and with the specified type.
-  // Typically value will be empty if type==kTypeDeletion.
-  void Add(SequenceNumber seq, ValueType type,
-           const Slice& key,
-           const Slice& value);
-
- private:
-  struct KeyComparator {
-    const InternalKeyComparator comparator;
-    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
-    int operator()(const char* a, const char* b) const;
-  };
-  friend class MemTableIterator;
-  friend class MemTableBackwardIterator;
-
-  typedef SkipList<const char*, KeyComparator> Table;
-
-  KeyComparator comparator_;
-  Arena arena_;
-  Table table_;
-
-  // No copying allowed
-  MemTable(const MemTable&);
-  void operator=(const MemTable&);
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_DB_MEMTABLE_H_
diff --git a/db/repair.cc b/db/repair.cc
deleted file mode 100644
index 014e00e..0000000
--- a/db/repair.cc
+++ /dev/null
@@ -1,396 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// We recover the contents of the descriptor from the other files we find.
-// (1) Any log files are first converted to tables
-// (2) We scan every table to compute
-//     (a) smallest/largest for the table
-//     (b) large value refs from the table
-//     (c) largest sequence number in the table
-// (3) We generate descriptor contents:
-//      - log number is set to zero
-//      - next-file-number is set to 1 + largest file number we found
-//      - last-sequence-number is set to largest sequence# found across
-//        all tables (see 2c)
-//      - compaction pointers are cleared
-//      - every table file is added at level 0
-//
-// Possible optimization 1:
-//   (a) Compute total size and use to pick appropriate max-level M
-//   (b) Sort tables by largest sequence# in the table
-//   (c) For each table: if it overlaps earlier table, place in level-0,
-//       else place in level-M.
-// Possible optimization 2:
-//   Store per-table metadata (smallest, largest, largest-seq#,
-//   large-value-refs, ...) in the table's meta section to speed up
-//   ScanTable.
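The strategy comment above describes everything the Repairer does on a caller's behalf; from the outside, the whole machinery is the single RepairDB() call defined at the bottom of this file. A minimal sketch of invoking it (the path is illustrative; options.comparator must match the comparator the database was written with, since the Repairer rebuilds ordering metadata through it):

    #include <stdio.h>
    #include "leveldb/db.h"

    leveldb::Options options;            // default Env, default comparator
    leveldb::Status s = leveldb::RepairDB("/tmp/mydb", options);
    if (!s.ok()) {
      fprintf(stderr, "repair failed: %s\n", s.ToString().c_str());
    }

Note the cost stated above: every recovered table lands in level 0, so the first compactions after a repair can be heavy.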
- -#include "db/builder.h" -#include "db/db_impl.h" -#include "db/dbformat.h" -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "db/version_edit.h" -#include "db/write_batch_internal.h" -#include "leveldb/comparator.h" -#include "leveldb/db.h" -#include "leveldb/env.h" - -namespace leveldb { - -namespace { - -class Repairer { - public: - Repairer(const std::string& dbname, const Options& options) - : dbname_(dbname), - env_(options.env), - icmp_(options.comparator), - options_(SanitizeOptions(dbname, &icmp_, options)), - owns_info_log_(options_.info_log != options.info_log), - next_file_number_(1) { - // TableCache can be small since we expect each table to be opened once. - table_cache_ = new TableCache(dbname_, &options_, 10); - } - - ~Repairer() { - delete table_cache_; - if (owns_info_log_) { - delete options_.info_log; - } - } - - Status Run() { - Status status = FindFiles(); - if (status.ok()) { - ConvertLogFilesToTables(); - ExtractMetaData(); - status = WriteDescriptor(); - } - if (status.ok()) { - unsigned long long bytes = 0; - for (int i = 0; i < tables_.size(); i++) { - bytes += tables_[i].meta.file_size; - } - Log(env_, options_.info_log, - "**** Repaired leveldb %s; " - "recovered %d files; %llu bytes. " - "Some data may have been lost. " - "****", - dbname_.c_str(), - static_cast(tables_.size()), - bytes); - } - return status; - } - - private: - struct TableInfo { - FileMetaData meta; - SequenceNumber max_sequence; - }; - - std::string const dbname_; - Env* const env_; - InternalKeyComparator const icmp_; - Options const options_; - bool owns_info_log_; - TableCache* table_cache_; - VersionEdit edit_; - - std::vector manifests_; - std::vector table_numbers_; - std::vector logs_; - std::vector tables_; - uint64_t next_file_number_; - - Status FindFiles() { - std::vector filenames; - Status status = env_->GetChildren(dbname_, &filenames); - if (!status.ok()) { - return status; - } - if (filenames.empty()) { - return Status::IOError(dbname_, "repair found no files"); - } - - uint64_t number; - LargeValueRef large_ref; - FileType type; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &large_ref, &type)) { - if (type == kLargeValueFile) { - // Will be picked up when we process a Table that points to it - } else if (type == kDescriptorFile) { - manifests_.push_back(filenames[i]); - } else { - if (number + 1 > next_file_number_) { - next_file_number_ = number + 1; - } - if (type == kLogFile) { - logs_.push_back(number); - } else if (type == kTableFile) { - table_numbers_.push_back(number); - } else { - // Ignore other files - } - } - } - } - return status; - } - - void ConvertLogFilesToTables() { - for (int i = 0; i < logs_.size(); i++) { - std::string logname = LogFileName(dbname_, logs_[i]); - Status status = ConvertLogToTable(logs_[i]); - if (!status.ok()) { - Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s", - (unsigned long long) logs_[i], - status.ToString().c_str()); - } - ArchiveFile(logname); - } - } - - Status ConvertLogToTable(uint64_t log) { - struct LogReporter : public log::Reader::Reporter { - Env* env; - WritableFile* info_log; - uint64_t lognum; - virtual void Corruption(size_t bytes, const Status& s) { - // We print error messages for corruption, but continue repairing. 
-        Log(env, info_log, "Log #%llu: dropping %d bytes; %s",
-            (unsigned long long) lognum,
-            static_cast<int>(bytes),
-            s.ToString().c_str());
-      }
-    };
-
-    // Open the log file
-    std::string logname = LogFileName(dbname_, log);
-    SequentialFile* lfile;
-    Status status = env_->NewSequentialFile(logname, &lfile);
-    if (!status.ok()) {
-      return status;
-    }
-
-    // Create the log reader.
-    LogReporter reporter;
-    reporter.env = env_;
-    reporter.info_log = options_.info_log;
-    reporter.lognum = log;
-    // Note that checksum verification is disabled here: records whose
-    // checksums no longer match are still read and converted rather than
-    // skipped; structurally bad records are dropped by the size check below.
-    log::Reader reader(lfile, &reporter, false/*do not checksum*/);
-
-    // Read all the records and add to a memtable
-    std::string scratch;
-    Slice record;
-    WriteBatch batch;
-    MemTable mem(icmp_);
-    int counter = 0;
-    while (reader.ReadRecord(&record, &scratch)) {
-      if (record.size() < 12) {
-        // 12 == WriteBatch header: 8-byte sequence number + 4-byte count.
-        reporter.Corruption(
-            record.size(), Status::Corruption("log record too small"));
-        continue;
-      }
-      WriteBatchInternal::SetContents(&batch, record);
-      status = WriteBatchInternal::InsertInto(&batch, &mem);
-      if (status.ok()) {
-        counter += WriteBatchInternal::Count(&batch);
-      } else {
-        Log(env_, options_.info_log, "Log #%llu: ignoring %s",
-            (unsigned long long) log,
-            status.ToString().c_str());
-        status = Status::OK();  // Keep going with rest of file
-      }
-    }
-    delete lfile;
-
-    // We ignore any version edits generated by the conversion to a Table
-    // since ExtractMetaData() will also generate edits.
-    VersionEdit skipped;
-    FileMetaData meta;
-    meta.number = next_file_number_++;
-    Iterator* iter = mem.NewIterator();
-    status = BuildTable(dbname_, env_, options_, table_cache_, iter,
-                        &meta, &skipped);
-    delete iter;
-    if (status.ok()) {
-      if (meta.file_size > 0) {
-        table_numbers_.push_back(meta.number);
-      }
-    }
-    Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
-        (unsigned long long) log,
-        counter,
-        (unsigned long long) meta.number,
-        status.ToString().c_str());
-    return status;
-  }
-
-  void ExtractMetaData() {
-    std::vector<TableInfo> kept;
-    for (int i = 0; i < table_numbers_.size(); i++) {
-      TableInfo t;
-      t.meta.number = table_numbers_[i];
-      Status status = ScanTable(&t);
-      if (!status.ok()) {
-        std::string fname = TableFileName(dbname_, table_numbers_[i]);
-        Log(env_, options_.info_log, "Table #%llu: ignoring %s",
-            (unsigned long long) table_numbers_[i],
-            status.ToString().c_str());
-        ArchiveFile(fname);
-      } else {
-        tables_.push_back(t);
-      }
-    }
-  }
-
-  Status ScanTable(TableInfo* t) {
-    std::string fname = TableFileName(dbname_, t->meta.number);
-    int counter = 0;
-    Status status = env_->GetFileSize(fname, &t->meta.file_size);
-    if (status.ok()) {
-      Iterator* iter = table_cache_->NewIterator(
-          ReadOptions(), t->meta.number, t->meta.file_size);
-      bool empty = true;
-      ParsedInternalKey parsed;
-      t->max_sequence = 0;
-      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-        Slice key = iter->key();
-        if (!ParseInternalKey(key, &parsed)) {
-          Log(env_, options_.info_log, "Table #%llu: unparsable key %s",
-              (unsigned long long) t->meta.number,
-              EscapeString(key).c_str());
-          continue;
-        }
-
-        counter++;
-        if (empty) {
-          empty = false;
-          t->meta.smallest.DecodeFrom(key);
-        }
-        t->meta.largest.DecodeFrom(key);
-        if (parsed.sequence > t->max_sequence) {
-          t->max_sequence = parsed.sequence;
-        }
-
-        if (ExtractValueType(key) == kTypeLargeValueRef) {
-          if (iter->value().size() !=
LargeValueRef::ByteSize()) { - Log(env_, options_.info_log, "Table #%llu: bad large value ref", - (unsigned long long) t->meta.number); - } else { - edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()), - t->meta.number, - key); - } - } - } - if (!iter->status().ok()) { - status = iter->status(); - } - delete iter; - } - Log(env_, options_.info_log, "Table #%llu: %d entries %s", - (unsigned long long) t->meta.number, - counter, - status.ToString().c_str()); - return status; - } - - Status WriteDescriptor() { - std::string tmp = TempFileName(dbname_, 1); - WritableFile* file; - Status status = env_->NewWritableFile(tmp, &file); - if (!status.ok()) { - return status; - } - - SequenceNumber max_sequence = 0; - for (int i = 0; i < tables_.size(); i++) { - if (max_sequence < tables_[i].max_sequence) { - max_sequence = tables_[i].max_sequence; - } - } - - edit_.SetComparatorName(icmp_.user_comparator()->Name()); - edit_.SetLogNumber(0); - edit_.SetNextFile(next_file_number_); - edit_.SetLastSequence(max_sequence); - - for (int i = 0; i < tables_.size(); i++) { - // TODO(opt): separate out into multiple levels - const TableInfo& t = tables_[i]; - edit_.AddFile(0, t.meta.number, t.meta.file_size, - t.meta.smallest, t.meta.largest); - } - - //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); - { - log::Writer log(file); - std::string record; - edit_.EncodeTo(&record); - status = log.AddRecord(record); - } - if (status.ok()) { - status = file->Close(); - } - delete file; - file = NULL; - - if (!status.ok()) { - env_->DeleteFile(tmp); - } else { - // Discard older manifests - for (int i = 0; i < manifests_.size(); i++) { - ArchiveFile(dbname_ + "/" + manifests_[i]); - } - - // Install new manifest - status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); - if (status.ok()) { - status = SetCurrentFile(env_, dbname_, 1); - } else { - env_->DeleteFile(tmp); - } - } - return status; - } - - void ArchiveFile(const std::string& fname) { - // Move into another directory. E.g., for - // dir/foo - // rename to - // dir/lost/foo - const char* slash = strrchr(fname.c_str(), '/'); - std::string new_dir; - if (slash != NULL) { - new_dir.assign(fname.data(), slash - fname.data()); - } - new_dir.append("/lost"); - env_->CreateDir(new_dir); // Ignore error - std::string new_file = new_dir; - new_file.append("/"); - new_file.append((slash == NULL) ? fname.c_str() : slash + 1); - Status s = env_->RenameFile(fname, new_file); - Log(env_, options_.info_log, "Archiving %s: %s\n", - fname.c_str(), s.ToString().c_str()); - } -}; -} - -Status RepairDB(const std::string& dbname, const Options& options) { - Repairer repairer(dbname, options); - return repairer.Run(); -} - -} diff --git a/db/skiplist.h b/db/skiplist.h deleted file mode 100644 index be39354..0000000 --- a/db/skiplist.h +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Thread safety -// ------------- -// -// Writes require external synchronization, most likely a mutex. -// Reads require a guarantee that the SkipList will not be destroyed -// while the read is in progress. Apart from that, reads progress -// without any internal locking or synchronization. -// -// Invariants: -// -// (1) Allocated nodes are never deleted until the SkipList is -// destroyed. 
This is trivially guaranteed by the code since we
-// never delete any skip list nodes.
-//
-// (2) The contents of a Node except for the next/prev pointers are
-// immutable after the Node has been linked into the SkipList.
-// Only Insert() modifies the list, and it is careful to initialize
-// a node and use release-stores to publish the nodes in one or
-// more lists.
-//
-// ... prev vs. next pointer ordering ...
-
-#include <assert.h>
-#include <stdlib.h>
-#include "port/port.h"
-#include "util/arena.h"
-#include "util/random.h"
-
-namespace leveldb {
-
-class Arena;
-
-template<typename Key, class Comparator>
-class SkipList {
- private:
-  struct Node;
-
- public:
-  // Create a new SkipList object that will use "cmp" for comparing keys,
-  // and will allocate memory using "*arena".  Objects allocated in the arena
-  // must remain allocated for the lifetime of the skiplist object.
-  explicit SkipList(Comparator cmp, Arena* arena);
-
-  // Insert key into the list.
-  // REQUIRES: nothing that compares equal to key is currently in the list.
-  void Insert(const Key& key);
-
-  // Returns true iff an entry that compares equal to key is in the list.
-  bool Contains(const Key& key) const;
-
-  // Iteration over the contents of a skip list
-  class Iterator {
-   public:
-    // Initialize an iterator over the specified list.
-    // The returned iterator is not valid.
-    explicit Iterator(const SkipList* list);
-
-    // Returns true iff the iterator is positioned at a valid node.
-    bool Valid() const;
-
-    // Returns the key at the current position.
-    // REQUIRES: Valid()
-    const Key& key() const;
-
-    // Advances to the next position.
-    // REQUIRES: Valid()
-    void Next();
-
-    // Advances to the previous position.
-    // REQUIRES: Valid()
-    void Prev();
-
-    // Advance to the first entry with a key >= target
-    void Seek(const Key& target);
-
-    // Position at the first entry in list.
-    // Final state of iterator is Valid() iff list is not empty.
-    void SeekToFirst();
-
-    // Position at the last entry in list.
-    // Final state of iterator is Valid() iff list is not empty.
-    void SeekToLast();
-
-   private:
-    const SkipList* list_;
-    Node* node_;
-    // Intentionally copyable
-  };
-
- private:
-  enum { kMaxHeight = 12 };
-
-  // Immutable after construction
-  Comparator const compare_;
-  Arena* const arena_;    // Arena used for allocations of nodes
-
-  Node* const head_;
-
-  // Modified only by Insert().  Read racily by readers, but stale
-  // values are ok.
-  port::AtomicPointer max_height_;   // Height of the entire list
-
-  inline int GetMaxHeight() const {
-    return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load());
-  }
-
-  // Read/written only by Insert().
-  Random rnd_;
-
-  Node* NewNode(const Key& key, int height);
-  int RandomHeight();
-  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
-
-  // Return true if key is greater than the data stored in "n"
-  bool KeyIsAfterNode(const Key& key, Node* n) const;
-
-  // Return the earliest node that comes at or after key.
-  // Return NULL if there is no such node.
-  //
-  // If prev is non-NULL, fills prev[level] with pointer to previous
-  // node at "level" for every level in [0..max_height_-1].
-  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
-
-  // Return the latest node with a key < key.
-  // Return head_ if there is no such node.
-  Node* FindLessThan(const Key& key) const;
-
-  // Return the last node in the list.
-  // Return head_ if list is empty.
-  Node* FindLast() const;
-
-  // No copying allowed
-  SkipList(const SkipList&);
-  void operator=(const SkipList&);
-};
-
-// Implementation details follow
-template<typename Key, class Comparator>
-struct SkipList<Key,Comparator>::Node {
-  explicit Node(const Key& k) : key(k) { }
-
-  Key const key;
-
-  // Accessors/mutators for links.  Wrapped in methods so we can
-  // add the appropriate barriers as necessary.
-  Node* Next(int n) {
-    assert(n >= 0);
-    // Use an 'acquire load' so that we observe a fully initialized
-    // version of the returned Node.
-    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
-  }
-  void SetNext(int n, Node* x) {
-    assert(n >= 0);
-    // Use a 'release store' so that anybody who reads through this
-    // pointer observes a fully initialized version of the inserted node.
-    next_[n].Release_Store(x);
-  }
-
-  // No-barrier variants that can be safely used in a few locations.
-  Node* NoBarrier_Next(int n) {
-    assert(n >= 0);
-    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
-  }
-  void NoBarrier_SetNext(int n, Node* x) {
-    assert(n >= 0);
-    next_[n].NoBarrier_Store(x);
-  }
-
- private:
-  // Array of length equal to the node height.  next_[0] is lowest level link.
-  port::AtomicPointer next_[1];
-};
-
-template<typename Key, class Comparator>
-typename SkipList<Key,Comparator>::Node*
-SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
-  char* mem = arena_->AllocateAligned(
-      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
-  return new (mem) Node(key);
-}
-
-template<typename Key, class Comparator>
-inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
-  list_ = list;
-  node_ = NULL;
-}
-
-template<typename Key, class Comparator>
-inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
-  return node_ != NULL;
-}
-
-template<typename Key, class Comparator>
-inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
-  assert(Valid());
-  return node_->key;
-}
-
-template<typename Key, class Comparator>
-inline void SkipList<Key,Comparator>::Iterator::Next() {
-  assert(Valid());
-  node_ = node_->Next(0);
-}
-
-template<typename Key, class Comparator>
-inline void SkipList<Key,Comparator>::Iterator::Prev() {
-  // Instead of using explicit "prev" links, we just search for the
-  // last node that falls before key.
- assert(Valid()); - node_ = list_->FindLessThan(node_->key); - if (node_ == list_->head_) { - node_ = NULL; - } -} - -template -inline void SkipList::Iterator::Seek(const Key& target) { - node_ = list_->FindGreaterOrEqual(target, NULL); -} - -template -inline void SkipList::Iterator::SeekToFirst() { - node_ = list_->head_->Next(0); -} - -template -inline void SkipList::Iterator::SeekToLast() { - node_ = list_->FindLast(); - if (node_ == list_->head_) { - node_ = NULL; - } -} - -template -int SkipList::RandomHeight() { - // Increase height with probability 1 in kBranching - static const unsigned int kBranching = 4; - int height = 1; - while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { - height++; - } - assert(height > 0); - assert(height <= kMaxHeight); - return height; -} - -template -bool SkipList::KeyIsAfterNode(const Key& key, Node* n) const { - // NULL n is considered infinite - return (n != NULL) && (compare_(n->key, key) < 0); -} - -template -typename SkipList::Node* SkipList::FindGreaterOrEqual(const Key& key, Node** prev) - const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - Node* next = x->Next(level); - if (KeyIsAfterNode(key, next)) { - // Keep searching in this list - x = next; - } else { - if (prev != NULL) prev[level] = x; - if (level == 0) { - return next; - } else { - // Switch to next list - level--; - } - } - } -} - -template -typename SkipList::Node* -SkipList::FindLessThan(const Key& key) const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - assert(x == head_ || compare_(x->key, key) < 0); - Node* next = x->Next(level); - if (next == NULL || compare_(next->key, key) >= 0) { - if (level == 0) { - return x; - } else { - // Switch to next list - level--; - } - } else { - x = next; - } - } -} - -template -typename SkipList::Node* SkipList::FindLast() - const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - Node* next = x->Next(level); - if (next == NULL) { - if (level == 0) { - return x; - } else { - // Switch to next list - level--; - } - } else { - x = next; - } - } -} - -template -SkipList::SkipList(Comparator cmp, Arena* arena) - : compare_(cmp), - arena_(arena), - head_(NewNode(0 /* any key will do */, kMaxHeight)), - max_height_(reinterpret_cast(1)), - rnd_(0xdeadbeef) { - for (int i = 0; i < kMaxHeight; i++) { - head_->SetNext(i, NULL); - } -} - -template -void SkipList::Insert(const Key& key) { - // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() - // here since Insert() is externally synchronized. - Node* prev[kMaxHeight]; - Node* x = FindGreaterOrEqual(key, prev); - - // Our data structure does not allow duplicate insertion - assert(x == NULL || !Equal(key, x->key)); - - int height = RandomHeight(); - if (height > GetMaxHeight()) { - for (int i = GetMaxHeight(); i < height; i++) { - prev[i] = head_; - } - //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); - - // It is ok to mutate max_height_ without any synchronization - // with concurrent readers. A concurrent reader that observes - // the new value of max_height_ will see either the old value of - // new level pointers from head_ (NULL), or a new value set in - // the loop below. In the former case the reader will - // immediately drop to the next level since NULL sorts after all - // keys. In the latter case the reader will use the new node. 
- max_height_.NoBarrier_Store(reinterpret_cast(height)); - } - - x = NewNode(key, height); - for (int i = 0; i < height; i++) { - // NoBarrier_SetNext() suffices since we will add a barrier when - // we publish a pointer to "x" in prev[i]. - x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i)); - prev[i]->SetNext(i, x); - } -} - -template -bool SkipList::Contains(const Key& key) const { - Node* x = FindGreaterOrEqual(key, NULL); - if (x != NULL && Equal(key, x->key)) { - return true; - } else { - return false; - } -} - -} diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc deleted file mode 100644 index 5f9ec0d..0000000 --- a/db/skiplist_test.cc +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/skiplist.h" -#include -#include "leveldb/env.h" -#include "util/arena.h" -#include "util/hash.h" -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { - -typedef uint64_t Key; - -struct Comparator { - int operator()(const Key& a, const Key& b) const { - if (a < b) { - return -1; - } else if (a > b) { - return +1; - } else { - return 0; - } - } -}; - -class SkipTest { }; - -TEST(SkipTest, Empty) { - Arena arena; - Comparator cmp; - SkipList list(cmp, &arena); - ASSERT_TRUE(!list.Contains(10)); - - SkipList::Iterator iter(&list); - ASSERT_TRUE(!iter.Valid()); - iter.SeekToFirst(); - ASSERT_TRUE(!iter.Valid()); - iter.Seek(100); - ASSERT_TRUE(!iter.Valid()); - iter.SeekToLast(); - ASSERT_TRUE(!iter.Valid()); -} - -TEST(SkipTest, InsertAndLookup) { - const int N = 2000; - const int R = 5000; - Random rnd(1000); - std::set keys; - Arena arena; - Comparator cmp; - SkipList list(cmp, &arena); - for (int i = 0; i < N; i++) { - Key key = rnd.Next() % R; - if (keys.insert(key).second) { - list.Insert(key); - } - } - - for (int i = 0; i < R; i++) { - if (list.Contains(i)) { - ASSERT_EQ(keys.count(i), 1); - } else { - ASSERT_EQ(keys.count(i), 0); - } - } - - // Simple iterator tests - { - SkipList::Iterator iter(&list); - ASSERT_TRUE(!iter.Valid()); - - iter.Seek(0); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.begin()), iter.key()); - - iter.SeekToFirst(); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.begin()), iter.key()); - - iter.SeekToLast(); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.rbegin()), iter.key()); - } - - // Forward iteration test - for (int i = 0; i < R; i++) { - SkipList::Iterator iter(&list); - iter.Seek(i); - - // Compare against model iterator - std::set::iterator model_iter = keys.lower_bound(i); - for (int j = 0; j < 3; j++) { - if (model_iter == keys.end()) { - ASSERT_TRUE(!iter.Valid()); - break; - } else { - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*model_iter, iter.key()); - ++model_iter; - iter.Next(); - } - } - } - - // Backward iteration test - { - SkipList::Iterator iter(&list); - iter.SeekToLast(); - - // Compare against model iterator - for (std::set::reverse_iterator model_iter = keys.rbegin(); - model_iter != keys.rend(); - ++model_iter) { - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*model_iter, iter.key()); - iter.Prev(); - } - ASSERT_TRUE(!iter.Valid()); - } -} - -// We want to make sure that with a single writer and multiple -// concurrent readers (with no synchronization other than when a -// reader's iterator is created), the reader always observes all the -// data that was present in the skip list when the iterator was -// 
constructed.  Because insertions are happening concurrently, we may
-// also observe new values that were inserted since the iterator was
-// constructed, but we should never miss any values that were present
-// at iterator construction time.
-//
-// We generate multi-part keys:
-//     <key,gen,hash>
-// where:
-//     key is in range [0..K-1]
-//     gen is a generation number for key
-//     hash is hash(key,gen)
-//
-// The insertion code picks a random key, sets gen to be 1 + the last
-// generation number inserted for that key, and sets hash to Hash(key,gen).
-//
-// At the beginning of a read, we snapshot the last inserted
-// generation number for each key.  We then iterate, including random
-// calls to Next() and Seek().  For every key we encounter, we
-// check that it is either expected given the initial snapshot or has
-// been concurrently added since the iterator started.
-class ConcurrentTest {
- private:
-  static const uint32_t K = 4;
-
-  static uint64_t key(Key key) { return (key >> 40); }
-  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
-  static uint64_t hash(Key key) { return key & 0xff; }
-
-  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
-    uint64_t data[2] = { k, g };
-    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
-  }
-
-  static Key MakeKey(uint64_t k, uint64_t g) {
-    assert(sizeof(Key) == sizeof(uint64_t));
-    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
-    assert(g <= 0xffffffffu);
-    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
-  }
-
-  static bool IsValidKey(Key k) {
-    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
-  }
-
-  static Key RandomTarget(Random* rnd) {
-    switch (rnd->Next() % 10) {
-      case 0:
-        // Seek to beginning
-        return MakeKey(0, 0);
-      case 1:
-        // Seek to end
-        return MakeKey(K, 0);
-      default:
-        // Seek to middle
-        return MakeKey(rnd->Next() % K, 0);
-    }
-  }
-
-  // Per-key generation
-  struct State {
-    port::AtomicPointer generation[K];
-    void Set(int k, intptr_t v) {
-      generation[k].Release_Store(reinterpret_cast<void*>(v));
-    }
-    intptr_t Get(int k) {
-      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
-    }
-
-    State() {
-      for (int k = 0; k < K; k++) {
-        Set(k, 0);
-      }
-    }
-  };
-
-  // Current state of the test
-  State current_;
-
-  Arena arena_;
-
-  // SkipList is not protected by mu_.  We just use a single writer
-  // thread to modify it.
-  SkipList<Key, Comparator> list_;
-
- public:
-  ConcurrentTest() : list_(Comparator(), &arena_) { }
-
-  // REQUIRES: External synchronization
-  void WriteStep(Random* rnd) {
-    const uint32_t k = rnd->Next() % K;
-    const intptr_t g = current_.Get(k) + 1;
-    const Key key = MakeKey(k, g);
-    list_.Insert(key);
-    current_.Set(k, g);
-  }
-
-  void ReadStep(Random* rnd) {
-    // Remember the initial committed state of the skiplist.
-    State initial_state;
-    for (int k = 0; k < K; k++) {
-      initial_state.Set(k, current_.Get(k));
-    }
-
-    Key pos = RandomTarget(rnd);
-    SkipList<Key, Comparator>::Iterator iter(&list_);
-    iter.Seek(pos);
-    while (true) {
-      Key current;
-      if (!iter.Valid()) {
-        current = MakeKey(K, 0);
-      } else {
-        current = iter.key();
-        ASSERT_TRUE(IsValidKey(current)) << std::hex << current;
-      }
-      ASSERT_LE(pos, current) << "should not go backwards";
-
-      // Verify that everything in [pos,current) was not present in
-      // initial_state.
-      while (pos < current) {
-        ASSERT_LT(key(pos), K) << std::hex << pos;
-
-        // Note that generation 0 is never inserted, so it is ok if
-        // <*,0,*> is missing.
- ASSERT_TRUE((gen(pos) == 0) || - (gen(pos) > initial_state.Get(key(pos))) - ) << "key: " << key(pos) - << "; gen: " << gen(pos) - << "; initgen: " - << initial_state.Get(key(pos)); - - // Advance to next key in the valid key space - if (key(pos) < key(current)) { - pos = MakeKey(key(pos) + 1, 0); - } else { - pos = MakeKey(key(pos), gen(pos) + 1); - } - } - - if (!iter.Valid()) { - break; - } - - if (rnd->Next() % 2) { - iter.Next(); - pos = MakeKey(key(pos), gen(pos) + 1); - } else { - Key new_target = RandomTarget(rnd); - if (new_target > pos) { - pos = new_target; - iter.Seek(new_target); - } - } - } - } -}; -const uint32_t ConcurrentTest::K; - -// Simple test that does single-threaded testing of the ConcurrentTest -// scaffolding. -TEST(SkipTest, ConcurrentWithoutThreads) { - ConcurrentTest test; - Random rnd(test::RandomSeed()); - for (int i = 0; i < 10000; i++) { - test.ReadStep(&rnd); - test.WriteStep(&rnd); - } -} - -class TestState { - public: - ConcurrentTest t_; - int seed_; - port::AtomicPointer quit_flag_; - - enum ReaderState { - STARTING, - RUNNING, - DONE - }; - - explicit TestState(int s) - : seed_(s), - quit_flag_(NULL), - state_(STARTING), - state_cv_(&mu_) {} - - void Wait(ReaderState s) { - mu_.Lock(); - while (state_ != s) { - state_cv_.Wait(); - } - mu_.Unlock(); - } - - void Change(ReaderState s) { - mu_.Lock(); - state_ = s; - state_cv_.Signal(); - mu_.Unlock(); - } - - private: - port::Mutex mu_; - ReaderState state_; - port::CondVar state_cv_; -}; - -static void ConcurrentReader(void* arg) { - TestState* state = reinterpret_cast(arg); - Random rnd(state->seed_); - int64_t reads = 0; - state->Change(TestState::RUNNING); - while (!state->quit_flag_.Acquire_Load()) { - state->t_.ReadStep(&rnd); - ++reads; - } - state->Change(TestState::DONE); -} - -static void RunConcurrent(int run) { - const int seed = test::RandomSeed() + (run * 100); - Random rnd(seed); - const int N = 1000; - const int kSize = 1000; - for (int i = 0; i < N; i++) { - if ((i % 100) == 0) { - fprintf(stderr, "Run %d of %d\n", i, N); - } - TestState state(seed + 1); - Env::Default()->Schedule(ConcurrentReader, &state); - state.Wait(TestState::RUNNING); - for (int i = 0; i < kSize; i++) { - state.t_.WriteStep(&rnd); - } - state.quit_flag_.Release_Store(&state); // Any non-NULL arg will do - state.Wait(TestState::DONE); - } -} - -TEST(SkipTest, Concurrent1) { RunConcurrent(1); } -TEST(SkipTest, Concurrent2) { RunConcurrent(2); } -TEST(SkipTest, Concurrent3) { RunConcurrent(3); } -TEST(SkipTest, Concurrent4) { RunConcurrent(4); } -TEST(SkipTest, Concurrent5) { RunConcurrent(5); } - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/db/snapshot.h b/db/snapshot.h deleted file mode 100644 index 9a90756..0000000 --- a/db/snapshot.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_ -#define STORAGE_LEVELDB_DB_SNAPSHOT_H_ - -#include "leveldb/db.h" - -namespace leveldb { - -class SnapshotList; - -// Snapshots are kept in a doubly-linked list in the DB. -// Each Snapshot corresponds to a particular sequence number. 
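[Editor's note: a brief usage sketch of the snapshot list declared below.
The driver code is hypothetical, and the DB is assumed to hold its mutex
around each call; snapshot.h itself does not state a locking requirement.]

    SnapshotList snapshots;
    const Snapshot* a = snapshots.New(100);  // oldest() == newest() == a
    const Snapshot* b = snapshots.New(200);  // appended at tail: newest() == b
    snapshots.Delete(a);                     // unlinks and deletes; oldest() == b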
-class Snapshot { - public: - SequenceNumber number_; // const after creation - - private: - friend class SnapshotList; - - // Snapshot is kept in a doubly-linked circular list - Snapshot* prev_; - Snapshot* next_; - - SnapshotList* list_; // just for sanity checks -}; - -class SnapshotList { - public: - SnapshotList() { - list_.prev_ = &list_; - list_.next_ = &list_; - } - - bool empty() const { return list_.next_ == &list_; } - Snapshot* oldest() const { assert(!empty()); return list_.next_; } - Snapshot* newest() const { assert(!empty()); return list_.prev_; } - - const Snapshot* New(SequenceNumber seq) { - Snapshot* s = new Snapshot; - s->number_ = seq; - s->list_ = this; - s->next_ = &list_; - s->prev_ = list_.prev_; - s->prev_->next_ = s; - s->next_->prev_ = s; - return s; - } - - void Delete(const Snapshot* s) { - assert(s->list_ == this); - s->prev_->next_ = s->next_; - s->next_->prev_ = s->prev_; - delete s; - } - - private: - // Dummy head of doubly-linked list of snapshots - Snapshot list_; -}; - -} - -#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_ diff --git a/db/table_cache.cc b/db/table_cache.cc deleted file mode 100644 index 325d707..0000000 --- a/db/table_cache.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/table_cache.h" - -#include "db/filename.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "util/coding.h" - -namespace leveldb { - -struct TableAndFile { - RandomAccessFile* file; - Table* table; -}; - -static void DeleteEntry(const Slice& key, void* value) { - TableAndFile* tf = reinterpret_cast(value); - delete tf->table; - delete tf->file; - delete tf; -} - -static void UnrefEntry(void* arg1, void* arg2) { - Cache* cache = reinterpret_cast(arg1); - Cache::Handle* h = reinterpret_cast(arg2); - cache->Release(h); -} - -TableCache::TableCache(const std::string& dbname, - const Options* options, - int entries) - : env_(options->env), - dbname_(dbname), - options_(options), - cache_(NewLRUCache(entries)) { -} - -TableCache::~TableCache() { - delete cache_; -} - -Iterator* TableCache::NewIterator(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - Table** tableptr) { - if (tableptr != NULL) { - *tableptr = NULL; - } - - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - Slice key(buf, sizeof(buf)); - Cache::Handle* handle = cache_->Lookup(key); - if (handle == NULL) { - std::string fname = TableFileName(dbname_, file_number); - RandomAccessFile* file = NULL; - Table* table = NULL; - Status s = env_->NewRandomAccessFile(fname, &file); - if (s.ok()) { - s = Table::Open(*options_, file, file_size, &table); - } - - if (!s.ok()) { - assert(table == NULL); - delete file; - // We do not cache error results so that if the error is transient, - // or somebody repairs the file, we recover automatically. 
- return NewErrorIterator(s); - } - - TableAndFile* tf = new TableAndFile; - tf->file = file; - tf->table = table; - handle = cache_->Insert(key, tf, 1, &DeleteEntry); - } - - Table* table = reinterpret_cast(cache_->Value(handle))->table; - Iterator* result = table->NewIterator(options); - result->RegisterCleanup(&UnrefEntry, cache_, handle); - if (tableptr != NULL) { - *tableptr = table; - } - return result; -} - -void TableCache::Evict(uint64_t file_number) { - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - cache_->Erase(Slice(buf, sizeof(buf))); -} - -} diff --git a/db/table_cache.h b/db/table_cache.h deleted file mode 100644 index 5376194..0000000 --- a/db/table_cache.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Thread-safe (provides internal synchronization) - -#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_ -#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_ - -#include -#include -#include "db/dbformat.h" -#include "leveldb/cache.h" -#include "leveldb/table.h" -#include "port/port.h" - -namespace leveldb { - -class Env; - -class TableCache { - public: - TableCache(const std::string& dbname, const Options* options, int entries); - ~TableCache(); - - // Return an iterator for the specified file number (the corresponding - // file length must be exactly "file_size" bytes). If "tableptr" is - // non-NULL, also sets "*tableptr" to point to the Table object - // underlying the returned iterator, or NULL if no Table object underlies - // the returned iterator. The returned "*tableptr" object is owned by - // the cache and should not be deleted, and is valid for as long as the - // returned iterator is live. - Iterator* NewIterator(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - Table** tableptr = NULL); - - // Evict any entry for the specified file number - void Evict(uint64_t file_number); - - private: - Env* const env_; - const std::string dbname_; - const Options* options_; - Cache* cache_; -}; - -} - -#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_ diff --git a/db/version_edit.cc b/db/version_edit.cc deleted file mode 100644 index 689dbe0..0000000 --- a/db/version_edit.cc +++ /dev/null @@ -1,301 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_edit.h" - -#include "db/version_set.h" -#include "util/coding.h" - -namespace leveldb { - -// Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. 
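[Editor's note: the encoding produced by EncodeTo() below is a concatenation
of (varint32 tag, payload) pairs. A hypothetical two-field edit illustrates
the byte layout; sketch only, using the varint helpers from util/coding.h,
with the field values 25 and 26 invented for the example.]

    std::string dst;
    PutVarint32(&dst, 2);    // kLogNumber tag
    PutVarint64(&dst, 25);   // payload: log number
    PutVarint32(&dst, 3);    // kNextFileNumber tag
    PutVarint64(&dst, 26);   // payload: next file number
    // dst == "\x02\x19\x03\x1a"; each value here fits in a one-byte varint.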
-enum Tag { - kComparator = 1, - kLogNumber = 2, - kNextFileNumber = 3, - kLastSequence = 4, - kCompactPointer = 5, - kDeletedFile = 6, - kNewFile = 7, - kLargeValueRef = 8, - kPrevLogNumber = 9, -}; - -void VersionEdit::Clear() { - comparator_.clear(); - log_number_ = 0; - prev_log_number_ = 0; - last_sequence_ = 0; - next_file_number_ = 0; - has_comparator_ = false; - has_log_number_ = false; - has_prev_log_number_ = false; - has_next_file_number_ = false; - has_last_sequence_ = false; - deleted_files_.clear(); - new_files_.clear(); - large_refs_added_.clear(); -} - -void VersionEdit::EncodeTo(std::string* dst) const { - if (has_comparator_) { - PutVarint32(dst, kComparator); - PutLengthPrefixedSlice(dst, comparator_); - } - if (has_log_number_) { - PutVarint32(dst, kLogNumber); - PutVarint64(dst, log_number_); - } - if (has_prev_log_number_) { - PutVarint32(dst, kPrevLogNumber); - PutVarint64(dst, prev_log_number_); - } - if (has_next_file_number_) { - PutVarint32(dst, kNextFileNumber); - PutVarint64(dst, next_file_number_); - } - if (has_last_sequence_) { - PutVarint32(dst, kLastSequence); - PutVarint64(dst, last_sequence_); - } - - for (int i = 0; i < compact_pointers_.size(); i++) { - PutVarint32(dst, kCompactPointer); - PutVarint32(dst, compact_pointers_[i].first); // level - PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); - } - - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { - PutVarint32(dst, kDeletedFile); - PutVarint32(dst, iter->first); // level - PutVarint64(dst, iter->second); // file number - } - - for (int i = 0; i < new_files_.size(); i++) { - const FileMetaData& f = new_files_[i].second; - PutVarint32(dst, kNewFile); - PutVarint32(dst, new_files_[i].first); // level - PutVarint64(dst, f.number); - PutVarint64(dst, f.file_size); - PutLengthPrefixedSlice(dst, f.smallest.Encode()); - PutLengthPrefixedSlice(dst, f.largest.Encode()); - } - - for (int i = 0; i < large_refs_added_.size(); i++) { - const VersionEdit::Large& l = large_refs_added_[i]; - PutVarint32(dst, kLargeValueRef); - PutLengthPrefixedSlice(dst, - Slice(l.large_ref.data, LargeValueRef::ByteSize())); - PutVarint64(dst, l.fnum); - PutLengthPrefixedSlice(dst, l.internal_key.Encode()); - } -} - -static bool GetInternalKey(Slice* input, InternalKey* dst) { - Slice str; - if (GetLengthPrefixedSlice(input, &str)) { - dst->DecodeFrom(str); - return true; - } else { - return false; - } -} - -static bool GetLevel(Slice* input, int* level) { - uint32_t v; - if (GetVarint32(input, &v) && - v < config::kNumLevels) { - *level = v; - return true; - } else { - return false; - } -} - -Status VersionEdit::DecodeFrom(const Slice& src) { - Clear(); - Slice input = src; - const char* msg = NULL; - uint32_t tag; - - // Temporary storage for parsing - int level; - uint64_t number; - FileMetaData f; - Slice str; - Large large; - InternalKey key; - - while (msg == NULL && GetVarint32(&input, &tag)) { - switch (tag) { - case kComparator: - if (GetLengthPrefixedSlice(&input, &str)) { - comparator_ = str.ToString(); - has_comparator_ = true; - } else { - msg = "comparator name"; - } - break; - - case kLogNumber: - if (GetVarint64(&input, &log_number_)) { - has_log_number_ = true; - } else { - msg = "log number"; - } - break; - - case kPrevLogNumber: - if (GetVarint64(&input, &prev_log_number_)) { - has_prev_log_number_ = true; - } else { - msg = "previous log number"; - } - break; - - case kNextFileNumber: - if (GetVarint64(&input, &next_file_number_)) { - 
has_next_file_number_ = true; - } else { - msg = "next file number"; - } - break; - - case kLastSequence: - if (GetVarint64(&input, &last_sequence_)) { - has_last_sequence_ = true; - } else { - msg = "last sequence number"; - } - break; - - case kCompactPointer: - if (GetLevel(&input, &level) && - GetInternalKey(&input, &key)) { - compact_pointers_.push_back(std::make_pair(level, key)); - } else { - msg = "compaction pointer"; - } - break; - - case kDeletedFile: - if (GetLevel(&input, &level) && - GetVarint64(&input, &number)) { - deleted_files_.insert(std::make_pair(level, number)); - } else { - msg = "deleted file"; - } - break; - - case kNewFile: - if (GetLevel(&input, &level) && - GetVarint64(&input, &f.number) && - GetVarint64(&input, &f.file_size) && - GetInternalKey(&input, &f.smallest) && - GetInternalKey(&input, &f.largest)) { - new_files_.push_back(std::make_pair(level, f)); - } else { - msg = "new-file entry"; - } - break; - - case kLargeValueRef: - if (GetLengthPrefixedSlice(&input, &str) && - (str.size() == LargeValueRef::ByteSize()) && - GetVarint64(&input, &large.fnum) && - GetInternalKey(&input, &large.internal_key)) { - large.large_ref = LargeValueRef::FromRef(str); - large_refs_added_.push_back(large); - } else { - msg = "large ref"; - } - break; - - default: - msg = "unknown tag"; - break; - } - } - - if (msg == NULL && !input.empty()) { - msg = "invalid tag"; - } - - Status result; - if (msg != NULL) { - result = Status::Corruption("VersionEdit", msg); - } - return result; -} - -std::string VersionEdit::DebugString() const { - std::string r; - r.append("VersionEdit {"); - if (has_comparator_) { - r.append("\n Comparator: "); - r.append(comparator_); - } - if (has_log_number_) { - r.append("\n LogNumber: "); - AppendNumberTo(&r, log_number_); - } - if (has_prev_log_number_) { - r.append("\n PrevLogNumber: "); - AppendNumberTo(&r, prev_log_number_); - } - if (has_next_file_number_) { - r.append("\n NextFile: "); - AppendNumberTo(&r, next_file_number_); - } - if (has_last_sequence_) { - r.append("\n LastSeq: "); - AppendNumberTo(&r, last_sequence_); - } - for (int i = 0; i < compact_pointers_.size(); i++) { - r.append("\n CompactPointer: "); - AppendNumberTo(&r, compact_pointers_[i].first); - r.append(" '"); - AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode()); - r.append("'"); - } - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { - r.append("\n DeleteFile: "); - AppendNumberTo(&r, iter->first); - r.append(" "); - AppendNumberTo(&r, iter->second); - } - for (int i = 0; i < new_files_.size(); i++) { - const FileMetaData& f = new_files_[i].second; - r.append("\n AddFile: "); - AppendNumberTo(&r, new_files_[i].first); - r.append(" "); - AppendNumberTo(&r, f.number); - r.append(" "); - AppendNumberTo(&r, f.file_size); - r.append(" '"); - AppendEscapedStringTo(&r, f.smallest.Encode()); - r.append("' .. 
'"); - AppendEscapedStringTo(&r, f.largest.Encode()); - r.append("'"); - } - for (int i = 0; i < large_refs_added_.size(); i++) { - const VersionEdit::Large& l = large_refs_added_[i]; - r.append("\n LargeRef: "); - AppendNumberTo(&r, l.fnum); - r.append(" "); - r.append(LargeValueRefToFilenameString(l.large_ref)); - r.append(" '"); - AppendEscapedStringTo(&r, l.internal_key.Encode()); - r.append("'"); - } - r.append("\n}\n"); - return r; -} - -} diff --git a/db/version_edit.h b/db/version_edit.h deleted file mode 100644 index 7e417b5..0000000 --- a/db/version_edit.h +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_ -#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_ - -#include -#include -#include -#include "db/dbformat.h" - -namespace leveldb { - -class VersionSet; - -struct FileMetaData { - int refs; - uint64_t number; - uint64_t file_size; // File size in bytes - InternalKey smallest; // Smallest internal key served by table - InternalKey largest; // Largest internal key served by table - - FileMetaData() : refs(0), file_size(0) { } -}; - -class VersionEdit { - public: - VersionEdit() { Clear(); } - ~VersionEdit() { } - - void Clear(); - - void SetComparatorName(const Slice& name) { - has_comparator_ = true; - comparator_ = name.ToString(); - } - void SetLogNumber(uint64_t num) { - has_log_number_ = true; - log_number_ = num; - } - void SetPrevLogNumber(uint64_t num) { - has_prev_log_number_ = true; - prev_log_number_ = num; - } - void SetNextFile(uint64_t num) { - has_next_file_number_ = true; - next_file_number_ = num; - } - void SetLastSequence(SequenceNumber seq) { - has_last_sequence_ = true; - last_sequence_ = seq; - } - void SetCompactPointer(int level, const InternalKey& key) { - compact_pointers_.push_back(std::make_pair(level, key)); - } - - // Add the specified file at the specified number. - // REQUIRES: This version has not been saved (see VersionSet::SaveTo) - // REQUIRES: "smallest" and "largest" are smallest and largest keys in file - void AddFile(int level, uint64_t file, - uint64_t file_size, - const InternalKey& smallest, - const InternalKey& largest) { - FileMetaData f; - f.number = file; - f.file_size = file_size; - f.smallest = smallest; - f.largest = largest; - new_files_.push_back(std::make_pair(level, f)); - } - - // Delete the specified "file" from the specified "level". 
- void DeleteFile(int level, uint64_t file) { - deleted_files_.insert(std::make_pair(level, file)); - } - - // Record that a large value with the specified large_ref was - // written to the output file numbered "fnum" - void AddLargeValueRef(const LargeValueRef& large_ref, - uint64_t fnum, - const Slice& internal_key) { - large_refs_added_.resize(large_refs_added_.size() + 1); - Large* large = &(large_refs_added_.back()); - large->large_ref = large_ref; - large->fnum = fnum; - large->internal_key.DecodeFrom(internal_key); - } - - void EncodeTo(std::string* dst) const; - Status DecodeFrom(const Slice& src); - - std::string DebugString() const; - - private: - friend class VersionSet; - - typedef std::set< std::pair > DeletedFileSet; - - std::string comparator_; - uint64_t log_number_; - uint64_t prev_log_number_; - uint64_t next_file_number_; - SequenceNumber last_sequence_; - bool has_comparator_; - bool has_log_number_; - bool has_prev_log_number_; - bool has_next_file_number_; - bool has_last_sequence_; - - std::vector< std::pair > compact_pointers_; - DeletedFileSet deleted_files_; - std::vector< std::pair > new_files_; - struct Large { - LargeValueRef large_ref; - uint64_t fnum; - InternalKey internal_key; - }; - std::vector large_refs_added_; -}; - -} - -#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_ diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc deleted file mode 100644 index 6906ec3..0000000 --- a/db/version_edit_test.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_edit.h" -#include "util/testharness.h" - -namespace leveldb { - -static void TestEncodeDecode(const VersionEdit& edit) { - std::string encoded, encoded2; - edit.EncodeTo(&encoded); - VersionEdit parsed; - Status s = parsed.DecodeFrom(encoded); - ASSERT_TRUE(s.ok()) << s.ToString(); - parsed.EncodeTo(&encoded2); - ASSERT_EQ(encoded, encoded2); -} - -class VersionEditTest { }; - -TEST(VersionEditTest, EncodeDecode) { - static const uint64_t kBig = 1ull << 50; - - VersionEdit edit; - for (int i = 0; i < 4; i++) { - TestEncodeDecode(edit); - edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, - InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef), - InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); - edit.DeleteFile(4, kBig + 700 + i); - edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression), - kBig + 800 + i, "foobar"); - edit.AddLargeValueRef(LargeValueRef::Make("big2", kSnappyCompression), - kBig + 801 + i, "baz"); - edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); - } - - edit.SetComparatorName("foo"); - edit.SetLogNumber(kBig + 100); - edit.SetNextFile(kBig + 200); - edit.SetLastSequence(kBig + 1000); - TestEncodeDecode(edit); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/db/version_set.cc b/db/version_set.cc deleted file mode 100644 index 31f79bb..0000000 --- a/db/version_set.cc +++ /dev/null @@ -1,1120 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
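[Editor's note: for orientation in the file that follows, the size targets
computed by MaxBytesForLevel(), defined just after the includes, grow by a
factor of ten per level. Worked values of the code below, not new behavior:]

    // MaxBytesForLevel(level):
    //   level 1 ->   10 * 1048576.0   (~10 MB)
    //   level 2 ->  100 * 1048576.0   (~100 MB)
    //   level 3 -> 1000 * 1048576.0   (~1 GB)
    // Level 0 is special: its compaction trigger is file count, not bytes.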
- -#include "db/version_set.h" - -#include -#include -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "leveldb/env.h" -#include "leveldb/table_builder.h" -#include "table/merger.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -static const int kTargetFileSize = 2 * 1048576; - -// Maximum bytes of overlaps in grandparent (i.e., level+2) before we -// stop building a single file in a level->level+1 compaction. -static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize; - -static double MaxBytesForLevel(int level) { - // Note: the result for level zero is not really used since we set - // the level-0 compaction threshold based on number of files. - double result = 10 * 1048576.0; // Result for both level-0 and level-1 - while (level > 1) { - result *= 10; - level--; - } - return result; -} - -static uint64_t MaxFileSizeForLevel(int level) { - return kTargetFileSize; // We could vary per level to reduce number of files? -} - -namespace { -std::string IntSetToString(const std::set& s) { - std::string result = "{"; - for (std::set::const_iterator it = s.begin(); - it != s.end(); - ++it) { - result += (result.size() > 1) ? "," : ""; - result += NumberToString(*it); - } - result += "}"; - return result; -} -} - -Version::~Version() { - assert(refs_ == 0); - for (int level = 0; level < config::kNumLevels; level++) { - for (int i = 0; i < files_[level].size(); i++) { - FileMetaData* f = files_[level][i]; - assert(f->refs >= 0); - f->refs--; - if (f->refs <= 0) { - delete f; - } - } - } - delete cleanup_mem_; -} - -// An internal iterator. For a given version/level pair, yields -// information about the files in the level. For a given entry, key() -// is the largest key that occurs in the file, and value() is an -// 16-byte value containing the file number and file size, both -// encoded using EncodeFixed64. -class Version::LevelFileNumIterator : public Iterator { - public: - LevelFileNumIterator(const Version* version, - const std::vector* flist) - : icmp_(version->vset_->icmp_.user_comparator()), - flist_(flist), - index_(flist->size()) { // Marks as invalid - } - virtual bool Valid() const { - return index_ < flist_->size(); - } - virtual void Seek(const Slice& target) { - uint32_t left = 0; - uint32_t right = flist_->size() - 1; - while (left < right) { - uint32_t mid = (left + right) / 2; - int cmp = icmp_.Compare((*flist_)[mid]->largest.Encode(), target); - if (cmp < 0) { - // Key at "mid.largest" is < than "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - index_ = left; - } - virtual void SeekToFirst() { index_ = 0; } - virtual void SeekToLast() { - index_ = flist_->empty() ? 
0 : flist_->size() - 1; - } - virtual void Next() { - assert(Valid()); - index_++; - } - virtual void Prev() { - assert(Valid()); - if (index_ == 0) { - index_ = flist_->size(); // Marks as invalid - } else { - index_--; - } - } - Slice key() const { - assert(Valid()); - return (*flist_)[index_]->largest.Encode(); - } - Slice value() const { - assert(Valid()); - EncodeFixed64(value_buf_, (*flist_)[index_]->number); - EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size); - return Slice(value_buf_, sizeof(value_buf_)); - } - virtual Status status() const { return Status::OK(); } - private: - const InternalKeyComparator icmp_; - const std::vector* const flist_; - int index_; - - // Backing store for value(). Holds the file number and size. - mutable char value_buf_[16]; -}; - -static Iterator* GetFileIterator(void* arg, - const ReadOptions& options, - const Slice& file_value) { - TableCache* cache = reinterpret_cast(arg); - if (file_value.size() != 16) { - return NewErrorIterator( - Status::Corruption("FileReader invoked with unexpected value")); - } else { - return cache->NewIterator(options, - DecodeFixed64(file_value.data()), - DecodeFixed64(file_value.data() + 8)); - } -} - -Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, - int level) const { - return NewTwoLevelIterator( - new LevelFileNumIterator(this, &files_[level]), - &GetFileIterator, vset_->table_cache_, options); -} - -void Version::AddIterators(const ReadOptions& options, - std::vector* iters) { - // Merge all level zero files together since they may overlap - for (int i = 0; i < files_[0].size(); i++) { - iters->push_back( - vset_->table_cache_->NewIterator( - options, files_[0][i]->number, files_[0][i]->file_size)); - } - - // For levels > 0, we can use a concatenating iterator that sequentially - // walks through the non-overlapping files in the level, opening them - // lazily. - for (int level = 1; level < config::kNumLevels; level++) { - if (!files_[level].empty()) { - iters->push_back(NewConcatenatingIterator(options, level)); - } - } -} - -void Version::Ref() { - ++refs_; -} - -void Version::Unref() { - assert(refs_ >= 1); - --refs_; - if (refs_ == 0) { - vset_->MaybeDeleteOldVersions(); - // TODO: try to delete obsolete files - } -} - -std::string Version::DebugString() const { - std::string r; - for (int level = 0; level < config::kNumLevels; level++) { - // E.g., level 1: 17:123['a' .. 'd'] 20:43['e' .. 'g'] - r.append("level "); - AppendNumberTo(&r, level); - r.push_back(':'); - const std::vector& files = files_[level]; - for (int i = 0; i < files.size(); i++) { - r.push_back(' '); - AppendNumberTo(&r, files[i]->number); - r.push_back(':'); - AppendNumberTo(&r, files[i]->file_size); - r.append("['"); - AppendEscapedStringTo(&r, files[i]->smallest.Encode()); - r.append("' .. '"); - AppendEscapedStringTo(&r, files[i]->largest.Encode()); - r.append("']"); - } - r.push_back('\n'); - } - return r; -} - -// A helper class so we can efficiently apply a whole sequence -// of edits to a particular state without creating intermediate -// Versions that contain full copies of the intermediate state. 
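[Editor's note: the intended call pattern for the Builder declared below is
build-apply-save, exactly as LogAndApply() does further down; sketch with
error handling elided.]

    Version* v = new Version(this);
    {
      Builder builder(this, current_);  // seed with the current version's files
      builder.Apply(edit);              // deletions, additions, compaction ptrs
      builder.SaveTo(v);                // emit the merged per-level file lists
    }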
-class VersionSet::Builder { - private: - typedef std::map FileMap; - VersionSet* vset_; - FileMap files_[config::kNumLevels]; - - public: - // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) - : vset_(vset) { - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = base->files_[level]; - for (int i = 0; i < files.size(); i++) { - FileMetaData* f = files[i]; - f->refs++; - files_[level].insert(std::make_pair(f->number, f)); - } - } - } - - ~Builder() { - for (int level = 0; level < config::kNumLevels; level++) { - const FileMap& fmap = files_[level]; - for (FileMap::const_iterator iter = fmap.begin(); - iter != fmap.end(); - ++iter) { - FileMetaData* f = iter->second; - f->refs--; - if (f->refs <= 0) { - delete f; - } - } - } - } - - // Apply all of the edits in *edit to the current state. - void Apply(VersionEdit* edit) { - // Update compaction pointers - for (int i = 0; i < edit->compact_pointers_.size(); i++) { - const int level = edit->compact_pointers_[i].first; - vset_->compact_pointer_[level] = - edit->compact_pointers_[i].second.Encode().ToString(); - } - - // Delete files - const VersionEdit::DeletedFileSet& del = edit->deleted_files_; - for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); - iter != del.end(); - ++iter) { - const int level = iter->first; - const uint64_t number = iter->second; - FileMap::iterator fiter = files_[level].find(number); - assert(fiter != files_[level].end()); // Sanity check for debug mode - if (fiter != files_[level].end()) { - FileMetaData* f = fiter->second; - f->refs--; - if (f->refs <= 0) { - delete f; - } - files_[level].erase(fiter); - } - } - - // Add new files - for (int i = 0; i < edit->new_files_.size(); i++) { - const int level = edit->new_files_[i].first; - FileMetaData* f = new FileMetaData(edit->new_files_[i].second); - f->refs = 1; - assert(files_[level].count(f->number) == 0); - files_[level].insert(std::make_pair(f->number, f)); - } - - // Add large value refs - for (int i = 0; i < edit->large_refs_added_.size(); i++) { - const VersionEdit::Large& l = edit->large_refs_added_[i]; - vset_->RegisterLargeValueRef(l.large_ref, l.fnum, l.internal_key); - } - } - - // Save the current state in *v. 
- void SaveTo(Version* v) { - for (int level = 0; level < config::kNumLevels; level++) { - const FileMap& fmap = files_[level]; - for (FileMap::const_iterator iter = fmap.begin(); - iter != fmap.end(); - ++iter) { - FileMetaData* f = iter->second; - f->refs++; - v->files_[level].push_back(f); - } - } - } -}; - -VersionSet::VersionSet(const std::string& dbname, - const Options* options, - TableCache* table_cache, - const InternalKeyComparator* cmp) - : env_(options->env), - dbname_(dbname), - options_(options), - table_cache_(table_cache), - icmp_(*cmp), - next_file_number_(2), - manifest_file_number_(0), // Filled by Recover() - last_sequence_(0), - log_number_(0), - prev_log_number_(0), - descriptor_file_(NULL), - descriptor_log_(NULL), - current_(new Version(this)), - oldest_(current_) { -} - -VersionSet::~VersionSet() { - for (Version* v = oldest_; v != NULL; ) { - Version* next = v->next_; - assert(v->refs_ == 0); - delete v; - v = next; - } - delete descriptor_log_; - delete descriptor_file_; -} - -Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { - if (edit->has_log_number_) { - assert(edit->log_number_ >= log_number_); - assert(edit->log_number_ < next_file_number_); - } else { - edit->SetLogNumber(log_number_); - } - - if (!edit->has_prev_log_number_) { - edit->SetPrevLogNumber(prev_log_number_); - } - - edit->SetNextFile(next_file_number_); - edit->SetLastSequence(last_sequence_); - - Version* v = new Version(this); - { - Builder builder(this, current_); - builder.Apply(edit); - builder.SaveTo(v); - } - - std::string new_manifest_file; - Status s = Finalize(v); - - // Initialize new descriptor log file if necessary by creating - // a temporary file that contains a snapshot of the current version. - if (s.ok()) { - if (descriptor_log_ == NULL) { - assert(descriptor_file_ == NULL); - new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); - edit->SetNextFile(next_file_number_); - s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); - if (s.ok()) { - descriptor_log_ = new log::Writer(descriptor_file_); - s = WriteSnapshot(descriptor_log_); - } - } - } - - // Write new record to MANIFEST log - if (s.ok()) { - std::string record; - edit->EncodeTo(&record); - s = descriptor_log_->AddRecord(record); - if (s.ok()) { - s = descriptor_file_->Sync(); - } - } - - // If we just created a new descriptor file, install it by writing a - // new CURRENT file that points to it. 
- if (s.ok() && !new_manifest_file.empty()) { - s = SetCurrentFile(env_, dbname_, manifest_file_number_); - } - - // Install the new version - if (s.ok()) { - assert(current_->next_ == NULL); - assert(current_->cleanup_mem_ == NULL); - current_->cleanup_mem_ = cleanup_mem; - v->next_ = NULL; - current_->next_ = v; - current_ = v; - log_number_ = edit->log_number_; - prev_log_number_ = edit->prev_log_number_; - } else { - delete v; - if (!new_manifest_file.empty()) { - delete descriptor_log_; - delete descriptor_file_; - descriptor_log_ = NULL; - descriptor_file_ = NULL; - env_->DeleteFile(new_manifest_file); - } - } - - return s; -} - -Status VersionSet::Recover() { - struct LogReporter : public log::Reader::Reporter { - Status* status; - virtual void Corruption(size_t bytes, const Status& s) { - if (this->status->ok()) *this->status = s; - } - }; - - // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); - if (!s.ok()) { - return s; - } - if (current.empty() || current[current.size()-1] != '\n') { - return Status::Corruption("CURRENT file does not end with newline"); - } - current.resize(current.size() - 1); - - std::string dscname = dbname_ + "/" + current; - SequentialFile* file; - s = env_->NewSequentialFile(dscname, &file); - if (!s.ok()) { - return s; - } - - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t prev_log_number = 0; - Builder builder(this, current_); - - { - LogReporter reporter; - reporter.status = &s; - log::Reader reader(file, &reporter, true/*checksum*/); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument( - edit.comparator_ + "does not match existing comparator ", - icmp_.user_comparator()->Name()); - } - } - - if (s.ok()) { - builder.Apply(&edit); - } - - if (edit.has_log_number_) { - log_number = edit.log_number_; - have_log_number = true; - } - - if (edit.has_prev_log_number_) { - prev_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; - } - } - } - delete file; - file = NULL; - - if (s.ok()) { - if (!have_next_file) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - } else if (!have_log_number) { - s = Status::Corruption("no meta-lognumber entry in descriptor"); - } else if (!have_last_sequence) { - s = Status::Corruption("no last-sequence-number entry in descriptor"); - } - - if (!have_prev_log_number) { - prev_log_number = 0; - } - } - - if (s.ok()) { - Version* v = new Version(this); - builder.SaveTo(v); - s = Finalize(v); - if (!s.ok()) { - delete v; - } else { - // Install recovered version - v->next_ = NULL; - current_->next_ = v; - current_ = v; - manifest_file_number_ = next_file; - next_file_number_ = next_file + 1; - last_sequence_ = last_sequence; - log_number_ = log_number; - prev_log_number_ = prev_log_number; - } - } - - return s; -} - -static int64_t TotalFileSize(const std::vector& files) { - 
int64_t sum = 0; - for (int i = 0; i < files.size(); i++) { - sum += files[i]->file_size; - } - return sum; -} - -Status VersionSet::Finalize(Version* v) { - // Precomputed best level for next compaction - int best_level = -1; - double best_score = -1; - - Status s; - for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) { - s = SortLevel(v, level); - - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). - score = v->files_[level].size() / 4.0; - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]); - score = static_cast(level_bytes) / MaxBytesForLevel(level); - } - - if (score > best_score) { - best_level = level; - best_score = score; - } - } - - v->compaction_level_ = best_level; - v->compaction_score_ = best_score; - return s; -} - -Status VersionSet::WriteSnapshot(log::Writer* log) { - // TODO: Break up into multiple records to reduce memory usage on recovery? - - // Save metadata - VersionEdit edit; - edit.SetComparatorName(icmp_.user_comparator()->Name()); - - // Save compaction pointers - for (int level = 0; level < config::kNumLevels; level++) { - if (!compact_pointer_[level].empty()) { - InternalKey key; - key.DecodeFrom(compact_pointer_[level]); - edit.SetCompactPointer(level, key); - } - } - - // Save files - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = current_->files_[level]; - for (int i = 0; i < files.size(); i++) { - const FileMetaData* f = files[i]; - edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest); - } - } - - // Save large value refs - for (LargeValueMap::const_iterator it = large_value_refs_.begin(); - it != large_value_refs_.end(); - ++it) { - const LargeValueRef& ref = it->first; - const LargeReferencesSet& pointers = it->second; - for (LargeReferencesSet::const_iterator j = pointers.begin(); - j != pointers.end(); - ++j) { - edit.AddLargeValueRef(ref, j->first, j->second); - } - } - - std::string record; - edit.EncodeTo(&record); - return log->AddRecord(record); -} - -// Helper to sort by tables_[file_number].smallest -struct VersionSet::BySmallestKey { - const InternalKeyComparator* internal_comparator; - - bool operator()(FileMetaData* f1, FileMetaData* f2) const { - return internal_comparator->Compare(f1->smallest, f2->smallest) < 0; - } -}; - -Status VersionSet::SortLevel(Version* v, uint64_t level) { - Status result; - BySmallestKey cmp; - cmp.internal_comparator = &icmp_; - std::sort(v->files_[level].begin(), v->files_[level].end(), cmp); - - if (result.ok() && level > 0) { - // There should be no overlap - for (int i = 1; i < v->files_[level].size(); i++) { - const InternalKey& prev_end = v->files_[level][i-1]->largest; - const InternalKey& this_begin = v->files_[level][i]->smallest; - if (icmp_.Compare(prev_end, this_begin) >= 0) { - result = Status::Corruption( - "overlapping ranges in same level", - (EscapeString(prev_end.Encode()) + " vs. 
" + - EscapeString(this_begin.Encode()))); - break; - } - } - } - return result; -} - -int VersionSet::NumLevelFiles(int level) const { - assert(level >= 0); - assert(level < config::kNumLevels); - return current_->files_[level].size(); -} - -uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { - uint64_t result = 0; - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = v->files_[level]; - for (int i = 0; i < files.size(); i++) { - if (icmp_.Compare(files[i]->largest, ikey) <= 0) { - // Entire file is before "ikey", so just add the file size - result += files[i]->file_size; - } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { - // Entire file is after "ikey", so ignore - if (level > 0) { - // Files other than level 0 are sorted by meta->smallest, so - // no further files in this level will contain data for - // "ikey". - break; - } - } else { - // "ikey" falls in the range for this table. Add the - // approximate offset of "ikey" within the table. - Table* tableptr; - Iterator* iter = table_cache_->NewIterator( - ReadOptions(), files[i]->number, files[i]->file_size, &tableptr); - if (tableptr != NULL) { - result += tableptr->ApproximateOffsetOf(ikey.Encode()); - } - delete iter; - } - } - } - - // Add in large value files which are references from internal keys - // stored in the table files - // - // TODO(opt): this is O(# large values in db). If this becomes too slow, - // we could store an auxiliary data structure indexed by internal key - for (LargeValueMap::const_iterator it = large_value_refs_.begin(); - it != large_value_refs_.end(); - ++it) { - const LargeValueRef& lref = it->first; - for (LargeReferencesSet::const_iterator it2 = it->second.begin(); - it2 != it->second.end(); - ++it2) { - if (icmp_.Compare(it2->second, ikey.Encode()) <= 0) { - // Internal key for large value is before our key of interest - result += lref.ValueSize(); - } - } - } - - - return result; -} - -bool VersionSet::RegisterLargeValueRef(const LargeValueRef& large_ref, - uint64_t fnum, - const InternalKey& internal_key) { - LargeReferencesSet* refs = &large_value_refs_[large_ref]; - bool is_first = refs->empty(); - refs->insert(make_pair(fnum, internal_key.Encode().ToString())); - return is_first; -} - -void VersionSet::CleanupLargeValueRefs(const std::set& live_tables) { - for (LargeValueMap::iterator it = large_value_refs_.begin(); - it != large_value_refs_.end(); - ) { - LargeReferencesSet* refs = &it->second; - for (LargeReferencesSet::iterator ref_it = refs->begin(); - ref_it != refs->end(); - ) { - if (ref_it->first != log_number_ && // Not in log file - ref_it->first != prev_log_number_ && // Not in prev log - live_tables.count(ref_it->first) == 0) { // Not in a live table - // No longer live: erase - LargeReferencesSet::iterator to_erase = ref_it; - ++ref_it; - refs->erase(to_erase); - } else { - // Still live: leave this reference alone - ++ref_it; - } - } - if (refs->empty()) { - // No longer any live references to this large value: remove from - // large_value_refs - Log(env_, options_->info_log, "large value is dead: '%s'", - LargeValueRefToFilenameString(it->first).c_str()); - LargeValueMap::iterator to_erase = it; - ++it; - large_value_refs_.erase(to_erase); - } else { - ++it; - } - } -} - -bool VersionSet::LargeValueIsLive(const LargeValueRef& large_ref) { - LargeValueMap::iterator it = large_value_refs_.find(large_ref); - if (it == large_value_refs_.end()) { - return false; - } else { - assert(!it->second.empty()); - return 
true; - } -} - -void VersionSet::MaybeDeleteOldVersions() { - // Note: it is important to delete versions in order since a newer - // version with zero refs may be holding a pointer to a memtable - // that is used by somebody who has a ref on an older version. - while (oldest_ != current_ && oldest_->refs_ == 0) { - Version* next = oldest_->next_; - delete oldest_; - oldest_ = next; - } -} - -void VersionSet::AddLiveFiles(std::set* live) { - for (Version* v = oldest_; v != NULL; v = v->next_) { - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = v->files_[level]; - for (int i = 0; i < files.size(); i++) { - live->insert(files[i]->number); - } - } - } -} - -int64_t VersionSet::NumLevelBytes(int level) const { - assert(level >= 0); - assert(level < config::kNumLevels); - return TotalFileSize(current_->files_[level]); -} - -int64_t VersionSet::MaxNextLevelOverlappingBytes() { - int64_t result = 0; - std::vector overlaps; - for (int level = 0; level < config::kNumLevels - 1; level++) { - for (int i = 0; i < current_->files_[level].size(); i++) { - const FileMetaData* f = current_->files_[level][i]; - GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); - const int64_t sum = TotalFileSize(overlaps); - if (sum > result) { - result = sum; - } - } - } - return result; -} - -// Store in "*inputs" all files in "level" that overlap [begin,end] -void VersionSet::GetOverlappingInputs( - int level, - const InternalKey& begin, - const InternalKey& end, - std::vector* inputs) { - inputs->clear(); - Slice user_begin = begin.user_key(); - Slice user_end = end.user_key(); - const Comparator* user_cmp = icmp_.user_comparator(); - for (int i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 || - user_cmp->Compare(f->smallest.user_key(), user_end) > 0) { - // Either completely before or after range; skip it - } else { - inputs->push_back(f); - } - } -} - -// Stores the minimal range that covers all entries in inputs in -// *smallest, *largest. -// REQUIRES: inputs is not empty -void VersionSet::GetRange(const std::vector& inputs, - InternalKey* smallest, - InternalKey* largest) { - assert(!inputs.empty()); - smallest->Clear(); - largest->Clear(); - for (int i = 0; i < inputs.size(); i++) { - FileMetaData* f = inputs[i]; - if (i == 0) { - *smallest = f->smallest; - *largest = f->largest; - } else { - if (icmp_.Compare(f->smallest, *smallest) < 0) { - *smallest = f->smallest; - } - if (icmp_.Compare(f->largest, *largest) > 0) { - *largest = f->largest; - } - } - } -} - -// Stores the minimal range that covers all entries in inputs1 and inputs2 -// in *smallest, *largest. -// REQUIRES: inputs is not empty -void VersionSet::GetRange2(const std::vector& inputs1, - const std::vector& inputs2, - InternalKey* smallest, - InternalKey* largest) { - std::vector all = inputs1; - all.insert(all.end(), inputs2.begin(), inputs2.end()); - GetRange(all, smallest, largest); -} - -Iterator* VersionSet::MakeInputIterator(Compaction* c) { - ReadOptions options; - options.verify_checksums = options_->paranoid_checks; - options.fill_cache = false; - - // Level-0 files have to be merged together. For other levels, - // we will make a concatenating iterator per level. - // TODO(opt): use concatenating iterator for level-0 if there is no overlap - const int space = (c->level() == 0 ? 
c->inputs_[0].size() + 1 : 2); - Iterator** list = new Iterator*[space]; - int num = 0; - for (int which = 0; which < 2; which++) { - if (!c->inputs_[which].empty()) { - if (c->level() + which == 0) { - const std::vector& files = c->inputs_[which]; - for (int i = 0; i < files.size(); i++) { - list[num++] = table_cache_->NewIterator( - options, files[i]->number, files[i]->file_size); - } - } else { - // Create concatenating iterator for the files from this level - list[num++] = NewTwoLevelIterator( - new Version::LevelFileNumIterator( - c->input_version_, &c->inputs_[which]), - &GetFileIterator, table_cache_, options); - } - } - } - assert(num <= space); - Iterator* result = NewMergingIterator(&icmp_, list, num); - delete[] list; - return result; -} - -Compaction* VersionSet::PickCompaction() { - if (!NeedsCompaction()) { - return NULL; - } - const int level = current_->compaction_level_; - assert(level >= 0); - assert(level+1 < config::kNumLevels); - - Compaction* c = new Compaction(level); - c->input_version_ = current_; - c->input_version_->Ref(); - - // Pick the first file that comes after compact_pointer_[level] - for (int i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (compact_pointer_[level].empty() || - icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { - c->inputs_[0].push_back(f); - break; - } - } - if (c->inputs_[0].empty()) { - // Wrap-around to the beginning of the key space - c->inputs_[0].push_back(current_->files_[level][0]); - } - - // Files in level 0 may overlap each other, so pick up all overlapping ones - if (level == 0) { - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Note that the next call will discard the file we placed in - // c->inputs_[0] earlier and replace it with an overlapping set - // which will include the picked file. - GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); - assert(!c->inputs_[0].empty()); - } - - SetupOtherInputs(c); - - return c; -} - -void VersionSet::SetupOtherInputs(Compaction* c) { - const int level = c->level(); - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - - GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); - - // Get entire range covered by compaction - InternalKey all_start, all_limit; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - - // See if we can grow the number of inputs in "level" without - // changing the number of "level+1" files we pick up. 
- if (!c->inputs_[1].empty()) { - std::vector expanded0; - GetOverlappingInputs(level, all_start, all_limit, &expanded0); - if (expanded0.size() > c->inputs_[0].size()) { - InternalKey new_start, new_limit; - GetRange(expanded0, &new_start, &new_limit); - std::vector expanded1; - GetOverlappingInputs(level+1, new_start, new_limit, &expanded1); - if (expanded1.size() == c->inputs_[1].size()) { - Log(env_, options_->info_log, - "Expanding@%d %d+%d to %d+%d\n", - level, - int(c->inputs_[0].size()), - int(c->inputs_[1].size()), - int(expanded0.size()), - int(expanded1.size())); - smallest = new_start; - largest = new_limit; - c->inputs_[0] = expanded0; - c->inputs_[1] = expanded1; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - } - } - } - - // Compute the set of grandparent files that overlap this compaction - // (parent == level+1; grandparent == level+2) - if (level + 2 < config::kNumLevels) { - GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); - } - - if (false) { - Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", - level, - EscapeString(smallest.Encode()).c_str(), - EscapeString(largest.Encode()).c_str()); - } - - // Update the place where we will do the next compaction for this level. - // We update this immediately instead of waiting for the VersionEdit - // to be applied so that if the compaction fails, we will try a different - // key range next time. - compact_pointer_[level] = largest.Encode().ToString(); - c->edit_.SetCompactPointer(level, largest); -} - -Compaction* VersionSet::CompactRange( - int level, - const InternalKey& begin, - const InternalKey& end) { - std::vector inputs; - GetOverlappingInputs(level, begin, end, &inputs); - if (inputs.empty()) { - return NULL; - } - - Compaction* c = new Compaction(level); - c->input_version_ = current_; - c->input_version_->Ref(); - c->inputs_[0] = inputs; - SetupOtherInputs(c); - return c; -} - -Compaction::Compaction(int level) - : level_(level), - max_output_file_size_(MaxFileSizeForLevel(level)), - input_version_(NULL), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0) { - for (int i = 0; i < config::kNumLevels; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - if (input_version_ != NULL) { - input_version_->Unref(); - } -} - -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. - return (num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (int i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - // Maybe use binary search to find right entry instead of linear search? 
- const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const InternalKey& key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != NULL) { - input_version_->Unref(); - input_version_ = NULL; - } -} - -} diff --git a/db/version_set.h b/db/version_set.h deleted file mode 100644 index e1c5a4b..0000000 --- a/db/version_set.h +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// The representation of a DBImpl consists of a set of Versions. The -// newest version is called "current". Older versions may be kept -// around to provide a consistent view to live iterators. -// -// Each Version keeps track of a set of Table files per level. The -// entire set of versions is maintained in a VersionSet. -// -// Version,VersionSet are thread-compatible, but require external -// synchronization on all accesses. - -#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ -#define STORAGE_LEVELDB_DB_VERSION_SET_H_ - -#include -#include -#include -#include "db/dbformat.h" -#include "db/version_edit.h" -#include "port/port.h" - -namespace leveldb { - -namespace log { class Writer; } - -class Compaction; -class Iterator; -class MemTable; -class TableBuilder; -class TableCache; -class Version; -class VersionSet; -class WritableFile; - -class Version { - public: - // Append to *iters a sequence of iterators that will - // yield the contents of this Version when merged together. - // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, std::vector* iters); - - // Reference count management (so Versions do not disappear out from - // under live iterators) - void Ref(); - void Unref(); - - // Return a human readable string that describes this version's contents. 
- std::string DebugString() const; - - private: - friend class Compaction; - friend class VersionSet; - - class LevelFileNumIterator; - Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; - - VersionSet* vset_; // VersionSet to which this Version belongs - Version* next_; // Next version in linked list - int refs_; // Number of live refs to this version - MemTable* cleanup_mem_; // NULL, or table to delete when version dropped - - // List of files per level - std::vector files_[config::kNumLevels]; - - // Level that should be compacted next and its compaction score. - // Score < 1 means compaction is not strictly needed. These fields - // are initialized by Finalize(). - double compaction_score_; - int compaction_level_; - - explicit Version(VersionSet* vset) - : vset_(vset), next_(NULL), refs_(0), - cleanup_mem_(NULL), - compaction_score_(-1), - compaction_level_(-1) { - } - - ~Version(); - - // No copying allowed - Version(const Version&); - void operator=(const Version&); -}; - -class VersionSet { - public: - VersionSet(const std::string& dbname, - const Options* options, - TableCache* table_cache, - const InternalKeyComparator*); - ~VersionSet(); - - // Apply *edit to the current version to form a new descriptor that - // is both saved to persistent state and installed as the new - // current version. Iff Apply() returns OK, arrange to delete - // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed - // by older versions. - Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem); - - // Recover the last saved descriptor from persistent storage. - Status Recover(); - - // Save current contents to *log - Status WriteSnapshot(log::Writer* log); - - // Return the current version. - Version* current() const { return current_; } - - // Return the current manifest file number - uint64_t ManifestFileNumber() const { return manifest_file_number_; } - - // Allocate and return a new file number - uint64_t NewFileNumber() { return next_file_number_++; } - - // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; - - // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; - - // Return the last sequence number. - uint64_t LastSequence() const { return last_sequence_; } - - // Set the last sequence number to s. - void SetLastSequence(uint64_t s) { - assert(s >= last_sequence_); - last_sequence_ = s; - } - - // Return the current log file number. - uint64_t LogNumber() const { return log_number_; } - - // Return the log file number for the log file that is currently - // being compacted, or zero if there is no such log file. - uint64_t PrevLogNumber() const { return prev_log_number_; } - - // Pick level and inputs for a new compaction. - // Returns NULL if there is no compaction to be done. - // Otherwise returns a pointer to a heap-allocated object that - // describes the compaction. Caller should delete the result. - Compaction* PickCompaction(); - - // Return a compaction object for compacting the range [begin,end] in - // the specified level. Returns NULL if there is nothing in that - // level that overlaps the specified range. Caller should delete - // the result. - Compaction* CompactRange( - int level, - const InternalKey& begin, - const InternalKey& end); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. 
-  int64_t MaxNextLevelOverlappingBytes();
-
-  // Create an iterator that reads over the compaction inputs for "*c".
-  // The caller should delete the iterator when no longer needed.
-  Iterator* MakeInputIterator(Compaction* c);
-
-  // Returns true iff some level needs a compaction.
-  bool NeedsCompaction() const { return current_->compaction_score_ >= 1; }
-
-  // Add all files listed in any live version to *live.
-  // May also mutate some internal state.
-  void AddLiveFiles(std::set<uint64_t>* live);
-
-  // Return the approximate offset in the database of the data for
-  // "key" as of version "v".
-  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
-
-  // Register a reference to a large value with the specified
-  // large_ref from the specified file number. Returns "true" if this
-  // is the first recorded reference to the "large_ref" value in the
-  // database, and false otherwise.
-  bool RegisterLargeValueRef(const LargeValueRef& large_ref,
-                             uint64_t filenum,
-                             const InternalKey& internal_key);
-
-  // Cleanup the large value reference state by eliminating any
-  // references from files that are not included in either "live_tables"
-  // or the current log.
-  void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables);
-
-  // Returns true if a large value with the given reference is live.
-  bool LargeValueIsLive(const LargeValueRef& large_ref);
-
- private:
-  class Builder;
-
-  friend class Compaction;
-  friend class Version;
-
-  Status Finalize(Version* v);
-
-  // Delete any old versions that are no longer needed.
-  void MaybeDeleteOldVersions();
-
-  struct BySmallestKey;
-  Status SortLevel(Version* v, uint64_t level);
-
-  void GetOverlappingInputs(
-      int level,
-      const InternalKey& begin,
-      const InternalKey& end,
-      std::vector<FileMetaData*>* inputs);
-
-  void GetRange(const std::vector<FileMetaData*>& inputs,
-                InternalKey* smallest,
-                InternalKey* largest);
-
-  void GetRange2(const std::vector<FileMetaData*>& inputs1,
-                 const std::vector<FileMetaData*>& inputs2,
-                 InternalKey* smallest,
-                 InternalKey* largest);
-
-  void SetupOtherInputs(Compaction* c);
-
-  Env* const env_;
-  const std::string dbname_;
-  const Options* const options_;
-  TableCache* const table_cache_;
-  const InternalKeyComparator icmp_;
-  uint64_t next_file_number_;
-  uint64_t manifest_file_number_;
-  uint64_t last_sequence_;
-  uint64_t log_number_;
-  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
-
-  // Opened lazily
-  WritableFile* descriptor_file_;
-  log::Writer* descriptor_log_;
-
-  // Versions are kept in a singly linked list that is never empty
-  Version* current_;  // Pointer to the last (newest) list entry
-  Version* oldest_;   // Pointer to the first (oldest) list entry
-
-  // Map from large value reference to the set of <file number, internal key>
-  // values containing references to the value.  We keep the
-  // internal key as a std::string rather than as an InternalKey because
-  // we want to be able to easily use a set.
-  typedef std::set<std::pair<uint64_t, std::string> > LargeReferencesSet;
-  typedef std::map<LargeValueRef, LargeReferencesSet> LargeValueMap;
-  LargeValueMap large_value_refs_;
-
-  // Per-level key at which the next compaction at that level should start.
-  // Either an empty string, or a valid InternalKey.
-  std::string compact_pointer_[config::kNumLevels];
-
-  // No copying allowed
-  VersionSet(const VersionSet&);
-  void operator=(const VersionSet&);
-};
-
-// A Compaction encapsulates information about a compaction.
-class Compaction {
- public:
-  ~Compaction();
-
-  // Return the level that is being compacted.  Inputs from "level"
-  // and "level+1" will be merged to produce a set of "level+1" files.
-  int level() const { return level_; }
-
-  // Return the object that holds the edits to the descriptor done
-  // by this compaction.
-  VersionEdit* edit() { return &edit_; }
-
-  // "which" must be either 0 or 1
-  int num_input_files(int which) const { return inputs_[which].size(); }
-
-  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
-  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
-
-  // Maximum size of files to build during this compaction.
-  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
-
-  // Is this a trivial compaction that can be implemented by just
-  // moving a single input file to the next level (no merging or splitting)
-  bool IsTrivialMove() const;
-
-  // Add all inputs to this compaction as delete operations to *edit.
-  void AddInputDeletions(VersionEdit* edit);
-
-  // Returns true if the information we have available guarantees that
-  // the compaction is producing data in "level+1" for which no data exists
-  // in levels greater than "level+1".
-  bool IsBaseLevelForKey(const Slice& user_key);
-
-  // Returns true iff we should stop building the current output
-  // before processing "key".
-  bool ShouldStopBefore(const InternalKey& key);
-
-  // Release the input version for the compaction, once the compaction
-  // is successful.
-  void ReleaseInputs();
-
- private:
-  friend class Version;
-  friend class VersionSet;
-
-  explicit Compaction(int level);
-
-  int level_;
-  uint64_t max_output_file_size_;
-  Version* input_version_;
-  VersionEdit edit_;
-
-  // Each compaction reads inputs from "level_" and "level_+1"
-  std::vector<FileMetaData*> inputs_[2];  // The two sets of inputs
-
-  // State used to check for number of overlapping grandparent files
-  // (parent == level_ + 1, grandparent == level_ + 2)
-  std::vector<FileMetaData*> grandparents_;
-  int grandparent_index_;     // Index into grandparents_
-  bool seen_key_;             // Some output key has been seen
-  int64_t overlapped_bytes_;  // Bytes of overlap between current output
-                              // and grandparent files
-
-  // State for implementing IsBaseLevelForKey
-
-  // level_ptrs_ holds indices into input_version_->levels_: our state
-  // is that we are positioned at one of the file ranges for each
-  // higher level than the ones involved in this compaction (i.e. for
-  // all L >= level_ + 2).
-  int level_ptrs_[config::kNumLevels];
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_DB_VERSION_SET_H_
diff --git a/db/write_batch.cc b/db/write_batch.cc
deleted file mode 100644
index e84e548..0000000
--- a/db/write_batch.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-// -// WriteBatch::rep_ := -// sequence: fixed64 -// count: fixed32 -// data: record[count] -// record := -// kTypeValue varstring varstring | -// kTypeLargeValueRef varstring varstring | -// kTypeDeletion varstring -// varstring := -// len: varint32 -// data: uint8[len] - -#include "leveldb/write_batch.h" - -#include "leveldb/db.h" -#include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "util/coding.h" - -namespace leveldb { - -WriteBatch::WriteBatch() { - Clear(); -} - -WriteBatch::~WriteBatch() { } - -void WriteBatch::Clear() { - rep_.clear(); - rep_.resize(12); -} - -int WriteBatchInternal::Count(const WriteBatch* b) { - return DecodeFixed32(b->rep_.data() + 8); -} - -void WriteBatchInternal::SetCount(WriteBatch* b, int n) { - EncodeFixed32(&b->rep_[8], n); -} - -SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { - return SequenceNumber(DecodeFixed64(b->rep_.data())); -} - -void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { - EncodeFixed64(&b->rep_[0], seq); -} - -void WriteBatch::Put(const Slice& key, const Slice& value) { - WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeValue)); - PutLengthPrefixedSlice(&rep_, key); - PutLengthPrefixedSlice(&rep_, value); -} - -void WriteBatchInternal::PutLargeValueRef(WriteBatch* b, - const Slice& key, - const LargeValueRef& large_ref) { - WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); - b->rep_.push_back(static_cast(kTypeLargeValueRef)); - PutLengthPrefixedSlice(&b->rep_, key); - PutLengthPrefixedSlice(&b->rep_, - Slice(large_ref.data, sizeof(large_ref.data))); -} - -void WriteBatch::Delete(const Slice& key) { - WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeDeletion)); - PutLengthPrefixedSlice(&rep_, key); -} - -Status WriteBatchInternal::InsertInto(const WriteBatch* b, - MemTable* memtable) { - const int count = WriteBatchInternal::Count(b); - int found = 0; - Iterator it(*b); - for (; !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeDeletion: - memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice()); - break; - case kTypeValue: - memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); - break; - case kTypeLargeValueRef: - memtable->Add(it.sequence_number(), kTypeLargeValueRef, - it.key(), it.value()); - break; - } - found++; - } - if (!it.status().ok()) { - return it.status(); - } else if (found != count) { - return Status::Corruption("wrong count in WriteBatch"); - } - return Status::OK(); -} - -void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { - assert(contents.size() >= 12); - b->rep_.assign(contents.data(), contents.size()); -} - -WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch) - : input_(WriteBatchInternal::Contents(&batch)), - done_(false) { - if (input_.size() < 12) { - done_ = true; - } else { - seq_ = WriteBatchInternal::Sequence(&batch), - input_.remove_prefix(12); - GetNextEntry(); - } -} - -void WriteBatchInternal::Iterator::Next() { - assert(!done_); - seq_++; - GetNextEntry(); -} - -void WriteBatchInternal::Iterator::GetNextEntry() { - if (input_.empty()) { - done_ = true; - return; - } - char tag = input_[0]; - input_.remove_prefix(1); - switch (tag) { - case kTypeValue: - case kTypeLargeValueRef: - if (GetLengthPrefixedSlice(&input_, &key_) && - GetLengthPrefixedSlice(&input_, &value_)) { - op_ = static_cast(tag); - } else { - 
status_ = Status::Corruption("bad WriteBatch Put");
-        done_ = true;
-        input_.clear();
-      }
-      break;
-    case kTypeDeletion:
-      if (GetLengthPrefixedSlice(&input_, &key_)) {
-        op_ = kTypeDeletion;
-      } else {
-        status_ = Status::Corruption("bad WriteBatch Delete");
-        done_ = true;
-        input_.clear();
-      }
-      break;
-    default:
-      status_ = Status::Corruption("unknown WriteBatch tag");
-      done_ = true;
-      input_.clear();
-      break;
-  }
-}
-
-}
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
deleted file mode 100644
index ea28e2d..0000000
--- a/db/write_batch_internal.h
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
-#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
-
-#include "leveldb/write_batch.h"
-
-namespace leveldb {
-
-// WriteBatchInternal provides static methods for manipulating a
-// WriteBatch that we don't want in the public WriteBatch interface.
-class WriteBatchInternal {
- public:
-  static void PutLargeValueRef(WriteBatch* batch,
-                               const Slice& key,
-                               const LargeValueRef& large_ref);
-
-  // Return the number of entries in the batch.
-  static int Count(const WriteBatch* batch);
-
-  // Set the count for the number of entries in the batch.
-  static void SetCount(WriteBatch* batch, int n);
-
-  // Return the sequence number for the start of this batch.
-  static SequenceNumber Sequence(const WriteBatch* batch);
-
-  // Store the specified number as the sequence number for the start of
-  // this batch.
-  static void SetSequence(WriteBatch* batch, SequenceNumber seq);
-
-  static Slice Contents(const WriteBatch* batch) {
-    return Slice(batch->rep_);
-  }
-
-  static size_t ByteSize(const WriteBatch* batch) {
-    return batch->rep_.size();
-  }
-
-  static void SetContents(WriteBatch* batch, const Slice& contents);
-
-  static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
-
-  // Iterate over the contents of a write batch.
-  class Iterator {
-   public:
-    explicit Iterator(const WriteBatch& batch);
-    bool Done() const { return done_; }
-    void Next();
-    ValueType op() const { return op_; }
-    const Slice& key() const { return key_; }
-    const Slice& value() const { return value_; }
-    SequenceNumber sequence_number() const { return seq_; }
-    Status status() const { return status_; }
-
-   private:
-    void GetNextEntry();
-
-    Slice input_;
-    bool done_;
-    ValueType op_;
-    Slice key_;
-    Slice value_;
-    SequenceNumber seq_;
-    Status status_;
-  };
-};
-
-}
-
-
-#endif  // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
deleted file mode 100644
index deb8411..0000000
--- a/db/write_batch_test.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
- -#include "leveldb/db.h" - -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "leveldb/env.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -static std::string PrintContents(WriteBatch* b) { - InternalKeyComparator cmp(BytewiseComparator()); - MemTable mem(cmp); - std::string state; - Status s = WriteBatchInternal::InsertInto(b, &mem); - Iterator* iter = mem.NewIterator(); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ParsedInternalKey ikey; - ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); - switch (ikey.type) { - case kTypeValue: - state.append("Put("); - state.append(ikey.user_key.ToString()); - state.append(", "); - state.append(iter->value().ToString()); - state.append(")"); - break; - case kTypeLargeValueRef: - state.append("PutRef("); - state.append(ikey.user_key.ToString()); - state.append(", "); - state.append(iter->value().ToString()); - state.append(")"); - break; - case kTypeDeletion: - state.append("Delete("); - state.append(ikey.user_key.ToString()); - state.append(")"); - break; - } - state.append("@"); - state.append(NumberToString(ikey.sequence)); - } - delete iter; - if (!s.ok()) { - state.append("ParseError()"); - } - return state; -} - -class WriteBatchTest { }; - -TEST(WriteBatchTest, Empty) { - WriteBatch batch; - ASSERT_EQ("", PrintContents(&batch)); - ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); -} - -TEST(WriteBatchTest, Multiple) { - WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - batch.Put(Slice("baz"), Slice("boo")); - WriteBatchInternal::SetSequence(&batch, 100); - ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); - ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); - ASSERT_EQ("Put(baz, boo)@102" - "Delete(box)@101" - "Put(foo, bar)@100", - PrintContents(&batch)); -} - -TEST(WriteBatchTest, PutIndirect) { - WriteBatch batch; - batch.Put(Slice("baz"), Slice("boo")); - LargeValueRef h; - for (int i = 0; i < LargeValueRef::ByteSize(); i++) { - h.data[i] = (i < 20) ? 
'a' : 'b'; - } - WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), h); - WriteBatchInternal::SetSequence(&batch, 100); - ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); - ASSERT_EQ(2, WriteBatchInternal::Count(&batch)); - ASSERT_EQ("Put(baz, boo)@100" - "PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101", - PrintContents(&batch)); -} - -TEST(WriteBatchTest, Corruption) { - WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - WriteBatchInternal::SetSequence(&batch, 200); - Slice contents = WriteBatchInternal::Contents(&batch); - WriteBatchInternal::SetContents(&batch, - Slice(contents.data(),contents.size()-1)); - ASSERT_EQ("Put(foo, bar)@200" - "ParseError()", - PrintContents(&batch)); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/doc/doc.css b/doc/doc.css deleted file mode 100644 index 700c564..0000000 --- a/doc/doc.css +++ /dev/null @@ -1,89 +0,0 @@ -body { - margin-left: 0.5in; - margin-right: 0.5in; - background: white; - color: black; -} - -h1 { - margin-left: -0.2in; - font-size: 14pt; -} -h2 { - margin-left: -0in; - font-size: 12pt; -} -h3 { - margin-left: -0in; -} -h4 { - margin-left: -0in; -} -hr { - margin-left: -0in; -} - -/* Definition lists: definition term bold */ -dt { - font-weight: bold; -} - -address { - text-align: center; -} -code,samp,var { - color: blue; -} -kbd { - color: #600000; -} -div.note p { - float: right; - width: 3in; - margin-right: 0%; - padding: 1px; - border: 2px solid #6060a0; - background-color: #fffff0; -} - -ul { - margin-top: -0em; - margin-bottom: -0em; -} - -ol { - margin-top: -0em; - margin-bottom: -0em; -} - -UL.nobullets { - list-style-type: none; - list-style-image: none; - margin-left: -1em; -} - -p { - margin: 1em 0 1em 0; - padding: 0 0 0 0; -} - -pre { - line-height: 1.3em; - padding: 0.4em 0 0.8em 0; - margin: 0 0 0 0; - border: 0 0 0 0; - color: blue; -} - -.datatable { - margin-left: auto; - margin-right: auto; - margin-top: 2em; - margin-bottom: 2em; - border: 1px solid; -} - -.datatable td,th { - padding: 0 0.5em 0 0.5em; - text-align: right; -} diff --git a/doc/impl.html b/doc/impl.html deleted file mode 100644 index b190d2c..0000000 --- a/doc/impl.html +++ /dev/null @@ -1,228 +0,0 @@ - - - - -Leveldb file layout and compactions - - - - -

Files

- -The implementation of leveldb is similar in spirit to the -representation of a single Bigtable tablet (section 5.3). -However, the organization of the files that make up the representation -is somewhat different and is explained below. - 

-Each database is represented by a set of files stored in a directory. -There are several different types of files as documented below: -

-

Log files

-

-A log file (*.log) stores a sequence of recent updates. Each update -is appended to the current log file. When the log file reaches a -pre-determined size (approximately 1MB by default), it is converted -to a sorted table (see below) and a new log file is created for future -updates. -

-A copy of the current log file is kept in an in-memory structure (the -memtable). This copy is consulted on every read so that read -operations reflect all logged updates. -

-

Sorted tables

-

-A sorted table (*.sst) stores a sequence of entries sorted by key. -Each entry is either a value for the key, or a deletion marker for the -key. (Deletion markers are kept around to hide obsolete values -present in older sorted tables). -

-The set of sorted tables is organized into a sequence of levels. The -sorted table generated from a log file is placed in a special young -level (also called level-0). When the number of young files exceeds a -certain threshold (currently four), all of the young files are merged -together with all of the overlapping level-1 files to produce a -sequence of new level-1 files (we create a new level-1 file for every -2MB of data). -

-Files in the young level may contain overlapping keys. However files -in other levels have distinct non-overlapping key ranges. Consider -level number L where L >= 1. When the combined size of files in -level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2, -...), one file in level-L, and all of the overlapping files in -level-(L+1) are merged to form a set of new files for level-(L+1). -These merges have the effect of gradually migrating new updates from -the young level to the largest level using only bulk reads and writes -(i.e., minimizing expensive seeks). - -
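The 10^L MB rule is easy to capture in a helper. A minimal sketch follows; the deleted version_set.cc refers to a MaxBytesForLevel() helper, and the body below is a plausible shape for it rather than the exact deleted code:

  static double MaxBytesForLevel(int level) {
    double result = 10 * 1048576.0;  // level-1 is allowed 10MB
    while (level > 1) {
      result *= 10;                  // each deeper level is allowed 10x more
      level--;
    }
    return result;
  }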

Large value files

-

-Each large value (greater than 64KB by default) is placed in a large -value file (*.val) of its own. An entry is maintained in the log -and/or sorted tables that maps from the corresponding key to the -name of this large value file. The name of the large value file -is derived from a SHA1 hash of the value and its length so that -identical values share the same file. -
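As an illustration only, here is a hypothetical helper matching that description; the real encoding lives in the deleted filename.cc and dbformat.cc and may differ:

  // Hypothetical sketch: hex SHA1 of the value plus the value's length.
  std::string LargeValueFileName(const std::string& sha1_hex,
                                 uint64_t value_len) {
    char buf[32];
    snprintf(buf, sizeof(buf), "-%llu.val",
             static_cast<unsigned long long>(value_len));
    return sha1_hex + buf;  // e.g. "da39a3...-65536.val"
  }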

-

Manifest

-

-A MANIFEST file lists the set of sorted tables that make up each -level, the corresponding key ranges, and other important metadata. -A new MANIFEST file (with a new number embedded in the file name) -is created whenever the database is reopened. The MANIFEST file is -formatted as a log, and changes made to the serving state (as files -are added or removed) are appended to this log. -

-

Current

-

-CURRENT is a simple text file that contains the name of the latest -MANIFEST file. -
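Condensed from VersionSet::Recover() in the deleted version_set.cc above, the CURRENT handshake is just:

  std::string current;
  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
  if (s.ok() && (current.empty() || current[current.size()-1] != '\n')) {
    s = Status::Corruption("CURRENT file does not end with newline");
  }
  if (s.ok()) {
    current.resize(current.size() - 1);             // drop the trailing '\n'
    std::string dscname = dbname_ + "/" + current;  // manifest to replay
  }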

-

Info logs

-

-Informational messages are printed to files named LOG and LOG.old. -

-

Others

-

-Other files used for miscellaneous purposes may also be present -(LOCK, *.dbtmp). - -

Level 0

-When the log file grows above a certain size (1MB by default): -
    -
  • Write the contents of the current memtable to an sstable -
  • Replace the current memtable by a brand new empty memtable -
  • Switch to a new log file -
  • Delete the old log file and the old memtable -
-Experimental measurements show that generating an sstable from a 1MB -log file takes ~12ms, which seems like an acceptable latency hiccup to -add infrequently to a log write. - -
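A toy model of those four steps, with standard-library types standing in for the real memtable, log, and table classes (the actual logic is in the deleted db/db_impl.cc):

  #include <cstdio>
  #include <fstream>
  #include <map>
  #include <string>

  struct ToyDB {
    std::map<std::string, std::string> mem;  // stands in for the memtable
    std::ofstream log;                       // stands in for the current log
    int next;
    ToyDB() : log("000001.log"), next(2) { }

    void SwitchAtThreshold() {
      char name[32];
      snprintf(name, sizeof(name), "%06d.sst", next);
      std::ofstream sst(name);               // (1) dump memtable as a table
      for (std::map<std::string, std::string>::const_iterator it = mem.begin();
           it != mem.end(); ++it) {
        sst << it->first << '\t' << it->second << '\n';
      }
      mem.clear();                           // (2) brand new empty memtable
      log.close();                           // (3) switch to a new log file
      snprintf(name, sizeof(name), "%06d.log", ++next);
      log.open(name);
      // (4) the old log file can now be removed
    }
  };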

-The new sstable is added to a special level-0 level. level-0 contains -a set of files (up to 4 by default). However unlike other levels, -these files do not cover disjoint ranges, but may overlap each other. - -

Compactions

- -

-When the size of level L exceeds its limit, we compact it in a -background thread. The compaction picks a file from level L and all -overlapping files from the next level L+1. Note that if a level-L -file overlaps only part of a level-(L+1) file, the entire file at -level-(L+1) is used as an input to the compaction and will be -discarded after the compaction. Aside: because level-0 is special -(files in it may overlap each other), we treat compactions from -level-0 to level-1 specially: a level-0 compaction may pick more than -one level-0 file in case some of these files overlap each other. - -

-A compaction merges the contents of the picked files to produce a -sequence of level-(L+1) files. We switch to producing a new -level-(L+1) file after the current output file has reached the target -file size (2MB). We also switch to a new output file when the key -range of the current output file has grown enough to overlap more than -ten level-(L+2) files. This last rule ensures that a later compaction -of a level-(L+1) file will not pick up too much data from level-(L+2). - 

-The old files are discarded and the new files are added to the serving -state. - -

-Compactions for a particular level rotate through the key space. In -more detail, for each level L, we remember the ending key of the last -compaction at level L. The next compaction for level L will pick the -first file that starts after this key (wrapping around to the -beginning of the key space if there is no such file). - -

-Compactions drop overwritten values. They also drop deletion markers -if there are no higher numbered levels that contain a file whose range -overlaps the current key. - -

Timing

- -Level-0 compactions will read up to four 1MB files from level-0, and -at worst all the level-1 files (10MB). I.e., we will read 14MB and -write 14MB. - -

-Other than the special level-0 compactions, we will pick one 2MB file -from level L. In the worst case, this will overlap ~ 12 files from -level L+1 (10 because level-(L+1) is ten times the size of level-L, -and another two at the boundaries since the file ranges at level-L -will usually not be aligned with the file ranges at level-L+1). The -compaction will therefore read 26MB and write 26MB. Assuming a disk -IO rate of 100MB/s (ballpark range for modern drives), the worst -compaction cost will be approximately 0.5 second. - -
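Spelling that estimate out:

  read  = 2MB + ~12 * 2MB          = ~26MB  (inputs from levels L and L+1)
  write = ~26MB                             (the merged level-(L+1) outputs)
  time  = (26MB + 26MB) / 100MB/s  = ~0.5s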

-If we throttle the background writing to something small, say 10% of -the full 100MB/s speed, a compaction may take up to 5 seconds. If the -user is writing at 10MB/s, we might build up lots of level-0 files -(~50 to hold the 5*10MB). This may significantly increase the cost of -reads due to the overhead of merging more files together on every -read. - 

-Solution 1: To reduce this problem, we might want to increase the log -switching threshold when the number of level-0 files is large. Though -the downside is that the larger this threshold, the larger the delay -that we will add to write latency when a write triggers a log switch. - -

-Solution 2: We might want to decrease write rate artificially when the -number of level-0 files goes up. - -

-Solution 3: We could work on reducing the cost of very wide merges. -Perhaps most of the level-0 files will have their blocks sitting -uncompressed in the cache and we will only need to worry about the -O(N) complexity in the merging iterator. - 

Number of files

- -Instead of always making 2MB files, we could make larger files for -larger levels to reduce the total file count, though at the expense of -more bursty compactions. Alternatively, we could shard the set of -files into multiple directories. - -

-An experiment on an ext3 filesystem on Feb 04, 2011 shows -the following timings to do 100K file opens in directories with -varying number of files: - - - - - -
  Files in directory | Microseconds to open a file
  ------------------ | ---------------------------
  1000               | 9
  10000              | 10
  100000             | 16
-So maybe even the sharding is not necessary on modern filesystems? - -

Recovery

- -
    -
  • Read CURRENT to find name of the latest committed MANIFEST -
  • Read the named MANIFEST file -
  • Clean up stale files -
  • We could open all sstables here, but it is probably better to be lazy... -
  • Convert log chunk to a new level-0 sstable -
  • Start directing new writes to a new log file with recovered sequence# -
- -

Garbage collection of files

- -DeleteObsoleteFiles() is called at the end of every -compaction and at the end of recovery. It finds the names of all -files in the database. It deletes all log files that are not the -current log file. It deletes all table files that are not referenced -from some level and are not the output of an active compaction. It -deletes all large value files that are not referenced from any live -table or log file. - - - diff --git a/doc/index.html b/doc/index.html deleted file mode 100644 index 2a83fc3..0000000 --- a/doc/index.html +++ /dev/null @@ -1,509 +0,0 @@ - - - - -Leveldb - - - -

Leveldb

-
Jeff Dean, Sanjay Ghemawat
-

-The leveldb library provides a persistent key value store. Keys and -values are arbitrary byte arrays. The keys are ordered within the key -value store according to a user-specified comparator function. - -

-

Opening A Database

-

-A leveldb database has a name which corresponds to a file system -directory. All of the contents of the database are stored in this -directory. The following example shows how to open a database, -creating it if necessary: -

-

-  #include <assert>
-  #include "leveldb/include/db.h"
-
-  leveldb::DB* db;
-  leveldb::Options options;
-  options.create_if_missing = true;
-  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
-  assert(status.ok());
-  ...
-
-If you want to raise an error if the database already exists, add -the following line before the leveldb::DB::Open call: -
-  options.error_if_exists = true;
-
-

Status

-

-You may have noticed the leveldb::Status type above. Values of this -type are returned by most functions in leveldb that may encounter an -error. You can check if such a result is ok, and also print an -associated error message: -

-

-   leveldb::Status s = ...;
-   if (!s.ok()) cerr << s.ToString() << endl;
-
-

Closing A Database

-

-When you are done with a database, just delete the database object. -Example: -

-

-  ... open the db as described above ...
-  ... do something with db ...
-  delete db;
-
-

Reads And Writes

-

-The database provides Put, Delete, and Get methods to -modify/query the database. For example, the following code -moves the value stored under key1 to key2. -

-  std::string value;
-  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
-  if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
-  if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
-
- -

Atomic Updates

-

-Note that if the process dies after the Put of key2 but before the -delete of key1, the same value may be left stored under multiple keys. -Such problems can be avoided by using the WriteBatch class to -atomically apply a set of updates: -

-

-  #include "leveldb/include/write_batch.h"
-  ...
-  std::string value;
-  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
-  if (s.ok()) {
-    leveldb::WriteBatch batch;
-    batch.Delete(key1);
-    batch.Put(key2, value);
-    s = db->Write(leveldb::WriteOptions(), &batch);
-  }
-
-The WriteBatch holds a sequence of edits to be made to the database, -and these edits within the batch are applied in order. Note that we -called Delete before Put so that if key1 is identical to key2, -we do not end up erroneously dropping the value entirely. -

-Apart from its atomicity benefits, WriteBatch may also be used to -speed up bulk updates by placing lots of individual mutations into the -same batch. - -
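For example, a bulk load might flush a batch every thousand mutations. In this sketch, NextRecord() and kPerBatch are placeholders for the application's input source and tuning, not leveldb APIs:

  const int kPerBatch = 1000;
  leveldb::WriteBatch batch;
  int pending = 0;
  std::string key, value;
  while (NextRecord(&key, &value)) {   // hypothetical input source
    batch.Put(key, value);
    if (++pending == kPerBatch) {
      leveldb::Status s = db->Write(leveldb::WriteOptions(), &batch);
      if (!s.ok()) break;              // stop and surface the error
      batch.Clear();
      pending = 0;
    }
  }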

Synchronous Writes

-By default, each write to leveldb is asynchronous: it -returns after pushing the write from the process into the operating -system. The transfer from operating system memory to the underlying -persistent storage happens asynchronously. The sync flag -can be turned on for a particular write to make the write operation -not return until the data being written has been pushed all the way to -persistent storage. (On Posix systems, this is implemented by calling -either fsync(...) or fdatasync(...) or -msync(..., MS_SYNC) before the write operation returns.) -
-  leveldb::WriteOptions write_options;
-  write_options.sync = true;
-  db->Put(write_options, ...);
-
-Asynchronous writes are often more than a thousand times as fast as -synchronous writes. The downside of asynchronous writes is that a -crash of the machine may cause the last few updates to be lost. Note -that a crash of just the writing process (i.e., not a reboot) will not -cause any loss since even when sync is false, an update -is pushed from the process memory into the operating system before it -is considered done. - -

-Asynchronous writes can often be used safely. For example, when -loading a large amount of data into the database you can handle lost -updates by restarting the bulk load after a crash. A hybrid scheme is -also possible where every Nth write is synchronous, and in the event -of a crash, the bulk load is restarted just after the last synchronous -write finished by the previous run. (The synchronous write can update -a marker that describes where to restart on a crash.) - -
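A sketch of that hybrid scheme; kSyncEvery, NextRecord(), and the progress key are placeholders:

  leveldb::WriteOptions async_opts;    // sync is false by default
  leveldb::WriteOptions sync_opts;
  sync_opts.sync = true;
  const int kSyncEvery = 1000;
  std::string key, value;
  for (int i = 1; NextRecord(&key, &value); i++) {
    db->Put(async_opts, key, value);
    if (i % kSyncEvery == 0) {
      // Persist a marker recording how far the load has gotten.
      db->Put(sync_opts, "bulk-load-progress", key);
    }
  }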

-WriteBatch provides an alternative to asynchronous writes. -Multiple updates may be placed in the same WriteBatch and -applied together using a synchronous write (i.e., -write_options.sync is set to true). The extra cost of -the synchronous write will be amortized across all of the writes in -the batch. - -

-

Concurrency

-

-A database may only be opened by one process at a time. The leveldb -implementation acquires a lock from the operating system to prevent -misuse. Within a single process, the same leveldb::DB object may -be safely used by multiple concurrent threads. -

-

Iteration

-

-The following example demonstrates how to print all key,value pairs -in a database. -

-

-  leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
-  for (it->SeekToFirst(); it->Valid(); it->Next()) {
-    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
-  }
-  assert(it->status().ok());  // Check for any errors found during the scan
-  delete it;
-
-The following variation shows how to process just the keys in the -range [start,limit): -

-

-  for (it->Seek(start);
-       it->Valid() && it->key().ToString() < limit;
-       it->Next()) {
-    ...
-  }
-
-You can also process entries in reverse order. (Caveat: reverse -iteration may be somewhat slower than forward iteration.) -

-

-  for (it->SeekToLast(); it->Valid(); it->Prev()) {
-    ...
-  }
-
-

Snapshots

-

-Snapshots provide consistent read-only views over the entire state of -the key-value store. ReadOptions::snapshot may be non-NULL to indicate -that a read should operate on a particular version of the DB state. -If ReadOptions::snapshot is NULL, the read will operate on an -implicit snapshot of the current state. -

-Snapshots typically are created by the DB::GetSnapshot() method: -

-

-  leveldb::ReadOptions options;
-  options.snapshot = db->GetSnapshot();
-  ... apply some updates to db ...
-  leveldb::Iterator* iter = db->NewIterator(options);
-  ... read using iter to view the state when the snapshot was created ...
-  delete iter;
-  db->ReleaseSnapshot(options.snapshot);
-
-Note that when a snapshot is no longer needed, it should be released -using the DB::ReleaseSnapshot interface. This allows the -implementation to get rid of state that was being maintained just to -support reading as of that snapshot. -

-A Write operation can also return a snapshot that -represents the state of the database just after applying a particular -set of updates: -

-

-  leveldb::Snapshot* snapshot;
-  leveldb::WriteOptions write_options;
-  write_options.post_write_snapshot = &snapshot;
-  leveldb::Status status = db->Write(write_options, ...);
-  ... perform other mutations to db ...
-
-  leveldb::ReadOptions read_options;
-  read_options.snapshot = snapshot;
-  leveldb::Iterator* iter = db->NewIterator(read_options);
-  ... read as of the state just after the Write call returned ...
-  delete iter;
-
-  db->ReleaseSnapshot(snapshot);
-
-

Slice

-

-The return value of the it->key() and it->value() calls above -are instances of the leveldb::Slice type. Slice is a simple -structure that contains a length and a pointer to an external byte -array. Returning a Slice is a cheaper alternative to returning a -std::string since we do not need to copy potentially large keys and -values. In addition, leveldb methods do not return null-terminated -C-style strings since leveldb keys and values are allowed to -contain '\0' bytes. -

-C++ strings and null-terminated C-style strings can be easily converted -to a Slice: -

-

-   leveldb::Slice s1 = "hello";
-
-   std::string str("world");
-   leveldb::Slice s2 = str;
-
-A Slice can be easily converted back to a C++ string: -
-   std::string str = s1.ToString();
-   assert(str == std::string("hello"));
-
-Be careful when using Slices since it is up to the caller to ensure that -the external byte array into which the Slice points remains live while -the Slice is in use. For example, the following is buggy: -

-

-   leveldb::Slice slice;
-   if (...) {
-     std::string str = ...;
-     slice = str;
-   }
-   Use(slice);
-
-When control leaves the if block, str will be destroyed and the -backing storage for slice will disappear. -
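-
-A safe variant keeps the backing bytes alive for as long as the Slice
-is used, for example by copying them into storage that outlives the
-slice (Use is the same placeholder function as above):
-
-   std::string owned;
-   if (...) {
-     std::string str = ...;
-     owned = str;                 // copy the bytes out of the short-lived string
-   }
-   leveldb::Slice slice = owned;  // owned outlives every use of slice
-   Use(slice);
-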

-

Comparators

-

-The preceding examples used the default ordering function for keys, -which orders bytes lexicographically. You can however supply a custom -comparator when opening a database. For example, suppose each -database key consists of two numbers and we want to sort by the first -number, breaking ties by the second number. First, define a proper -subclass of leveldb::Comparator that expresses these rules: -

-

-  class TwoPartComparator : public leveldb::Comparator {
-   public:
-    // Three-way comparison function:
-    //   if a < b: negative result
-    //   if a > b: positive result
-    //   else: zero result
-    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
-      int a1, a2, b1, b2;
-      ParseKey(a, &a1, &a2);
-      ParseKey(b, &b1, &b2);
-      if (a1 < b1) return -1;
-      if (a1 > b1) return +1;
-      if (a2 < b2) return -1;
-      if (a2 > b2) return +1;
-      return 0;
-    }
-
-    // Ignore the following methods for now:
-    const char* Name() const { return "TwoPartComparator"; }
-    void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
-    void FindShortSuccessor(std::string*) const { }
-  };
-
-Now create a database using this custom comparator: -

-

-  TwoPartComparator cmp;
-  leveldb::DB* db;
-  leveldb::Options options;
-  options.create_if_missing = true;
-  options.comparator = &cmp;
-  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
-  ...
-
-

Backwards compatibility

-

-The result of the comparator's Name method is attached to the -database when it is created, and is checked on every subsequent -database open. If the name changes, the leveldb::DB::Open call will -fail. Therefore, change the name if and only if the new key format -and comparison function are incompatible with existing databases, and -it is ok to discard the contents of all existing databases. -

-You can however still gradually evolve your key format over time with -a little bit of pre-planning. For example, you could store a version -number at the end of each key (one byte should suffice for most uses). -When you wish to switch to a new key format (e.g., adding an optional -third part to the keys processed by TwoPartComparator), -(a) keep the same comparator name (b) increment the version number -for new keys (c) change the comparator function so it uses the -version numbers found in the keys to decide how to interpret them. -
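-
-A sketch of step (c) might look like the following
-(CompareTwoPartKeys and CompareThreePartKeys are hypothetical helper
-functions, not part of leveldb, and the version byte is assumed to be
-stored at the end of each key):
-
-  int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
-    unsigned char va = a[a.size() - 1];   // version byte of key a
-    unsigned char vb = b[b.size() - 1];   // version byte of key b
-    if (va >= 2 || vb >= 2) {
-      return CompareThreePartKeys(a, b);  // new three-part format
-    }
-    return CompareTwoPartKeys(a, b);      // original two-part format
-  }
-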

-

Performance

-

-Performance can be tuned by changing the default values of the -types defined in include/leveldb/options.h. - -

-

Block size

-

-leveldb groups adjacent keys together into the same block and such a -block is the unit of transfer to and from persistent storage. The -default block size is approximately 4096 uncompressed bytes. -Applications that mostly do bulk scans over the contents of the -database may wish to increase this size. Applications that do a lot -of point reads of small values may wish to switch to a smaller block -size if performance measurements indicate an improvement. There isn't -much benefit in using blocks smaller than one kilobyte, or larger than -a few megabytes. Also note that compression will be more effective -with larger block sizes. -
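-
-For example, an application dominated by bulk scans might try a larger
-block size (the value below is only illustrative; pick one based on
-measurements):
-
-  leveldb::DB* db;
-  leveldb::Options options;
-  options.block_size = 64 * 1024;   // 64K blocks instead of the default ~4K
-  leveldb::DB::Open(options, "/tmp/testdb", &db);
-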

-

Compression

-

-Each block is individually compressed before being written to -persistent storage. Compression is on by default since the default -compression method is very fast, and is automatically disabled for -uncompressible data. In rare cases, applications may want to disable -compression entirely, but should only do so if benchmarks show a -performance improvement: -

-

-  leveldb::Options options;
-  options.compression = leveldb::kNoCompression;
-  ... leveldb::DB::Open(options, name, ...) ....
-
-

Cache

-

-The contents of the database are stored in a set of files in the -filesystem and each file stores a sequence of compressed blocks. If -options.cache is non-NULL, it is used to cache frequently used -uncompressed block contents. -

-

-  #include "leveldb/cache.h"
-
-  leveldb::Options options;
-  options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
-  leveldb::DB* db;
-  leveldb::DB::Open(options, name, &db);
-  ... use the db ...
-  delete db;
-  delete options.cache;
-
-Note that the cache holds uncompressed data, and therefore it should -be sized according to application level data sizes, without any -reduction from compression. (Caching of compressed blocks is left to -the operating system buffer cache, or any custom Env -implementation provided by the client.) -

-When performing a bulk read, the application may wish to disable -caching so that the data processed by the bulk read does not end up -displacing most of the cached contents. A per-iterator option can be -used to achieve this: -

-

-  leveldb::ReadOptions options;
-  options.fill_cache = false;
-  leveldb::Iterator* it = db->NewIterator(options);
-  for (it->SeekToFirst(); it->Valid(); it->Next()) {
-    ...
-  }
-
-

Key Layout

-

-Note that the unit of disk transfer and caching is a block. Adjacent -keys (according to the database sort order) will usually be placed in -the same block. Therefore the application can improve its performance -by placing keys that are accessed together near each other and placing -infrequently used keys in a separate region of the key space. -

-For example, suppose we are implementing a simple file system on top -of leveldb. The types of entries we might wish to store are: -

-

-   filename -> permission-bits, length, list of file_block_ids
-   file_block_id -> data
-
-We might want to prefix filename keys with one letter (say '/') and the -file_block_id keys with a different letter (say '0') so that scans -over just the metadata do not force us to fetch and cache bulky file -contents. -
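-
-With that layout, a scan over just the metadata becomes an ordinary
-range scan over the '/' prefix, for example:
-
-  leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
-  for (it->Seek("/"); it->Valid() && it->key().starts_with("/"); it->Next()) {
-    ... examine filename entries only; no file blocks are read ...
-  }
-  delete it;
-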

-

Large Values

-

-leveldb has special treatment of large values (by default, a value -of length greater than or equal to 64K is considered large, though a -field in Options can be used to adjust this threshold). Each such -large value is placed in a separate operating system file, and the -normal database blocks just contain pointers to such files. -

-Furthermore, if the same large value occurs multiple times in a single -database, it will be stored just once. -
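-
-The threshold is the large_value_threshold field in Options; raising
-it might look like the following (the value shown is illustrative
-only, and the default is recommended):
-
-  leveldb::DB* db;
-  leveldb::Options options;
-  options.large_value_threshold = 256 * 1024;  // treat values >= 256K as large
-  leveldb::DB::Open(options, "/tmp/testdb", &db);
-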

-

Checksums

-

-leveldb associates checksums with all data it stores in the file system. -There are two separate controls provided over how aggressively these -checksums are verified: -

-

    -
  • ReadOptions::verify_checksums may be set to true to force - checksum verification of all data that is read from the file system on - behalf of a particular read. By default, no such verification is - done. -

    -

  • Options::paranoid_checks may be set to true before opening a - database to make the database implementation raise an error as soon as - it detects an internal corruption. Depending on which portion of the - database has been corrupted, the error may be raised when the database - is opened, or later by another database operation. By default, - paranoid checking is off so that the database can be used even if - parts of its persistent storage have been corrupted. -

    - If a database is corrupted (perhaps it cannot be opened when - paranoid checking is turned on), the leveldb::RepairDB function - may be used to recover as much of the data as possible; see the - sketch below. -

    -
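-
-The sketch below combines the two controls: reads verify checksums,
-and a corrupted database is handed to leveldb::RepairDB (illustrative
-only; key and name are placeholders, and a real application would
-close the database before repairing it):
-
-  leveldb::ReadOptions read_options;
-  read_options.verify_checksums = true;
-  std::string value;
-  leveldb::Status s = db->Get(read_options, key, &value);
-  if (!s.ok() && !s.IsNotFound()) {
-    ... close db, then call leveldb::RepairDB(name, leveldb::Options()) ...
-  }
-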

-

Approximate Sizes

-

-The GetApproximateSizes method can be used to get the approximate -number of bytes of file system space used by one or more key ranges. -

-

-   leveldb::Range ranges[2];
-   ranges[0] = leveldb::Range("a", "c");
-   ranges[1] = leveldb::Range("x", "z");
-   uint64_t sizes[2];
-   db->GetApproximateSizes(ranges, 2, sizes);
-
-The preceding call will set sizes[0] to the approximate number of -bytes of file system space used by the key range [a..c) and -sizes[1] to the approximate number of bytes used by the key range -[x..z). -

-

Environment

-

-All file operations (and other operating system calls) issued by the -leveldb implementation are routed through a leveldb::Env object. -Sophisticated clients may wish to provide their own Env -implementation to get better control. For example, an application may -introduce artificial delays in the file IO paths to limit the impact -of leveldb on other activities in the system. -

-

-  class SlowEnv : public leveldb::Env {
-    .. implementation of the Env interface ...
-  };
-
-  SlowEnv env;
-  leveldb::Options options;
-  options.env = &env;
-  Status s = leveldb::DB::Open(options, ...);
-
-

Porting

-

-leveldb may be ported to a new platform by providing platform -specific implementations of the types/methods/functions exported by -leveldb/port/port.h. See leveldb/port/port_example.h for more -details. -

-In addition, the new platform may need a new default leveldb::Env -implementation. See leveldb/util/env_posix.cc for an example. - -

Other Information

- -

-Details about the leveldb implementation may be found in -the following documents: -

-   doc/impl.html         (implementation notes)
-   doc/table_format.txt  (format of an immutable Table file)
-   doc/log_format.txt    (format of a log file)
diff --git a/doc/log_format.txt b/doc/log_format.txt
deleted file mode 100644
index 3a0414b..0000000
--- a/doc/log_format.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-The log file contents are a sequence of 32KB blocks.  The only
-exception is that the tail of the file may contain a partial block.
-
-Each block consists of a sequence of records:
-   block := record* trailer?
-   record :=
-      checksum: uint32  // crc32c of type and data[]
-      length: uint16
-      type: uint8       // One of FULL, FIRST, MIDDLE, LAST
-      data: uint8[length]
-
-A record never starts within the last six bytes of a block (since it
-won't fit).  Any leftover bytes here form the trailer, which must
-consist entirely of zero bytes and must be skipped by readers.
-
-Aside: if exactly seven bytes are left in the current block, and a new
-non-zero length record is added, the writer must emit a FIRST record
-(which contains zero bytes of user data) to fill up the trailing seven
-bytes of the block and then emit all of the user data in subsequent
-blocks.
-
-More types may be added in the future.  Some readers may skip record
-types they do not understand, others may report that some data was
-skipped.
-
-FULL == 1
-FIRST == 2
-MIDDLE == 3
-LAST == 4
-
-The FULL record contains the contents of an entire user record.
-
-FIRST, MIDDLE, LAST are types used for user records that have been
-split into multiple fragments (typically because of block boundaries).
-FIRST is the type of the first fragment of a user record, LAST is the
-type of the last fragment of a user record, and MIDDLE is the type of
-all interior fragments of a user record.
-
-Example: consider a sequence of user records:
-   A: length 1000
-   B: length 97270
-   C: length 8000
-A will be stored as a FULL record in the first block.
-
-B will be split into three fragments: the first fragment occupies the
-rest of the first block, the second fragment occupies the entirety of
-the second block, and the third fragment occupies a prefix of the
-third block.  This will leave six bytes free in the third block, which
-will be left empty as the trailer.
-
-C will be stored as a FULL record in the fourth block.
-
-===================
-
-Some benefits over the recordio format:
-
-(1) We do not need any heuristics for resyncing - just go to the next
-block boundary and scan.  If there is a corruption, skip to the next
-block.  As a side-benefit, we do not get confused when part of the
-contents of one log file are embedded as a record inside another log
-file.
-
-(2) Splitting at approximate boundaries (e.g., for mapreduce) is
-simple: find the next block boundary and skip records until we
-hit a FULL or FIRST record.
-
-(3) We do not need extra buffering for large records.
-
-Some downsides compared to the recordio format:
-
-(1) No packing of tiny records.  This could be fixed by adding a new
-record type, so it is a shortcoming of the current implementation,
-not necessarily the format.
-
-(2) No compression.  Again, this could be fixed by adding new record types.
diff --git a/doc/table_format.txt b/doc/table_format.txt
deleted file mode 100644
index ad5aa4b..0000000
--- a/doc/table_format.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-File format
-===========
-
-  [data block 1]
-  [data block 2]
-  ...
-  [data block N]
-  [meta block 1]
-  ...
-  [meta block K]
-  [metaindex block]
-  [index block]
-  [Footer]        (fixed size; starts at file_size - sizeof(Footer))
-
-The file contains internal pointers.
-Each such pointer is called a BlockHandle and contains the following
-information:
-  offset:   varint64
-  size:     varint64
-
-(1) The sequence of key/value pairs in the file is stored in sorted
-order and partitioned into a sequence of data blocks.  These blocks
-come one after another at the beginning of the file.  Each data block
-is formatted according to the code in block_builder.cc, and then
-optionally compressed.
-
-(2) After the data blocks we store a bunch of meta blocks.  The
-supported meta block types are described below.  More meta block types
-may be added in the future.  Each meta block is again formatted using
-block_builder.cc and then optionally compressed.
-
-(3) A "metaindex" block.  It contains one entry for every other meta
-block where the key is the name of the meta block and the value is a
-BlockHandle pointing to that meta block.
-
-(4) An "index" block.  This block contains one entry per data block,
-where the key is a string >= last key in that data block and before
-the first key in the successive data block.  The value is the
-BlockHandle for the data block.
-
-(5) At the very end of the file is a fixed length footer that contains
-the BlockHandle of the metaindex and index blocks as well as a magic number.
-   metaindex_handle: char[p];       // Block handle for metaindex
-   index_handle:     char[q];       // Block handle for index
-   padding:          char[40-p-q];  // 0 bytes to make fixed length
-                                    // (40==2*BlockHandle::kMaxEncodedLength)
-   magic:            fixed64;       // == 0xdb4775248b80fb57
-
-"stats" Meta Block
-------------------
-
-This meta block contains a bunch of stats.  The key is the name
-of the statistic.  The value contains the statistic.
-TODO(postrelease): record following stats.
-  data size
-  index size
-  key size (uncompressed)
-  value size (uncompressed)
-  number of entries
-  number of data blocks
diff --git a/include/leveldb/cache.h b/include/leveldb/cache.h
deleted file mode 100644
index 79196d1..0000000
--- a/include/leveldb/cache.h
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// A Cache is an interface that maps keys to values.  It has internal
-// synchronization and may be safely accessed concurrently from
-// multiple threads.  It may automatically evict entries to make room
-// for new entries.  Values have a specified charge against the cache
-// capacity.  For example, a cache where the values are variable
-// length strings may use the length of the string as the charge for
-// the string.
-//
-// A builtin cache implementation with a least-recently-used eviction
-// policy is provided.  Clients may use their own implementations if
-// they want something more sophisticated (like scan-resistance, a
-// custom eviction policy, variable cache sizing, etc.)
-
-#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_
-#define STORAGE_LEVELDB_INCLUDE_CACHE_H_
-
-#include <stdint.h>
-#include "leveldb/slice.h"
-
-namespace leveldb {
-
-class Cache;
-
-// Create a new cache with a fixed size capacity.  This implementation
-// of Cache uses a least-recently-used eviction policy.
-extern Cache* NewLRUCache(size_t capacity);
-
-class Cache {
- public:
-  Cache() { }
-
-  // Destroys all existing entries by calling the "deleter"
-  // function that was passed to the constructor.
-  virtual ~Cache();
-
-  // Opaque handle to an entry stored in the cache.
-  struct Handle { };
-
-  // Insert a mapping from key->value into the cache and assign it
-  // the specified charge against the total cache capacity.
-  //
-  // Returns a handle that corresponds to the mapping.  The caller
-  // must call this->Release(handle) when the returned mapping is no
-  // longer needed.
-  //
-  // When the inserted entry is no longer needed, the key and
-  // value will be passed to "deleter".
-  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
-                         void (*deleter)(const Slice& key, void* value)) = 0;
-
-  // If the cache has no mapping for "key", returns NULL.
-  //
-  // Else return a handle that corresponds to the mapping.  The caller
-  // must call this->Release(handle) when the returned mapping is no
-  // longer needed.
-  virtual Handle* Lookup(const Slice& key) = 0;
-
-  // Release a mapping returned by a previous Lookup().
-  // REQUIRES: handle must not have been released yet.
-  // REQUIRES: handle must have been returned by a method on *this.
-  virtual void Release(Handle* handle) = 0;
-
-  // Return the value encapsulated in a handle returned by a
-  // successful Lookup().
-  // REQUIRES: handle must not have been released yet.
-  // REQUIRES: handle must have been returned by a method on *this.
-  virtual void* Value(Handle* handle) = 0;
-
-  // If the cache contains an entry for "key", erase it.  Note that the
-  // underlying entry will be kept around until all existing handles
-  // to it have been released.
-  virtual void Erase(const Slice& key) = 0;
-
-  // Return a new numeric id.  May be used by multiple clients who are
-  // sharing the same cache to partition the key space.  Typically the
-  // client will allocate a new id at startup and prepend the id to
-  // its cache keys.
-  virtual uint64_t NewId() = 0;
-
- private:
-  void LRU_Remove(Handle* e);
-  void LRU_Append(Handle* e);
-  void Unref(Handle* e);
-
-  struct Rep;
-  Rep* rep_;
-
-  // No copying allowed
-  Cache(const Cache&);
-  void operator=(const Cache&);
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_CACHE_H_
diff --git a/include/leveldb/comparator.h b/include/leveldb/comparator.h
deleted file mode 100644
index 4e00e4d..0000000
--- a/include/leveldb/comparator.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
-#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
-
-#include <string>
-
-namespace leveldb {
-
-class Slice;
-
-// A Comparator object provides a total order across slices that are
-// used as keys in an sstable or a database.
-class Comparator {
- public:
-  virtual ~Comparator();
-
-  // Three-way comparison.  Returns value:
-  //   < 0 iff "a" < "b",
-  //   == 0 iff "a" == "b",
-  //   > 0 iff "a" > "b"
-  virtual int Compare(const Slice& a, const Slice& b) const = 0;
-
-  // The name of the comparator.  Used to check for comparator
-  // mismatches (i.e., a DB created with one comparator is
-  // accessed using a different comparator).
-  //
-  // The client of this package should switch to a new name whenever
-  // the comparator implementation changes in a way that will cause
-  // the relative ordering of any two keys to change.
-  //
-  // Names starting with "leveldb." are reserved and should not be used
-  // by any clients of this package.
-  virtual const char* Name() const = 0;
-
-  // Advanced functions: these are used to reduce the space requirements
-  // for internal data structures like index blocks.
-
-  // If *start < limit, changes *start to a short string in [start,limit).
-  // Simple comparator implementations may return with *start unchanged,
-  // i.e., an implementation of this method that does nothing is correct.
-  virtual void FindShortestSeparator(
-      std::string* start,
-      const Slice& limit) const = 0;
-
-  // Changes *key to a short string >= *key.
-  // Simple comparator implementations may return with *key unchanged,
-  // i.e., an implementation of this method that does nothing is correct.
-  virtual void FindShortSuccessor(std::string* key) const = 0;
-};
-
-// Return a builtin comparator that uses lexicographic byte-wise
-// ordering.  The result remains the property of this module and
-// must not be deleted.
-extern const Comparator* BytewiseComparator();
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
diff --git a/include/leveldb/db.h b/include/leveldb/db.h
deleted file mode 100644
index f18ded3..0000000
--- a/include/leveldb/db.h
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_
-#define STORAGE_LEVELDB_INCLUDE_DB_H_
-
-#include <stdint.h>
-#include <stdio.h>
-#include "leveldb/iterator.h"
-#include "leveldb/options.h"
-
-namespace leveldb {
-
-static const int kMajorVersion = 1;
-static const int kMinorVersion = 1;
-
-struct Options;
-struct ReadOptions;
-struct WriteOptions;
-
-class Snapshot;
-class WriteBatch;
-
-// Some internal types.  Clients should ignore.
-class WriteBatchInternal;
-
-struct Range {
-  Slice start;
-  Slice limit;
-
-  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
-};
-
-// A DB is a persistent ordered map from keys to values.
-class DB {
- public:
-  // Open the database with the specified "name".
-  // Stores a pointer to a heap-allocated database in *dbptr and returns
-  // OK on success.
-  // Stores NULL in *dbptr and returns a non-OK status on error.
-  // Caller should delete *dbptr when it is no longer needed.
-  static Status Open(const Options& options,
-                     const std::string& name,
-                     DB** dbptr);
-
-  DB() { }
-  virtual ~DB();
-
-  // Set the database entry for "key" to "value".  Returns OK on success,
-  // and a non-OK status on error.
-  // Note: consider setting options.sync = true.
-  virtual Status Put(const WriteOptions& options,
-                     const Slice& key,
-                     const Slice& value) = 0;
-
-  // Remove the database entry (if any) for "key".  Returns OK on
-  // success, and a non-OK status on error.  It is not an error if "key"
-  // did not exist in the database.
-  // Note: consider setting options.sync = true.
-  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
-
-  // Apply the specified updates to the database.
-  // Returns OK on success, non-OK on failure.
-  // Note: consider setting options.sync = true.
-  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
-
-  // If the database contains an entry for "key" store the
-  // corresponding value in *value and return OK.
-  //
-  // If there is no entry for "key" leave *value unchanged and return
-  // a status for which Status::IsNotFound() returns true.
-  //
-  // May return some other Status on an error.
-  virtual Status Get(const ReadOptions& options,
-                     const Slice& key, std::string* value) = 0;
-
-  // Return a heap-allocated iterator over the contents of the database.
-  // The result of NewIterator() is initially invalid (caller must
-  // call one of the Seek methods on the iterator before using it).
-  //
-  // Caller should delete the iterator when it is no longer needed.
-  // The returned iterator should be deleted before this db is deleted.
-  virtual Iterator* NewIterator(const ReadOptions& options) = 0;
-
-  // Return a handle to the current DB state.  Iterators created with
-  // this handle will all observe a stable snapshot of the current DB
-  // state.  The caller must call ReleaseSnapshot(result) when the
-  // snapshot is no longer needed.
-  virtual const Snapshot* GetSnapshot() = 0;
-
-  // Release a previously acquired snapshot.  The caller must not
-  // use "snapshot" after this call.
-  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
-
-  // DB implementations can export properties about their state
-  // via this method.  If "property" is a valid property understood by this
-  // DB implementation, fills "*value" with its current value and returns
-  // true.  Otherwise returns false.
-  //
-  // Valid property names include:
-  //
-  //  "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
-  //     where <N> is an ASCII representation of a level number (e.g. "0").
-  //  "leveldb.stats" - returns a multi-line string that describes statistics
-  //     about the internal operation of the DB.
-  virtual bool GetProperty(const Slice& property, std::string* value) = 0;
-
-  // For each i in [0,n-1], store in "sizes[i]" the approximate
-  // file system space used by keys in "[range[i].start .. range[i].limit)".
-  //
-  // Note that the returned sizes measure file system space usage, so
-  // if the user data compresses by a factor of ten, the returned
-  // sizes will be one-tenth the size of the corresponding user data size.
-  //
-  // The results may not include the sizes of recently written data.
-  virtual void GetApproximateSizes(const Range* range, int n,
-                                   uint64_t* sizes) = 0;
-
-  // Possible extensions:
-  // (1) Add a method to compact a range of keys
-
- private:
-  // No copying allowed
-  DB(const DB&);
-  void operator=(const DB&);
-};
-
-// Destroy the contents of the specified database.
-// Be very careful using this method.
-Status DestroyDB(const std::string& name, const Options& options);
-
-// If a DB cannot be opened, you may attempt to call this method to
-// resurrect as much of the contents of the database as possible.
-// Some data may be lost, so be careful when calling this function
-// on a database that contains important information.
-Status RepairDB(const std::string& dbname, const Options& options);
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_DB_H_
diff --git a/include/leveldb/env.h b/include/leveldb/env.h
deleted file mode 100644
index 4b6e712..0000000
--- a/include/leveldb/env.h
+++ /dev/null
@@ -1,290 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// An Env is an interface used by the leveldb implementation to access
-// operating system functionality like the filesystem etc.  Callers
-// may wish to provide a custom Env object when opening a database to
-// get fine grain control; e.g., to rate limit file system operations.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
-#define STORAGE_LEVELDB_INCLUDE_ENV_H_
-
-#include <cstdarg>
-#include <string>
-#include <vector>
-#include <stdint.h>
-#include "leveldb/status.h"
-
-namespace leveldb {
-
-class FileLock;
-class RandomAccessFile;
-class SequentialFile;
-class Slice;
-class WritableFile;
-
-class Env {
- public:
-  Env() { }
-  virtual ~Env();
-
-  // Return a default environment suitable for the current operating
-  // system.  Sophisticated users may wish to provide their own Env
-  // implementation instead of relying on this default environment.
-  //
-  // The result of Default() belongs to leveldb and must never be deleted.
-  static Env* Default();
-
-  // Create a brand new sequentially-readable file with the specified name.
-  // On success, stores a pointer to the new file in *result and returns OK.
-  // On failure stores NULL in *result and returns non-OK.  If the file does
-  // not exist, returns a non-OK status.
-  //
-  // The returned file will only be accessed by one thread at a time.
-  virtual Status NewSequentialFile(const std::string& fname,
-                                   SequentialFile** result) = 0;
-
-  // Create a brand new random access read-only file with the
-  // specified name.  On success, stores a pointer to the new file in
-  // *result and returns OK.  On failure stores NULL in *result and
-  // returns non-OK.  If the file does not exist, returns a non-OK
-  // status.
-  //
-  // The returned file may be concurrently accessed by multiple threads.
-  virtual Status NewRandomAccessFile(const std::string& fname,
-                                     RandomAccessFile** result) = 0;
-
-  // Create an object that writes to a new file with the specified
-  // name.  Deletes any existing file with the same name and creates a
-  // new file.  On success, stores a pointer to the new file in
-  // *result and returns OK.  On failure stores NULL in *result and
-  // returns non-OK.
-  //
-  // The returned file will only be accessed by one thread at a time.
-  virtual Status NewWritableFile(const std::string& fname,
-                                 WritableFile** result) = 0;
-
-  // Returns true iff the named file exists.
-  virtual bool FileExists(const std::string& fname) = 0;
-
-  // Store in *result the names of the children of the specified directory.
-  // The names are relative to "dir".
-  // Original contents of *result are dropped.
-  virtual Status GetChildren(const std::string& dir,
-                             std::vector<std::string>* result) = 0;
-
-  // Delete the named file.
-  virtual Status DeleteFile(const std::string& fname) = 0;
-
-  // Create the specified directory.
-  virtual Status CreateDir(const std::string& dirname) = 0;
-
-  // Delete the specified directory.
-  virtual Status DeleteDir(const std::string& dirname) = 0;
-
-  // Store the size of fname in *file_size.
-  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
-
-  // Rename file src to target.
-  virtual Status RenameFile(const std::string& src,
-                            const std::string& target) = 0;
-
-  // Lock the specified file.  Used to prevent concurrent access to
-  // the same db by multiple processes.  On failure, stores NULL in
-  // *lock and returns non-OK.
-  //
-  // On success, stores a pointer to the object that represents the
-  // acquired lock in *lock and returns OK.  The caller should call
-  // UnlockFile(*lock) to release the lock.  If the process exits,
-  // the lock will be automatically released.
-  //
-  // If somebody else already holds the lock, finishes immediately
-  // with a failure.  I.e., this call does not wait for existing locks
-  // to go away.
-  //
-  // May create the named file if it does not already exist.
-  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
-
-  // Release the lock acquired by a previous successful call to LockFile.
-  // REQUIRES: lock was returned by a successful LockFile() call
-  // REQUIRES: lock has not already been unlocked.
-  virtual Status UnlockFile(FileLock* lock) = 0;
-
-  // Arrange to run "(*function)(arg)" once in a background thread.
-  //
-  // "function" may run in an unspecified thread.  Multiple functions
-  // added to the same Env may run concurrently in different threads.
-  // I.e., the caller may not assume that background work items are
-  // serialized.
-  virtual void Schedule(
-      void (*function)(void* arg),
-      void* arg) = 0;
-
-  // Start a new thread, invoking "function(arg)" within the new thread.
-  // When "function(arg)" returns, the thread will be destroyed.
-  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
-
-  // *path is set to a temporary directory that can be used for testing.  It may
-  // or may not have just been created.  The directory may or may not differ
-  // between runs of the same process, but subsequent calls will return the
-  // same directory.
-  virtual Status GetTestDirectory(std::string* path) = 0;
-
-  // Write an entry to the log file with the specified format.
-  virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0;
-
-  // Returns the number of micro-seconds since some fixed point in time.  Only
-  // useful for computing deltas of time.
-  virtual uint64_t NowMicros() = 0;
-
-  // Sleep/delay the thread for the prescribed number of micro-seconds.
-  virtual void SleepForMicroseconds(int micros) = 0;
-
- private:
-  // No copying allowed
-  Env(const Env&);
-  void operator=(const Env&);
-};
-
-// A file abstraction for reading sequentially through a file
-class SequentialFile {
- public:
-  SequentialFile() { }
-  virtual ~SequentialFile();
-
-  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
-  // written by this routine.  Sets "*result" to the data that was
-  // read (including if fewer than "n" bytes were successfully read).
-  // If an error was encountered, returns a non-OK status.
-  //
-  // REQUIRES: External synchronization
-  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
-};
-
-// A file abstraction for randomly reading the contents of a file.
-class RandomAccessFile {
- public:
-  RandomAccessFile() { }
-  virtual ~RandomAccessFile();
-
-  // Read up to "n" bytes from the file starting at "offset".
-  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
-  // to the data that was read (including if fewer than "n" bytes were
-  // successfully read).  If an error was encountered, returns a
-  // non-OK status.
-  //
-  // Safe for concurrent use by multiple threads.
-  virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const = 0;
-};
-
-// A file abstraction for sequential writing.  The implementation
-// must provide buffering since callers may append small fragments
-// at a time to the file.
-class WritableFile {
- public:
-  WritableFile() { }
-  virtual ~WritableFile();
-
-  virtual Status Append(const Slice& data) = 0;
-  virtual Status Close() = 0;
-  virtual Status Flush() = 0;
-  virtual Status Sync() = 0;
-
- private:
-  // No copying allowed
-  WritableFile(const WritableFile&);
-  void operator=(const WritableFile&);
-};
-
-// Identifies a locked file.
-class FileLock {
- public:
-  FileLock() { }
-  virtual ~FileLock();
- private:
-  // No copying allowed
-  FileLock(const FileLock&);
-  void operator=(const FileLock&);
-};
-
-// Log the specified data to *info_log if info_log is non-NULL.
-extern void Log(Env* env, WritableFile* info_log, const char* format, ...)
-#   if defined(__GNUC__) || defined(__clang__)
-    __attribute__((__format__ (__printf__, 3, 4)))
-#   endif
-    ;
-
-// A utility routine: write "data" to the named file.
-extern Status WriteStringToFile(Env* env, const Slice& data,
-                                const std::string& fname);
-
-// A utility routine: read contents of named file into *data
-extern Status ReadFileToString(Env* env, const std::string& fname,
-                               std::string* data);
-
-// An implementation of Env that forwards all calls to another Env.
-// May be useful to clients who wish to override just part of the
-// functionality of another Env.
-class EnvWrapper : public Env {
- public:
-  // Initialize an EnvWrapper that delegates all calls to *target
-  explicit EnvWrapper(Env* target) : target_(target) { }
-  virtual ~EnvWrapper();
-
-  // Return the target to which this Env forwards all calls
-  Env* target() const { return target_; }
-
-  // The following text is boilerplate that forwards all methods to target()
-  Status NewSequentialFile(const std::string& f, SequentialFile** r) {
-    return target_->NewSequentialFile(f, r);
-  }
-  Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
-    return target_->NewRandomAccessFile(f, r);
-  }
-  Status NewWritableFile(const std::string& f, WritableFile** r) {
-    return target_->NewWritableFile(f, r);
-  }
-  bool FileExists(const std::string& f) { return target_->FileExists(f); }
-  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
-    return target_->GetChildren(dir, r);
-  }
-  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
-  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
-  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
-  Status GetFileSize(const std::string& f, uint64_t* s) {
-    return target_->GetFileSize(f, s);
-  }
-  Status RenameFile(const std::string& s, const std::string& t) {
-    return target_->RenameFile(s, t);
-  }
-  Status LockFile(const std::string& f, FileLock** l) {
-    return target_->LockFile(f, l);
-  }
-  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
-  void Schedule(void (*f)(void*), void* a) {
-    return target_->Schedule(f, a);
-  }
-  void StartThread(void (*f)(void*), void* a) {
-    return target_->StartThread(f, a);
-  }
-  virtual Status GetTestDirectory(std::string* path) {
-    return target_->GetTestDirectory(path);
-  }
-  virtual void Logv(WritableFile* log, const char* format, va_list ap) {
-    return target_->Logv(log, format, ap);
-  }
-  uint64_t NowMicros() {
-    return target_->NowMicros();
-  }
-  void SleepForMicroseconds(int micros) {
-    target_->SleepForMicroseconds(micros);
-  }
- private:
-  Env* target_;
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_ENV_H_
diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h
deleted file mode 100644
index 1866fb5..0000000
--- a/include/leveldb/iterator.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// An iterator yields a sequence of key/value pairs from a source.
-// The following class defines the interface.
-// Multiple implementations are provided by this library.  In particular,
-// iterators are provided to access the contents of a Table or a DB.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
-#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
-
-#include "leveldb/slice.h"
-#include "leveldb/status.h"
-
-namespace leveldb {
-
-class Iterator {
- public:
-  Iterator();
-  virtual ~Iterator();
-
-  // An iterator is either positioned at a key/value pair, or
-  // not valid.  This method returns true iff the iterator is valid.
-  virtual bool Valid() const = 0;
-
-  // Position at the first key in the source.  The iterator is Valid()
-  // after this call iff the source is not empty.
-  virtual void SeekToFirst() = 0;
-
-  // Position at the last key in the source.  The iterator is
-  // Valid() after this call iff the source is not empty.
-  virtual void SeekToLast() = 0;
-
-  // Position at the first key in the source that is at or past target.
-  // The iterator is Valid() after this call iff the source contains
-  // an entry that comes at or past target.
-  virtual void Seek(const Slice& target) = 0;
-
-  // Moves to the next entry in the source.  After this call, Valid() is
-  // true iff the iterator was not positioned at the last entry in the source.
-  // REQUIRES: Valid()
-  virtual void Next() = 0;
-
-  // Moves to the previous entry in the source.  After this call, Valid() is
-  // true iff the iterator was not positioned at the first entry in source.
-  // REQUIRES: Valid()
-  virtual void Prev() = 0;
-
-  // Return the key for the current entry.  The underlying storage for
-  // the returned slice is valid only until the next modification of
-  // the iterator.
-  // REQUIRES: Valid()
-  virtual Slice key() const = 0;
-
-  // Return the value for the current entry.  The underlying storage for
-  // the returned slice is valid only until the next modification of
-  // the iterator.
-  // REQUIRES: Valid()
-  virtual Slice value() const = 0;
-
-  // If an error has occurred, return it.  Else return an ok status.
-  virtual Status status() const = 0;
-
-  // Clients are allowed to register function/arg1/arg2 triples that
-  // will be invoked when this iterator is destroyed.
-  //
-  // Note that unlike all of the preceding methods, this method is
-  // not abstract and therefore clients should not override it.
-  typedef void (*CleanupFunction)(void* arg1, void* arg2);
-  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
-
- private:
-  struct Cleanup {
-    CleanupFunction function;
-    void* arg1;
-    void* arg2;
-    Cleanup* next;
-  };
-  Cleanup cleanup_;
-
-  // No copying allowed
-  Iterator(const Iterator&);
-  void operator=(const Iterator&);
-};
-
-// Return an empty iterator (yields nothing).
-extern Iterator* NewEmptyIterator();
-
-// Return an empty iterator with the specified status.
-extern Iterator* NewErrorIterator(const Status& status);
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
diff --git a/include/leveldb/options.h b/include/leveldb/options.h
deleted file mode 100644
index 87d388e..0000000
--- a/include/leveldb/options.h
+++ /dev/null
@@ -1,208 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
-#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
-
-#include <stddef.h>
-
-namespace leveldb {
-
-class Cache;
-class Comparator;
-class Env;
-class Snapshot;
-class WritableFile;
-
-// DB contents are stored in a set of blocks, each of which holds a
-// sequence of key,value pairs.  Each block may be compressed before
-// being stored in a file.  The following enum describes which
-// compression method (if any) is used to compress a block.
-enum CompressionType {
-  // NOTE: do not change the values of existing entries, as these are
-  // part of the persistent format on disk.
-  kNoCompression     = 0x0,
-  kSnappyCompression = 0x1,
-};
-
-// Options to control the behavior of a database (passed to DB::Open)
-struct Options {
-  // -------------------
-  // Parameters that affect behavior
-
-  // Comparator used to define the order of keys in the table.
-  // Default: a comparator that uses lexicographic byte-wise ordering
-  //
-  // REQUIRES: The client must ensure that the comparator supplied
-  // here has the same name and orders keys *exactly* the same as the
-  // comparator provided to previous open calls on the same DB.
-  const Comparator* comparator;
-
-  // If true, the database will be created if it is missing.
-  // Default: false
-  bool create_if_missing;
-
-  // If true, an error is raised if the database already exists.
-  // Default: false
-  bool error_if_exists;
-
-  // If true, the implementation will do aggressive checking of the
-  // data it is processing and will stop early if it detects any
-  // errors.  This may have unforeseen ramifications: for example, a
-  // corruption of one DB entry may cause a large number of entries to
-  // become unreadable or for the entire DB to become unopenable.
-  // Default: false
-  bool paranoid_checks;
-
-  // Use the specified object to interact with the environment,
-  // e.g. to read/write files, schedule background work, etc.
-  // Default: Env::Default()
-  Env* env;
-
-  // Any internal progress/error information generated by the db will
-  // be written to info_log if it is non-NULL, or to a file stored
-  // in the same directory as the DB contents if info_log is NULL.
-  // Default: NULL
-  WritableFile* info_log;
-
-  // -------------------
-  // Parameters that affect performance
-
-  // Amount of data to build up in memory (backed by an unsorted log
-  // on disk) before converting to a sorted on-disk file.
-  //
-  // Larger values increase performance, especially during bulk loads.
-  // Up to two write buffers may be held in memory at the same time,
-  // so you may wish to adjust this parameter to control memory usage.
-  //
-  // Default: 4MB
-  size_t write_buffer_size;
-
-  // Number of open files that can be used by the DB.  You may need to
-  // increase this if your database has a large working set (budget
-  // one open file per 2MB of working set).
-  //
-  // Default: 1000
-  int max_open_files;
-
-  // Handle values larger than "large_value_threshold" bytes
-  // specially, by writing them into their own files (to avoid
-  // compaction overhead) and doing content-based elimination of
-  // duplicate values to save space.
-  //
-  // We recommend against changing this value.
-  //
-  // Default: 64K
-  size_t large_value_threshold;
-
-  // Control over blocks (user data is stored in a set of blocks, and
-  // a block is the unit of reading from disk).
-
-  // If non-NULL, use the specified cache for blocks.
-  // If NULL, leveldb will automatically create and use an 8MB internal cache.
-  // Default: NULL
-  Cache* block_cache;
-
-  // Approximate size of user data packed per block.  Note that the
-  // block size specified here corresponds to uncompressed data.  The
-  // actual size of the unit read from disk may be smaller if
-  // compression is enabled.  This parameter can be changed dynamically.
-  //
-  // Default: 4K
-  int block_size;
-
-  // Number of keys between restart points for delta encoding of keys.
-  // This parameter can be changed dynamically.  Most clients should
-  // leave this parameter alone.
-  //
-  // Default: 16
-  int block_restart_interval;
-
-  // Compress blocks using the specified compression algorithm.  This
-  // parameter can be changed dynamically.
-  //
-  // Default: kSnappyCompression, which gives lightweight but fast
-  // compression.
-  //
-  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
-  //    ~200-500MB/s compression
-  //    ~400-800MB/s decompression
-  // Note that these speeds are significantly faster than most
-  // persistent storage speeds, and therefore it is typically never
-  // worth switching to kNoCompression.  Even if the input data is
-  // incompressible, the kSnappyCompression implementation will
-  // efficiently detect that and will switch to uncompressed mode.
-  CompressionType compression;
-
-  // Create an Options object with default values for all fields.
-  Options();
-};
-
-// Options that control read operations
-struct ReadOptions {
-  // If true, all data read from underlying storage will be
-  // verified against corresponding checksums.
-  // Default: false
-  bool verify_checksums;
-
-  // Should the data read for this iteration be cached in memory?
-  // Callers may wish to set this field to false for bulk scans.
-  // Default: true
-  bool fill_cache;
-
-  // If "snapshot" is non-NULL, read as of the supplied snapshot
-  // (which must belong to the DB that is being read and which must
-  // not have been released).  If "snapshot" is NULL, use an implicit
-  // snapshot of the state at the beginning of this read operation.
-  // Default: NULL
-  const Snapshot* snapshot;
-
-  ReadOptions()
-      : verify_checksums(false),
-        fill_cache(true),
-        snapshot(NULL) {
-  }
-};
-
-// Options that control write operations
-struct WriteOptions {
-  // If true, the write will be flushed from the operating system
-  // buffer cache (by calling WritableFile::Sync()) before the write
-  // is considered complete.  If this flag is true, writes will be
-  // slower.
-  //
-  // If this flag is false, and the machine crashes, some recent
-  // writes may be lost.  Note that if it is just the process that
-  // crashes (i.e., the machine does not reboot), no writes will be
-  // lost even if sync==false.
-  //
-  // In other words, a DB write with sync==false has similar
-  // crash semantics as the "write()" system call.  A DB write
-  // with sync==true has similar crash semantics to a "write()"
-  // system call followed by "fsync()".
-  //
-  // Default: false
-  bool sync;
-
-  // If "post_write_snapshot" is non-NULL, and the write succeeds,
-  // *post_write_snapshot will be modified to point to a snapshot of
-  // the DB state immediately after this write.  The caller must call
-  // DB::ReleaseSnapshot(*post_write_snapshot) when the
-  // snapshot is no longer needed.
-  //
-  // If "post_write_snapshot" is non-NULL, and the write fails,
-  // *post_write_snapshot will be set to NULL.
-  //
-  // Default: NULL
-  const Snapshot** post_write_snapshot;
-
-  WriteOptions()
-      : sync(false),
-        post_write_snapshot(NULL) {
-  }
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h
deleted file mode 100644
index 62cb894..0000000
--- a/include/leveldb/slice.h
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// Slice is a simple structure containing a pointer into some external
-// storage and a size.  The user of a Slice must ensure that the slice
-// is not used after the corresponding external storage has been
-// deallocated.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
-#define STORAGE_LEVELDB_INCLUDE_SLICE_H_
-
-#include <assert.h>
-#include <stddef.h>
-#include <string.h>
-#include <string>
-
-namespace leveldb {
-
-class Slice {
- public:
-  // Create an empty slice.
-  Slice() : data_(""), size_(0) { }
-
-  // Create a slice that refers to data[0,n-1].
-  Slice(const char* data, size_t n) : data_(data), size_(n) { }
-
-  // Create a slice that refers to the contents of "s"
-  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
-
-  // Create a slice that refers to s[0,strlen(s)-1]
-  Slice(const char* s) : data_(s), size_(strlen(s)) { }
-
-  // Return a pointer to the beginning of the referenced data
-  const char* data() const { return data_; }
-
-  // Return the length (in bytes) of the referenced data
-  size_t size() const { return size_; }
-
-  // Return true iff the length of the referenced data is zero
-  bool empty() const { return size_ == 0; }
-
-  // Return the ith byte in the referenced data.
-  // REQUIRES: n < size()
-  char operator[](size_t n) const {
-    assert(n < size());
-    return data_[n];
-  }
-
-  // Change this slice to refer to an empty array
-  void clear() { data_ = ""; size_ = 0; }
-
-  // Drop the first "n" bytes from this slice.
-  void remove_prefix(size_t n) {
-    assert(n <= size());
-    data_ += n;
-    size_ -= n;
-  }
-
-  // Return a string that contains the copy of the referenced data.
-  std::string ToString() const { return std::string(data_, size_); }
-
-  // Three-way comparison.  Returns value:
-  //   < 0 iff "*this" < "b",
-  //   == 0 iff "*this" == "b",
-  //   > 0 iff "*this" > "b"
-  int compare(const Slice& b) const;
-
-  // Return true iff "x" is a prefix of "*this"
-  bool starts_with(const Slice& x) const {
-    return ((size_ >= x.size_) &&
-            (memcmp(data_, x.data_, x.size_) == 0));
-  }
-
- private:
-  const char* data_;
-  size_t size_;
-
-  // Intentionally copyable
-};
-
-inline bool operator==(const Slice& x, const Slice& y) {
-  return ((x.size() == y.size()) &&
-          (memcmp(x.data(), y.data(), x.size()) == 0));
-}
-
-inline bool operator!=(const Slice& x, const Slice& y) {
-  return !(x == y);
-}
-
-inline int Slice::compare(const Slice& b) const {
-  const int min_len = (size_ < b.size_) ? size_ : b.size_;
-  int r = memcmp(data_, b.data_, min_len);
-  if (r == 0) {
-    if (size_ < b.size_) r = -1;
-    else if (size_ > b.size_) r = +1;
-  }
-  return r;
-}
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_SLICE_H_
diff --git a/include/leveldb/status.h b/include/leveldb/status.h
deleted file mode 100644
index 47e3edf..0000000
--- a/include/leveldb/status.h
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// A Status encapsulates the result of an operation.  It may indicate success,
-// or it may indicate an error with an associated error message.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
-#define STORAGE_LEVELDB_INCLUDE_STATUS_H_
-
-#include <string>
-#include <utility>
-#include "leveldb/slice.h"
-
-namespace leveldb {
-
-class Status {
- public:
-  // Create a success status.
-  Status() : state_(NULL) { }
-  ~Status() { delete state_; }
-
-  // Copy the specified status.
-  Status(const Status& s);
-  void operator=(const Status& s);
-
-  // Return a success status.
-  static Status OK() { return Status(); }
-
-  // Return error status of an appropriate type.
-  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kNotFound, msg, msg2);
-  }
-  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kCorruption, msg, msg2);
-  }
-  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kNotSupported, msg, msg2);
-  }
-  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kInvalidArgument, msg, msg2);
-  }
-  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kIOError, msg, msg2);
-  }
-
-  // Returns true iff the status indicates success.
-  bool ok() const { return (state_ == NULL); }
-
-  // Returns true iff the status indicates a NotFound error.
-  bool IsNotFound() const { return code() == kNotFound; }
-
-  // Return a string representation of this status suitable for printing.
-  // Returns the string "OK" for success.
-  std::string ToString() const;
-
- private:
-  enum Code {
-    kOk = 0,
-    kNotFound = 1,
-    kCorruption = 2,
-    kNotSupported = 3,
-    kInvalidArgument = 4,
-    kIOError = 5,
-  };
-  Code code() const { return (state_ == NULL) ? kOk : state_->first; }
-
-  Status(Code code, const Slice& msg, const Slice& msg2);
-
-  typedef std::pair<Code, std::string> State;
-  State* state_;
-};
-
-inline Status::Status(const Status& s) {
-  state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
-}
-inline void Status::operator=(const Status& s) {
-  if (this != &s) {
-    delete state_;
-    state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
-  }
-}
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_STATUS_H_
diff --git a/include/leveldb/table.h b/include/leveldb/table.h
deleted file mode 100644
index bd99176..0000000
--- a/include/leveldb/table.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_
-#define STORAGE_LEVELDB_INCLUDE_TABLE_H_
-
-#include <stdint.h>
-#include "leveldb/iterator.h"
-
-namespace leveldb {
-
-class Block;
-class BlockHandle;
-struct Options;
-class RandomAccessFile;
-struct ReadOptions;
-
-// A Table is a sorted map from strings to strings.  Tables are
-// immutable and persistent.
-class Table {
- public:
-  // Attempt to open the table that is stored in bytes [0..file_size)
-  // of "file", and read the metadata entries necessary to allow
-  // retrieving data from the table.
-  //
-  // If successful, returns ok and sets "*table" to the newly opened
-  // table.  The client should delete "*table" when no longer needed.
-  // If there was an error while initializing the table, sets "*table"
-  // to NULL and returns a non-ok status.  Does not take ownership of
-  // "*file", but the client must ensure that "file" remains live
-  // for the duration of the returned table's lifetime.
-  //
-  // *file must remain live while this Table is in use.
-  static Status Open(const Options& options,
-                     RandomAccessFile* file,
-                     uint64_t file_size,
-                     Table** table);
-
-  ~Table();
-
-  // Returns a new iterator over the table contents.
-  // The result of NewIterator() is initially invalid (caller must
-  // call one of the Seek methods on the iterator before using it).
-  Iterator* NewIterator(const ReadOptions&) const;
-
-  // Given a key, return an approximate byte offset in the file where
-  // the data for that key begins (or would begin if the key were
-  // present in the file).  The returned value is in terms of file
-  // bytes, and so includes effects like compression of the underlying data.
-  // E.g., the approximate offset of the last key in the table will
-  // be close to the file length.
-  uint64_t ApproximateOffsetOf(const Slice& key) const;
-
- private:
-  struct Rep;
-  Rep* rep_;
-
-  explicit Table(Rep* rep) { rep_ = rep; }
-  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);
-
-  // No copying allowed
-  Table(const Table&);
-  void operator=(const Table&);
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_TABLE_H_
diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h
deleted file mode 100644
index 49d2d51..0000000
--- a/include/leveldb/table_builder.h
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// TableBuilder provides the interface used to build a Table
-// (an immutable and sorted map from keys to values).
-
-#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
-#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
-
-#include <stdint.h>
-#include "leveldb/options.h"
-#include "leveldb/status.h"
-
-namespace leveldb {
-
-class BlockBuilder;
-class BlockHandle;
-class WritableFile;
-
-class TableBuilder {
- public:
-  // Create a builder that will store the contents of the table it is
-  // building in *file.  Does not close the file.  It is up to the
-  // caller to close the file after calling Finish().
-  TableBuilder(const Options& options, WritableFile* file);
-
-  // REQUIRES: Either Finish() or Abandon() has been called.
-  ~TableBuilder();
-
-  // Change the options used by this builder.  Note: only some of the
-  // option fields can be changed after construction.  If a field is
-  // not allowed to change dynamically and its value in the structure
-  // passed to the constructor is different from its value in the
-  // structure passed to this method, this method will return an error
-  // without changing any fields.
-  Status ChangeOptions(const Options& options);
-
-  // Add key,value to the table being constructed.
-  // REQUIRES: key is after any previously added key according to comparator.
-  // REQUIRES: Finish(), Abandon() have not been called
-  void Add(const Slice& key, const Slice& value);
-
-  // Advanced operation: flush any buffered key/value pairs to file.
-  // Can be used to ensure that two adjacent entries never live in
-  // the same data block.  Most clients should not need to use this method.
- // REQUIRES: Finish(), Abandon() have not been called - void Flush(); - - // Return non-ok iff some error has been detected. - Status status() const; - - // Finish building the table. Stops using the file passed to the - // constructor after this function returns. - // REQUIRES: Finish(), Abandon() have not been called - Status Finish(); - - // Indicate that the contents of this builder should be abandoned. Stops - // using the file passed to the constructor after this function returns. - // If the caller is not going to call Finish(), it must call Abandon() - // before destroying this builder. - // REQUIRES: Finish(), Abandon() have not been called - void Abandon(); - - // Number of calls to Add() so far. - uint64_t NumEntries() const; - - // Size of the file generated so far. If invoked after a successful - // Finish() call, returns the size of the final generated file. - uint64_t FileSize() const; - - private: - bool ok() const { return status().ok(); } - void WriteBlock(BlockBuilder* block, BlockHandle* handle); - - struct Rep; - Rep* rep_; - - // No copying allowed - TableBuilder(const TableBuilder&); - void operator=(const TableBuilder&); -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ diff --git a/include/leveldb/write_batch.h b/include/leveldb/write_batch.h deleted file mode 100644 index 3411952..0000000 --- a/include/leveldb/write_batch.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// WriteBatch holds a collection of updates to apply atomically to a DB. -// -// The updates are applied in the order in which they are added -// to the WriteBatch. For example, the value of "key" will be "v3" -// after the following batch is written: -// -// batch.Put("key", "v1"); -// batch.Delete("key"); -// batch.Put("key", "v2"); -// batch.Put("key", "v3"); - -#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ -#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ - -#include <string> - -namespace leveldb { - -class Slice; - -class WriteBatch { - public: - WriteBatch(); - ~WriteBatch(); - - // Store the mapping "key->value" in the database. - void Put(const Slice& key, const Slice& value); - - // If the database contains a mapping for "key", erase it. Else do nothing. - void Delete(const Slice& key); - - // Clear all updates buffered in this batch. - void Clear(); - - private: - friend class WriteBatchInternal; - - std::string rep_; // See comment in write_batch.cc for the format of rep_ - - // Intentionally copyable -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ diff --git a/leveldb.gyp b/leveldb.gyp deleted file mode 100644 index d10ac33..0000000 --- a/leveldb.gyp +++ /dev/null @@ -1,327 +0,0 @@ -# Copyright (c) 2011 The LevelDB Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. See the AUTHORS file for names of contributors.
- -{ - 'variables': { - 'use_snappy%': 0, - }, - 'target_defaults': { - 'defines': [ - 'LEVELDB_PLATFORM_CHROMIUM=1', - ], - 'include_dirs': [ - '.', - 'include/', - ], - 'conditions': [ - ['OS == "win"', { - 'include_dirs': [ - 'port/win', - ], - }], - ['use_snappy', { - 'defines': [ - 'USE_SNAPPY=1', - ], - }], - ], - }, - 'targets': [ - { - 'target_name': 'leveldb', - 'type': '<(library)', - 'dependencies': [ - # The base library is a lightweight abstraction layer for things like - # threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/ - '../../base/base.gyp:base', - ], - 'conditions': [ - ['use_snappy', { - 'dependencies': [ - '../../third_party/snappy/snappy.gyp:snappy', - ], - }], - ], - 'direct_dependent_settings': { - 'include_dirs': [ - 'include/', - ], - }, - 'sources': [ - # Include and then exclude so that all files show up in IDEs, even if - # they don't build. - 'db/builder.cc', - 'db/builder.h', - 'db/db_impl.cc', - 'db/db_impl.h', - 'db/db_iter.cc', - 'db/db_iter.h', - 'db/filename.cc', - 'db/filename.h', - 'db/dbformat.cc', - 'db/dbformat.h', - 'db/log_format.h', - 'db/log_reader.cc', - 'db/log_reader.h', - 'db/log_writer.cc', - 'db/log_writer.h', - 'db/memtable.cc', - 'db/memtable.h', - 'db/repair.cc', - 'db/skiplist.h', - 'db/snapshot.h', - 'db/table_cache.cc', - 'db/table_cache.h', - 'db/version_edit.cc', - 'db/version_edit.h', - 'db/version_set.cc', - 'db/version_set.h', - 'db/write_batch.cc', - 'db/write_batch_internal.h', - 'include/leveldb/cache.h', - 'include/leveldb/comparator.h', - 'include/leveldb/db.h', - 'include/leveldb/env.h', - 'include/leveldb/iterator.h', - 'include/leveldb/options.h', - 'include/leveldb/slice.h', - 'include/leveldb/status.h', - 'include/leveldb/table.h', - 'include/leveldb/table_builder.h', - 'include/leveldb/write_batch.h', - 'port/port.h', - 'port/port_chromium.cc', - 'port/port_chromium.h', - 'port/port_example.h', - 'port/port_posix.cc', - 'port/port_posix.h', - 'port/sha1_portable.cc', - 'port/sha1_portable.h', - 'table/block.cc', - 'table/block.h', - 'table/block_builder.cc', - 'table/block_builder.h', - 'table/format.cc', - 'table/format.h', - 'table/iterator.cc', - 'table/iterator_wrapper.h', - 'table/merger.cc', - 'table/merger.h', - 'table/table.cc', - 'table/table_builder.cc', - 'table/two_level_iterator.cc', - 'table/two_level_iterator.h', - 'util/arena.cc', - 'util/arena.h', - 'util/cache.cc', - 'util/coding.cc', - 'util/coding.h', - 'util/comparator.cc', - 'util/crc32c.cc', - 'util/crc32c.h', - 'util/env.cc', - 'util/env_chromium.cc', - 'util/env_posix.cc', - 'util/hash.cc', - 'util/hash.h', - 'util/logging.cc', - 'util/logging.h', - 'util/mutexlock.h', - 'util/options.cc', - 'util/random.h', - 'util/status.cc', - ], - 'sources/': [ - ['exclude', '_(android|example|portable|posix)\\.cc$'], - ], - }, - { - 'target_name': 'leveldb_testutil', - 'type': '<(library)', - 'dependencies': [ - '../../base/base.gyp:base', - 'leveldb', - ], - 'export_dependent_settings': [ - # The tests use include directories from these projects.
- '../../base/base.gyp:base', - 'leveldb', - ], - 'sources': [ - 'util/histogram.cc', - 'util/histogram.h', - 'util/testharness.cc', - 'util/testharness.h', - 'util/testutil.cc', - 'util/testutil.h', - ], - }, - { - 'target_name': 'leveldb_arena_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/arena_test.cc', - ], - }, - { - 'target_name': 'leveldb_cache_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/cache_test.cc', - ], - }, - { - 'target_name': 'leveldb_coding_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/coding_test.cc', - ], - }, - { - 'target_name': 'leveldb_corruption_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/corruption_test.cc', - ], - }, - { - 'target_name': 'leveldb_crc32c_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/crc32c_test.cc', - ], - }, - { - 'target_name': 'leveldb_db_bench', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/db_bench.cc', - ], - }, - { - 'target_name': 'leveldb_db_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/db_test.cc', - ], - }, - { - 'target_name': 'leveldb_dbformat_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/dbformat_test.cc', - ], - }, - { - 'target_name': 'leveldb_env_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/env_test.cc', - ], - }, - { - 'target_name': 'leveldb_filename_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/filename_test.cc', - ], - }, - { - 'target_name': 'leveldb_log_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/log_test.cc', - ], - }, - { - 'target_name': 'leveldb_sha1_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'port/sha1_test.cc', - ], - }, - { - 'target_name': 'leveldb_skiplist_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/skiplist_test.cc', - ], - }, - { - 'target_name': 'leveldb_table_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'table/table_test.cc', - ], - }, - { - 'target_name': 'leveldb_version_edit_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/version_edit_test.cc', - ], - }, - { - 'target_name': 'leveldb_write_batch_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/write_batch_test.cc', - ], - }, - ], -} - -# Local Variables: -# tab-width:2 -# indent-tabs-mode:nil -# End: -# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/leveldb/AUTHORS b/leveldb/AUTHORS new file mode 100644 index 0000000..27a9407 --- /dev/null +++ b/leveldb/AUTHORS @@ -0,0 +1,8 @@ +# Names should be added to this file like so: +# Name or Organization <email address> + +Google Inc. + +# Initial version authors: +Jeffrey Dean <jeff@google.com> +Sanjay Ghemawat <sanjay@google.com> diff --git a/leveldb/LICENSE b/leveldb/LICENSE new file mode 100644 index 0000000..8e80208 --- /dev/null +++ b/leveldb/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/leveldb/Makefile b/leveldb/Makefile new file mode 100644 index 0000000..43ac23d --- /dev/null +++ b/leveldb/Makefile @@ -0,0 +1,129 @@ +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. + +CC = g++ + +# Uncomment one of the following to switch between debug and opt mode +#OPT = -O2 -DNDEBUG +OPT = -g2 + +CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. 
-I./include -std=c++0x $(OPT) + +LDFLAGS=-lpthread + +LIBOBJECTS = \ + ./db/builder.o \ + ./db/db_impl.o \ + ./db/db_iter.o \ + ./db/filename.o \ + ./db/dbformat.o \ + ./db/log_reader.o \ + ./db/log_writer.o \ + ./db/memtable.o \ + ./db/repair.o \ + ./db/table_cache.o \ + ./db/version_edit.o \ + ./db/version_set.o \ + ./db/write_batch.o \ + ./port/port_posix.o \ + ./table/block.o \ + ./table/block_builder.o \ + ./table/format.o \ + ./table/iterator.o \ + ./table/merger.o \ + ./table/table.o \ + ./table/table_builder.o \ + ./table/two_level_iterator.o \ + ./util/arena.o \ + ./util/cache.o \ + ./util/coding.o \ + ./util/comparator.o \ + ./util/crc32c.o \ + ./util/env.o \ + ./util/env_posix.o \ + ./util/hash.o \ + ./util/histogram.o \ + ./util/logging.o \ + ./util/options.o \ + ./util/status.o + +TESTUTIL = ./util/testutil.o +TESTHARNESS = ./util/testharness.o $(TESTUTIL) + +TESTS = \ + arena_test \ + cache_test \ + coding_test \ + corruption_test \ + crc32c_test \ + db_test \ + dbformat_test \ + env_test \ + filename_test \ + log_test \ + skiplist_test \ + table_test \ + version_edit_test \ + write_batch_test + +PROGRAMS = db_bench $(TESTS) + +all: $(PROGRAMS) + +check: $(TESTS) + for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done + +clean: + rm -f $(PROGRAMS) */*.o + +db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + +arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +.cc.o: + $(CC) $(CFLAGS) $< -o $@ + +# TODO(gabor): dependencies for .o files +# TODO(gabor): Build library diff --git a/leveldb/README b/leveldb/README new file mode 100644 index 0000000..3618ade --- /dev/null +++ 
b/leveldb/README @@ -0,0 +1,51 @@ +leveldb: A key-value store +Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) + +The code under this directory implements a system for maintaining a +persistent key/value store. + +See doc/index.html for more explanation. +See doc/impl.html for a brief overview of the implementation. + +The public interface is in include/*.h. Callers should not include or +rely on the details of any other header files in this package. Those +internal APIs may be changed without warning. + +Guide to header files: + +include/db.h + Main interface to the DB: Start here + +include/options.h + Control over the behavior of an entire database, and also + control over the behavior of individual reads and writes. + +include/comparator.h + Abstraction for user-specified comparison function. If you want + just bytewise comparison of keys, you can use the default comparator, + but clients can write their own comparator implementations if they + want custom ordering (e.g. to handle different character + encodings, etc.) + +include/iterator.h + Interface for iterating over data. You can get an iterator + from a DB object. + +include/write_batch.h + Interface for atomically applying multiple updates to a database. + +include/slice.h + A simple module for maintaining a pointer and a length into some + other byte array. + +include/status.h + Status is returned from many of the public interfaces and is used + to report success and various kinds of errors. + +include/env.h + Abstraction of the OS environment. A posix implementation of + this interface is in util/env_posix.cc + +include/table.h +include/table_builder.h + Lower-level modules that most clients probably won't use directly diff --git a/leveldb/TODO b/leveldb/TODO new file mode 100644 index 0000000..ce81439 --- /dev/null +++ b/leveldb/TODO @@ -0,0 +1,14 @@ +ss +- Stats + +db +- Maybe implement DB::BulkDeleteForRange(start_key, end_key) + that would blow away files whose ranges are entirely contained + within [start_key..end_key]? For Chrome, deletion of obsolete + object stores, etc. can be done in the background anyway, so + probably not that important. + +api changes: +- Make it wrappable + +Faster Get implementation diff --git a/leveldb/db/builder.cc b/leveldb/db/builder.cc new file mode 100644 index 0000000..9f132d7 --- /dev/null +++ b/leveldb/db/builder.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
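The README's "Start here" pointer (include/db.h) boils down to the following usage pattern, shown as a minimal sketch before the implementation files; the path /tmp/exampledb is arbitrary:

#include <string>
#include "leveldb/db.h"

int main() {
  leveldb::Options options;
  options.create_if_missing = true;   // see include/options.h
  leveldb::DB* db;
  leveldb::Status s = leveldb::DB::Open(options, "/tmp/exampledb", &db);
  if (!s.ok()) return 1;
  s = db->Put(leveldb::WriteOptions(), "key", "value");
  std::string value;
  if (s.ok()) s = db->Get(leveldb::ReadOptions(), "key", &value);
  delete db;                          // closes the database
  return s.ok() ? 0 : 1;
}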
+ +#include "db/builder.h" + +#include "db/filename.h" +#include "db/dbformat.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" + +namespace leveldb { + +Status BuildTable(const std::string& dbname, + Env* env, + const Options& options, + TableCache* table_cache, + Iterator* iter, + FileMetaData* meta, + VersionEdit* edit) { + Status s; + meta->file_size = 0; + iter->SeekToFirst(); + + std::string fname = TableFileName(dbname, meta->number); + if (iter->Valid()) { + WritableFile* file; + s = env->NewWritableFile(fname, &file); + if (!s.ok()) { + return s; + } + + TableBuilder* builder = new TableBuilder(options, file); + meta->smallest.DecodeFrom(iter->key()); + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + meta->largest.DecodeFrom(key); + builder->Add(key, iter->value()); + } + + // Finish and check for builder errors + if (s.ok()) { + s = builder->Finish(); + if (s.ok()) { + meta->file_size = builder->FileSize(); + assert(meta->file_size > 0); + } + } else { + builder->Abandon(); + } + delete builder; + + // Finish and check for file errors + if (s.ok()) { + s = file->Sync(); + } + if (s.ok()) { + s = file->Close(); + } + delete file; + file = NULL; + + if (s.ok()) { + // Verify that the table is usable + Iterator* it = table_cache->NewIterator(ReadOptions(), + meta->number, + meta->file_size); + s = it->status(); + delete it; + } + } + + // Check for input iterator errors + if (!iter->status().ok()) { + s = iter->status(); + } + + if (s.ok() && meta->file_size > 0) { + edit->AddFile(0, meta->number, meta->file_size, + meta->smallest, meta->largest); + } else { + env->DeleteFile(fname); + } + return s; +} + +} diff --git a/leveldb/db/builder.h b/leveldb/db/builder.h new file mode 100644 index 0000000..5dd17b6 --- /dev/null +++ b/leveldb/db/builder.h @@ -0,0 +1,36 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_BUILDER_H_ +#define STORAGE_LEVELDB_DB_BUILDER_H_ + +#include "leveldb/status.h" + +namespace leveldb { + +struct Options; +struct FileMetaData; + +class Env; +class Iterator; +class TableCache; +class VersionEdit; + +// Build a Table file from the contents of *iter. The generated file +// will be named according to meta->number. On success, the rest of +// *meta will be filled with metadata about the generated table, and +// the file information will be added to *edit. If no data is present +// in *iter, meta->file_size will be set to zero, and no Table file +// will be produced. +extern Status BuildTable(const std::string& dbname, + Env* env, + const Options& options, + TableCache* table_cache, + Iterator* iter, + FileMetaData* meta, + VersionEdit* edit); + +} + +#endif // STORAGE_LEVELDB_DB_BUILDER_H_ diff --git a/leveldb/db/corruption_test.cc b/leveldb/db/corruption_test.cc new file mode 100644 index 0000000..12d176e --- /dev/null +++ b/leveldb/db/corruption_test.cc @@ -0,0 +1,354 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "leveldb/db.h" + +#include +#include +#include +#include +#include "leveldb/cache.h" +#include "leveldb/env.h" +#include "leveldb/table.h" +#include "leveldb/write_batch.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +static const int kValueSize = 1000; + +class CorruptionTest { + public: + test::ErrorEnv env_; + Random rnd_; + std::string dbname_; + Cache* tiny_cache_; + Options options_; + DB* db_; + + CorruptionTest() : rnd_(test::RandomSeed()) { + tiny_cache_ = NewLRUCache(100); + options_.env = &env_; + dbname_ = test::TmpDir() + "/db_test"; + DestroyDB(dbname_, options_); + + db_ = NULL; + options_.create_if_missing = true; + Reopen(); + options_.create_if_missing = false; + } + + ~CorruptionTest() { + delete db_; + DestroyDB(dbname_, Options()); + delete tiny_cache_; + } + + Status TryReopen(Options* options = NULL) { + delete db_; + db_ = NULL; + Options opt = (options ? *options : options_); + opt.env = &env_; + opt.block_cache = tiny_cache_; + return DB::Open(opt, dbname_, &db_); + } + + void Reopen(Options* options = NULL) { + ASSERT_OK(TryReopen(options)); + } + + void RepairDB() { + delete db_; + db_ = NULL; + ASSERT_OK(::leveldb::RepairDB(dbname_, options_)); + } + + void Build(int n) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = 0; i < n; i++) { + //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void Check(int min_expected, int max_expected) { + int next_expected = 0; + int missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + uint64_t key; + Slice in(iter->key()); + if (!ConsumeDecimalNumber(&in, &key) || + !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(key, &value_space)) { + bad_values++; + } else { + correct++; + } + } + delete iter; + + fprintf(stderr, + "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", + min_expected, max_expected, correct, bad_keys, bad_values, missed); + ASSERT_LE(min_expected, correct); + ASSERT_GE(max_expected, correct); + } + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::vector candidates; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && + type == filetype) { + candidates.push_back(dbname_ + "/" + filenames[i]); + } + } + ASSERT_TRUE(!candidates.empty()) << filetype; + std::string fname = candidates[rnd_.Uniform(candidates.size())]; + + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + const char* msg = strerror(errno); + ASSERT_TRUE(false) << fname << ": " << msg; + } + + if (offset < 0) { + // Relative to end of file; make it absolute + if (-offset > sbuf.st_size) { + offset = 0; + } else { + offset = sbuf.st_size + offset; + } + } + if (offset > sbuf.st_size) { + offset = sbuf.st_size; + } + if (offset + bytes_to_corrupt > sbuf.st_size) { + bytes_to_corrupt 
= sbuf.st_size - offset; + } + + // Do it + std::string contents; + Status s = ReadFileToString(Env::Default(), fname, &contents); + ASSERT_TRUE(s.ok()) << s.ToString(); + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(Env::Default(), contents, fname); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + int Property(const std::string& name) { + std::string property; + int result; + if (db_->GetProperty(name, &property) && + sscanf(property.c_str(), "%d", &result) == 1) { + return result; + } else { + return -1; + } + } + + // Return the ith key + Slice Key(int i, std::string* storage) { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } +}; + +TEST(CorruptionTest, Recovery) { + Build(100); + Check(100, 100); + Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record + Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + Reopen(); + + // The 64 records in the first two log blocks are completely lost. + Check(36, 36); +} + +TEST(CorruptionTest, RecoverWriteError) { + env_.writable_file_error_ = true; + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); +} + +TEST(CorruptionTest, NewFileErrorDuringWrite) { + // Do enough writing to force minor compaction + env_.writable_file_error_ = true; + const int num = 3 + (Options().write_buffer_size / kValueSize); + std::string value_storage; + Status s; + for (int i = 0; s.ok() && i < num; i++) { + WriteBatch batch; + batch.Put("a", Value(100, &value_storage)); + s = db_->Write(WriteOptions(), &batch); + } + ASSERT_TRUE(!s.ok()); + ASSERT_GE(env_.num_writable_file_errors_, 1); + env_.writable_file_error_ = false; + Reopen(); +} + +TEST(CorruptionTest, TableFile) { + Build(100); + DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + dbi->TEST_CompactRange(1, "", "~"); + + Corrupt(kTableFile, 100, 1); + Check(99, 99); +} + +TEST(CorruptionTest, TableFileIndexData) { + Build(10000); // Enough to build multiple Tables + DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + dbi->TEST_CompactRange(1, "", "~"); + + Corrupt(kTableFile, -2000, 500); + Reopen(); + Check(5000, 9999); +} + +TEST(CorruptionTest, MissingDescriptor) { + Build(1000); + RepairDB(); + Reopen(); + Check(1000, 1000); +} + +TEST(CorruptionTest, SequenceNumberRecovery) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v5", v); + // Write something. If sequence number was not recovered properly, + // it will be hidden by an earlier write.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); +} + +TEST(CorruptionTest, CorruptedDescriptor) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + + Corrupt(kDescriptorFile, 0, 1000); + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); + + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("hello", v); +} + +TEST(CorruptionTest, CompactionInputError) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Force compactions by writing lots of values + Build(10000); + Check(10000, 10000); + dbi->TEST_CompactRange(0, "", "~"); + ASSERT_EQ(0, Property("leveldb.num-files-at-level0")); +} + +TEST(CorruptionTest, CompactionInputErrorParanoid) { + Options options; + options.paranoid_checks = true; + options.write_buffer_size = 1048576; + Reopen(&options); + + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Write must eventually fail because of corrupted table + Status s; + std::string tmp1, tmp2; + for (int i = 0; i < 10000 && s.ok(); i++) { + s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); + } + ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; +} + +TEST(CorruptionTest, UnrelatedKeys) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + Corrupt(kTableFile, 100, 1); + + std::string tmp1, tmp2; + ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); + dbi->TEST_CompactMemTable(); + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/db/db_bench.cc b/leveldb/db/db_bench.cc new file mode 100644 index 0000000..d1cbdc0 --- /dev/null +++ b/leveldb/db/db_bench.cc @@ -0,0 +1,613 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
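db_bench, which follows, is driven entirely by --flag=value arguments parsed in its main() at the bottom of the file. Some representative invocations, illustrative only and using no flags beyond those defined below:

// Illustrative invocations; every flag shown is parsed in db_bench's main().
//   ./db_bench                                 // run the default FLAGS_benchmarks list
//   ./db_bench --num=100000 --value_size=4096  // fewer, larger values
//   ./db_bench --benchmarks=fillseq,readrandom --histogram=1
//   ./db_bench --cache_size=1048576 --write_buffer_size=4194304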
+ +#include <sys/types.h> +#include <stdio.h> +#include <stdlib.h> +#include "db/db_impl.h" +#include "db/version_set.h" +#include "leveldb/cache.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/write_batch.h" +#include "port/port.h" +#include "util/crc32c.h" +#include "util/histogram.h" +#include "util/random.h" +#include "util/testutil.h" + +// Comma-separated list of operations to run in the specified order +// Actual benchmarks: +// fillseq -- write N values in sequential key order in async mode +// fillrandom -- write N values in random key order in async mode +// overwrite -- overwrite N values in random key order in async mode +// fillsync -- write N/100 values in random key order in sync mode +// fill100K -- write N/1000 100K values in random order in async mode +// readseq -- read N values sequentially +// readreverse -- read N values in reverse order +// readrandom -- read N values in random order +// crc32c -- repeated crc32c of 4K of data +// Meta operations: +// compact -- Compact the entire DB +// stats -- Print DB stats +// heapprofile -- Dump a heap profile (if supported by this port) +static const char* FLAGS_benchmarks = + "fillseq," + "fillsync," + "fillrandom," + "overwrite," + "readrandom," + "readrandom," // Extra run to allow previous compactions to quiesce + "readseq," + "readreverse," + "compact," + "readrandom," + "readseq," + "readreverse," + "fill100K," + "crc32c," + "snappycomp," + "snappyuncomp," + ; + +// Number of key/values to place in database +static int FLAGS_num = 1000000; + +// Size of each value +static int FLAGS_value_size = 100; + +// Arrange to generate values that shrink to this fraction of +// their original size after compression +static double FLAGS_compression_ratio = 0.5; + +// Print histogram of operation timings +static bool FLAGS_histogram = false; + +// Number of bytes to buffer in memtable before compacting +// (initialized to default value by "main") +static int FLAGS_write_buffer_size = 0; + +// Number of bytes to use as a cache of uncompressed data. +// Negative means use default settings. +static int FLAGS_cache_size = -1; + +namespace leveldb { + +// Helper for quickly generating random data. +namespace { +class RandomGenerator { + private: + std::string data_; + int pos_; + + public: + RandomGenerator() { + // We use a limited amount of data over and over again and ensure + // that it is larger than the compression window (32KB), and also + // large enough to serve all typical value sizes we want to write. + Random rnd(301); + std::string piece; + while (data_.size() < 1048576) { + // Add a short fragment that is as compressible as specified + // by FLAGS_compression_ratio.
+ test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); + data_.append(piece); + } + pos_ = 0; + } + + Slice Generate(int len) { + if (pos_ + len > data_.size()) { + pos_ = 0; + assert(len < data_.size()); + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } +}; + +static Slice TrimSpace(Slice s) { + int start = 0; + while (start < s.size() && isspace(s[start])) { + start++; + } + int limit = s.size(); + while (limit > start && isspace(s[limit-1])) { + limit--; + } + return Slice(s.data() + start, limit - start); +} + +} + +class Benchmark { + private: + Cache* cache_; + DB* db_; + int num_; + int heap_counter_; + double start_; + double last_op_finish_; + int64_t bytes_; + std::string message_; + std::string post_message_; + Histogram hist_; + RandomGenerator gen_; + Random rand_; + + // State kept for progress messages + int done_; + int next_report_; // When to report next + + void PrintHeader() { + const int kKeySize = 16; + PrintEnvironment(); + fprintf(stdout, "Keys: %d bytes each\n", kKeySize); + fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", + FLAGS_value_size, + static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); + fprintf(stdout, "Entries: %d\n", num_); + fprintf(stdout, "RawSize: %.1f MB (estimated)\n", + ((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_) + / 1048576.0)); + fprintf(stdout, "FileSize: %.1f MB (estimated)\n", + (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) + / 1048576.0)); + PrintWarnings(); + fprintf(stdout, "------------------------------------------------\n"); + } + + void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" + ); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + + // See if snappy is working by attempting to compress a compressible string + const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; + std::string compressed; + if (!port::Snappy_Compress(text, sizeof(text), &compressed)) { + fprintf(stdout, "WARNING: Snappy compression is not enabled\n"); + } else if (compressed.size() >= sizeof(text)) { + fprintf(stdout, "WARNING: Snappy compression is not effective\n"); + } + } + + void PrintEnvironment() { + fprintf(stderr, "LevelDB: version %d.%d\n", + kMajorVersion, kMinorVersion); + +#if defined(__linux) + time_t now = time(NULL); + fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline + + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != NULL) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != NULL) { + const char* sep = strchr(line, ':'); + if (sep == NULL) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#endif + } + + void Start() { + start_ = Env::Default()->NowMicros() * 1e-6; + bytes_ = 0; + message_.clear(); + last_op_finish_ = start_; + hist_.Clear(); + done_ = 0; + next_report_ = 100; + } + + void FinishedSingleOp() { + if (FLAGS_histogram) { + double now = Env::Default()->NowMicros() * 1e-6; + double
micros = (now - last_op_finish_) * 1e6; + hist_.Add(micros); + if (micros > 20000) { + fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); + fflush(stderr); + } + last_op_finish_ = now; + } + + done_++; + if (done_ >= next_report_) { + if (next_report_ < 1000) next_report_ += 100; + else if (next_report_ < 5000) next_report_ += 500; + else if (next_report_ < 10000) next_report_ += 1000; + else if (next_report_ < 50000) next_report_ += 5000; + else if (next_report_ < 100000) next_report_ += 10000; + else if (next_report_ < 500000) next_report_ += 50000; + else next_report_ += 100000; + fprintf(stderr, "... finished %d ops%30s\r", done_, ""); + fflush(stderr); + } + } + + void Stop(const Slice& name) { + double finish = Env::Default()->NowMicros() * 1e-6; + + // Pretend at least one op was done in case we are running a benchmark + // that does not call FinishedSingleOp(). + if (done_ < 1) done_ = 1; + + if (bytes_ > 0) { + char rate[100]; + snprintf(rate, sizeof(rate), "%6.1f MB/s", + (bytes_ / 1048576.0) / (finish - start_)); + if (!message_.empty()) { + message_ = std::string(rate) + " " + message_; + } else { + message_ = rate; + } + } + + fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", + name.ToString().c_str(), + (finish - start_) * 1e6 / done_, + (message_.empty() ? "" : " "), + message_.c_str()); + if (FLAGS_histogram) { + fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); + } + fflush(stdout); + + if (!post_message_.empty()) { + fprintf(stdout, "\n%s\n", post_message_.c_str()); + post_message_.clear(); + } + } + + public: + enum Order { + SEQUENTIAL, + RANDOM + }; + enum DBState { + FRESH, + EXISTING + }; + + Benchmark() + : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), + db_(NULL), + num_(FLAGS_num), + heap_counter_(0), + bytes_(0), + rand_(301) { + std::vector<std::string> files; + Env::Default()->GetChildren("/tmp/dbbench", &files); + for (int i = 0; i < files.size(); i++) { + if (Slice(files[i]).starts_with("heap-")) { + Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]); + } + } + DestroyDB("/tmp/dbbench", Options()); + } + + ~Benchmark() { + delete db_; + delete cache_; + } + + void Run() { + PrintHeader(); + Open(); + + const char* benchmarks = FLAGS_benchmarks; + while (benchmarks != NULL) { + const char* sep = strchr(benchmarks, ','); + Slice name; + if (sep == NULL) { + name = benchmarks; + benchmarks = NULL; + } else { + name = Slice(benchmarks, sep - benchmarks); + benchmarks = sep + 1; + } + + Start(); + + WriteOptions write_options; + bool known = true; + if (name == Slice("fillseq")) { + Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1); + } else if (name == Slice("fillbatch")) { + Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000); + } else if (name == Slice("fillrandom")) { + Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size, 1); + } else if (name == Slice("overwrite")) { + Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1); + } else if (name == Slice("fillsync")) { + write_options.sync = true; + Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1); + } else if (name == Slice("fill100K")) { + Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1); + } else if (name == Slice("readseq")) { + ReadSequential(); + } else if (name == Slice("readreverse")) { + ReadReverse(); + } else if (name == Slice("readrandom")) { + ReadRandom(); + } else if (name == Slice("readrandomsmall")) { + int n = num_; + num_ /= 1000; + ReadRandom(); + num_ = n; + }
else if (name == Slice("compact")) { + Compact(); + } else if (name == Slice("crc32c")) { + Crc32c(4096, "(4K per op)"); + } else if (name == Slice("snappycomp")) { + SnappyCompress(); + } else if (name == Slice("snappyuncomp")) { + SnappyUncompress(); + } else if (name == Slice("heapprofile")) { + HeapProfile(); + } else if (name == Slice("stats")) { + PrintStats(); + } else { + known = false; + if (name != Slice()) { // No error message for empty name + fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); + } + } + if (known) { + Stop(name); + } + } + } + + private: + void Crc32c(int size, const char* label) { + // Checksum about 500MB of data total + std::string data(size, 'x'); + int64_t bytes = 0; + uint32_t crc = 0; + while (bytes < 500 * 1048576) { + crc = crc32c::Value(data.data(), size); + FinishedSingleOp(); + bytes += size; + } + // Print so result is not dead + fprintf(stderr, "... crc=0x%x\r", static_cast(crc)); + + bytes_ = bytes; + message_ = label; + } + + void SnappyCompress() { + Slice input = gen_.Generate(Options().block_size); + int64_t bytes = 0; + int64_t produced = 0; + bool ok = true; + std::string compressed; + while (ok && bytes < 1024 * 1048576) { // Compress 1G + ok = port::Snappy_Compress(input.data(), input.size(), &compressed); + produced += compressed.size(); + bytes += input.size(); + FinishedSingleOp(); + } + + if (!ok) { + message_ = "(snappy failure)"; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "(output: %.1f%%)", + (produced * 100.0) / bytes); + message_ = buf; + bytes_ = bytes; + } + } + + void SnappyUncompress() { + Slice input = gen_.Generate(Options().block_size); + std::string compressed; + bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed); + int64_t bytes = 0; + std::string uncompressed; + while (ok && bytes < 1024 * 1048576) { // Compress 1G + ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), + &uncompressed); + bytes += uncompressed.size(); + FinishedSingleOp(); + } + + if (!ok) { + message_ = "(snappy failure)"; + } else { + bytes_ = bytes; + } + } + + void Open() { + assert(db_ == NULL); + Options options; + options.create_if_missing = true; + options.block_cache = cache_; + options.write_buffer_size = FLAGS_write_buffer_size; + Status s = DB::Open(options, "/tmp/dbbench", &db_); + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + void Write(const WriteOptions& options, Order order, DBState state, + int num_entries, int value_size, int entries_per_batch) { + if (state == FRESH) { + delete db_; + db_ = NULL; + DestroyDB("/tmp/dbbench", Options()); + Open(); + Start(); // Do not count time taken to destroy/open + } + + if (num_entries != num_) { + char msg[100]; + snprintf(msg, sizeof(msg), "(%d ops)", num_entries); + message_ = msg; + } + + WriteBatch batch; + Status s; + std::string val; + for (int i = 0; i < num_entries; i += entries_per_batch) { + batch.Clear(); + for (int j = 0; j < entries_per_batch; j++) { + const int k = (order == SEQUENTIAL) ? 
i+j : (rand_.Next() % FLAGS_num); + char key[100]; + snprintf(key, sizeof(key), "%016d", k); + batch.Put(key, gen_.Generate(value_size)); + bytes_ += value_size + strlen(key); + FinishedSingleOp(); + } + s = db_->Write(options, &batch); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + } + } + + void ReadSequential() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadReverse() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadRandom() { + ReadOptions options; + std::string value; + for (int i = 0; i < num_; i++) { + char key[100]; + const int k = rand_.Next() % FLAGS_num; + snprintf(key, sizeof(key), "%016d", k); + db_->Get(options, key, &value); + FinishedSingleOp(); + } + } + + void Compact() { + DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); + dbi->TEST_CompactMemTable(); + int max_level_with_files = 1; + for (int level = 1; level < config::kNumLevels; level++) { + std::string property; + char name[100]; + snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); + if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) { + max_level_with_files = level; + } + } + for (int level = 0; level < max_level_with_files; level++) { + dbi->TEST_CompactRange(level, "", "~"); + } + } + + void PrintStats() { + std::string stats; + if (!db_->GetProperty("leveldb.stats", &stats)) { + message_ = "(failed)"; + } else { + post_message_ = stats; + } + } + + static void WriteToFile(void* arg, const char* buf, int n) { + reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n)); + } + + void HeapProfile() { + char fname[100]; + snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_); + WritableFile* file; + Status s = Env::Default()->NewWritableFile(fname, &file); + if (!s.ok()) { + message_ = s.ToString(); + return; + } + bool ok = port::GetHeapProfile(WriteToFile, file); + delete file; + if (!ok) { + message_ = "not supported"; + Env::Default()->DeleteFile(fname); + } + } +}; + +} + +int main(int argc, char** argv) { + FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; + for (int i = 1; i < argc; i++) { + double d; + int n; + char junk; + if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { + FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); + } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { + FLAGS_compression_ratio = d; + } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_histogram = n; + } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { + FLAGS_num = n; + } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { + FLAGS_value_size = n; + } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { + FLAGS_write_buffer_size = n; + } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { + FLAGS_cache_size = n; + } else { + fprintf(stderr, "Invalid flag '%s'\n", argv[i]); + exit(1); + } + } + + leveldb::Benchmark benchmark; + benchmark.Run(); + return 0; +} diff --git a/leveldb/db/db_impl.cc b/leveldb/db/db_impl.cc new file mode 100644 index 0000000..3b9e04e --- /dev/null +++ b/leveldb/db/db_impl.cc
@@ -0,0 +1,1188 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl.h" + +#include <algorithm> +#include <set> +#include <string> +#include <stdint.h> +#include <stdio.h> +#include <vector> +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/status.h" +#include "leveldb/table.h" +#include "leveldb/table_builder.h" +#include "port/port.h" +#include "table/block.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/mutexlock.h" + +namespace leveldb { + +struct DBImpl::CompactionState { + Compaction* const compaction; + + // Sequence numbers < smallest_snapshot are not significant since we + // will never have to service a snapshot below smallest_snapshot. + // Therefore if we have seen a sequence number S <= smallest_snapshot, + // we can drop all entries for the same key with sequence numbers < S. + SequenceNumber smallest_snapshot; + + // Files produced by compaction + struct Output { + uint64_t number; + uint64_t file_size; + InternalKey smallest, largest; + }; + std::vector<Output> outputs; + + // State kept for output being generated + WritableFile* outfile; + TableBuilder* builder; + + uint64_t total_bytes; + + Output* current_output() { return &outputs[outputs.size()-1]; } + + explicit CompactionState(Compaction* c) + : compaction(c), + outfile(NULL), + builder(NULL), + total_bytes(0) { + } +}; + +namespace { +class NullWritableFile : public WritableFile { + public: + virtual Status Append(const Slice& data) { return Status::OK(); } + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } +}; +} + +// Fix user-supplied options to be reasonable +template <class T, class V> +static void ClipToRange(T* ptr, V minvalue, V maxvalue) { + if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue; + if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue; +} +Options SanitizeOptions(const std::string& dbname, + const InternalKeyComparator* icmp, + const Options& src) { + Options result = src; + result.comparator = icmp; + ClipToRange(&result.max_open_files, 20, 50000); + ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); + ClipToRange(&result.block_size, 1<<10, 4<<20); + if (result.info_log == NULL) { + // Open a log file in the same directory as the db + src.env->CreateDir(dbname); // In case it does not exist + src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname)); + Status s = src.env->NewWritableFile(InfoLogFileName(dbname), + &result.info_log); + if (!s.ok()) { + // No place suitable for logging + result.info_log = new NullWritableFile; + } + } + if (result.block_cache == NULL) { + result.block_cache = NewLRUCache(8 << 20); + } + return result; +} + +DBImpl::DBImpl(const Options& options, const std::string& dbname) + : env_(options.env), + internal_comparator_(options.comparator), + options_(SanitizeOptions(dbname, &internal_comparator_, options)), + owns_info_log_(options_.info_log != options.info_log), + owns_cache_(options_.block_cache != options.block_cache), + dbname_(dbname), + db_lock_(NULL),
shutting_down_(NULL), + bg_cv_(&mutex_), + compacting_cv_(&mutex_), + mem_(new MemTable(internal_comparator_)), + imm_(NULL), + logfile_(NULL), + log_(NULL), + bg_compaction_scheduled_(false), + compacting_(false) { + has_imm_.Release_Store(NULL); + + // Reserve ten files or so for other uses and give the rest to TableCache. + const int table_cache_size = options.max_open_files - 10; + table_cache_ = new TableCache(dbname_, &options_, table_cache_size); + + versions_ = new VersionSet(dbname_, &options_, table_cache_, + &internal_comparator_); +} + +DBImpl::~DBImpl() { + // Wait for background work to finish + mutex_.Lock(); + shutting_down_.Release_Store(this); // Any non-NULL value is ok + if (bg_compaction_scheduled_) { + while (bg_compaction_scheduled_) { + bg_cv_.Wait(); + } + } + mutex_.Unlock(); + + if (db_lock_ != NULL) { + env_->UnlockFile(db_lock_); + } + + delete versions_; + delete mem_; + delete imm_; + delete log_; + delete logfile_; + delete table_cache_; + + if (owns_info_log_) { + delete options_.info_log; + } + if (owns_cache_) { + delete options_.block_cache; + } +} + +Status DBImpl::NewDB() { + VersionEdit new_db; + new_db.SetComparatorName(user_comparator()->Name()); + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + WritableFile* file; + Status s = env_->NewWritableFile(manifest, &file); + if (!s.ok()) { + return s; + } + { + log::Writer log(file); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + if (s.ok()) { + s = file->Close(); + } + } + delete file; + if (s.ok()) { + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(env_, dbname_, 1); + } else { + env_->DeleteFile(manifest); + } + return s; +} + +void DBImpl::MaybeIgnoreError(Status* s) const { + if (s->ok() || options_.paranoid_checks) { + // No change needed + } else { + Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str()); + *s = Status::OK(); + } +} + +void DBImpl::DeleteObsoleteFiles() { + // Make a set of all of the live files + std::set<uint64_t> live = pending_outputs_; + versions_->AddLiveFiles(&live); + + std::vector<std::string> filenames; + env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + bool keep = true; + switch (type) { + case kLogFile: + keep = ((number == versions_->LogNumber()) || + (number == versions_->PrevLogNumber())); + break; + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (in case there is a race that allows other incarnations) + keep = (number >= versions_->ManifestFileNumber()); + break; + case kTableFile: + keep = (live.find(number) != live.end()); + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live" + keep = (live.find(number) != live.end()); + break; + case kCurrentFile: + case kDBLockFile: + case kInfoLogFile: + keep = true; + break; + } + + if (!keep) { + if (type == kTableFile) { + table_cache_->Evict(number); + } + Log(env_, options_.info_log, "Delete type=%d #%lld\n", + int(type), + static_cast<unsigned long long>(number)); + env_->DeleteFile(dbname_ + "/" + filenames[i]); + } + } + } +} + +Status DBImpl::Recover(VersionEdit* edit) { + mutex_.AssertHeld(); + + // Ignore error from CreateDir since the creation of the DB is + // committed only when
the descriptor is created, and this directory + // may already exist from a previous failed creation attempt. + env_->CreateDir(dbname_); + assert(db_lock_ == NULL); + Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!s.ok()) { + return s; + } + + if (!env_->FileExists(CurrentFileName(dbname_))) { + if (options_.create_if_missing) { + s = NewDB(); + if (!s.ok()) { + return s; + } + } else { + return Status::InvalidArgument( + dbname_, "does not exist (create_if_missing is false)"); + } + } else { + if (options_.error_if_exists) { + return Status::InvalidArgument( + dbname_, "exists (error_if_exists is true)"); + } + } + + s = versions_->Recover(); + if (s.ok()) { + // Recover from the log files named in the descriptor + SequenceNumber max_sequence(0); + if (versions_->PrevLogNumber() != 0) { // log#==0 means no prev log + s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence); + } + if (s.ok() && versions_->LogNumber() != 0) { // log#==0 for initial state + s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence); + } + if (s.ok()) { + if (versions_->LastSequence() < max_sequence) { + versions_->SetLastSequence(max_sequence); + } + } + } + + return s; +} + +Status DBImpl::RecoverLogFile(uint64_t log_number, + VersionEdit* edit, + SequenceNumber* max_sequence) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + WritableFile* info_log; + const char* fname; + Status* status; // NULL if options_.paranoid_checks==false + virtual void Corruption(size_t bytes, const Status& s) { + Log(env, info_log, "%s%s: dropping %d bytes; %s", + (this->status == NULL ? "(ignoring error) " : ""), + fname, static_cast<int>(bytes), s.ToString().c_str()); + if (this->status != NULL && this->status->ok()) *this->status = s; + } + }; + + mutex_.AssertHeld(); + + // Open the log file + std::string fname = LogFileName(dbname_, log_number); + SequentialFile* file; + Status status = env_->NewSequentialFile(fname, &file); + if (!status.ok()) { + MaybeIgnoreError(&status); + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log; + reporter.fname = fname.c_str(); + reporter.status = (options_.paranoid_checks ? &status : NULL); + // We intentionally make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers).
+ log::Reader reader(file, &reporter, true/*checksum*/); + Log(env_, options_.info_log, "Recovering log #%llu", + (unsigned long long) log_number); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + MemTable* mem = NULL; + while (reader.ReadRecord(&record, &scratch) && + status.ok()) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + + if (mem == NULL) { + mem = new MemTable(internal_comparator_); + } + status = WriteBatchInternal::InsertInto(&batch, mem); + MaybeIgnoreError(&status); + if (!status.ok()) { + break; + } + const SequenceNumber last_seq = + WriteBatchInternal::Sequence(&batch) + + WriteBatchInternal::Count(&batch) - 1; + if (last_seq > *max_sequence) { + *max_sequence = last_seq; + } + + if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { + status = WriteLevel0Table(mem, edit); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + break; + } + delete mem; + mem = NULL; + } + } + + if (status.ok() && mem != NULL) { + status = WriteLevel0Table(mem, edit); + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + } + + delete mem; + delete file; + return status; +} + +Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { + mutex_.AssertHeld(); + const uint64_t start_micros = env_->NowMicros(); + FileMetaData meta; + meta.number = versions_->NewFileNumber(); + pending_outputs_.insert(meta.number); + Iterator* iter = mem->NewIterator(); + Log(env_, options_.info_log, "Level-0 table #%llu: started", + (unsigned long long) meta.number); + + Status s; + { + mutex_.Unlock(); + s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit); + mutex_.Lock(); + } + + Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s", + (unsigned long long) meta.number, + (unsigned long long) meta.file_size, + s.ToString().c_str()); + delete iter; + pending_outputs_.erase(meta.number); + + CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros; + stats.bytes_written = meta.file_size; + stats_[0].Add(stats); + return s; +} + +Status DBImpl::CompactMemTable() { + mutex_.AssertHeld(); + assert(imm_ != NULL); + assert(compacting_); + + // Save the contents of the memtable as a new Table + VersionEdit edit; + Status s = WriteLevel0Table(imm_, &edit); + + // Replace immutable memtable with the generated Table + if (s.ok()) { + edit.SetPrevLogNumber(0); + s = versions_->LogAndApply(&edit, imm_); + } + + if (s.ok()) { + // Commit to the new state + imm_ = NULL; + has_imm_.Release_Store(NULL); + DeleteObsoleteFiles(); + } + + compacting_cv_.SignalAll(); // Wake up waiter even if there was an error + return s; +} + +void DBImpl::TEST_CompactRange( + int level, + const std::string& begin, + const std::string& end) { + MutexLock l(&mutex_); + while (compacting_) { + compacting_cv_.Wait(); + } + Compaction* c = versions_->CompactRange( + level, + InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek), + InternalKey(end, 0, static_cast<ValueType>(0))); + + if (c != NULL) { + CompactionState* compact = new CompactionState(c); + DoCompactionWork(compact); // Ignore error in test compaction + CleanupCompaction(compact); + } + + // Start any background compaction that may have been delayed by this thread + MaybeScheduleCompaction(); +} +
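One detail worth spelling out in TEST_CompactRange above: the begin/end InternalKeys are built so the range is inclusive at both user keys. A sketch of the ordering defined in db/dbformat.h:

// Internal keys sort by user key (ascending), then sequence number
// (descending), then value type (descending).  Consequently, for a
// user key k:
//   InternalKey(k, kMaxSequenceNumber, kValueTypeForSeek)  // earliest entry for k
//   InternalKey(k, 0, static_cast<ValueType>(0))           // latest entry for k
// so the pair above brackets every entry whose user key falls in
// [begin, end].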
+Status DBImpl::TEST_CompactMemTable() {
+  MutexLock l(&mutex_);
+  Status s = MakeRoomForWrite(true /* force compaction */);
+  if (s.ok()) {
+    // Wait until the compaction completes
+    while (imm_ != NULL && bg_error_.ok()) {
+      compacting_cv_.Wait();
+    }
+    if (imm_ != NULL) {
+      s = bg_error_;
+    }
+  }
+  return s;
+}
+
+void DBImpl::MaybeScheduleCompaction() {
+  mutex_.AssertHeld();
+  if (bg_compaction_scheduled_) {
+    // Already scheduled
+  } else if (compacting_) {
+    // Some other thread is running a compaction.  Do not conflict with it.
+  } else if (shutting_down_.Acquire_Load()) {
+    // DB is being deleted; no more background compactions
+  } else if (imm_ == NULL && !versions_->NeedsCompaction()) {
+    // No work to be done
+  } else {
+    bg_compaction_scheduled_ = true;
+    env_->Schedule(&DBImpl::BGWork, this);
+  }
+}
+
+void DBImpl::BGWork(void* db) {
+  reinterpret_cast<DBImpl*>(db)->BackgroundCall();
+}
+
+void DBImpl::BackgroundCall() {
+  MutexLock l(&mutex_);
+  assert(bg_compaction_scheduled_);
+  if (!shutting_down_.Acquire_Load() &&
+      !compacting_) {
+    BackgroundCompaction();
+  }
+  bg_compaction_scheduled_ = false;
+  bg_cv_.SignalAll();
+
+  // Previous compaction may have produced too many files in a level,
+  // so reschedule another compaction if needed.
+  MaybeScheduleCompaction();
+}
+
+void DBImpl::BackgroundCompaction() {
+  mutex_.AssertHeld();
+  assert(!compacting_);
+
+  if (imm_ != NULL) {
+    compacting_ = true;
+    CompactMemTable();
+    compacting_ = false;
+    compacting_cv_.SignalAll();
+    return;
+  }
+
+  Compaction* c = versions_->PickCompaction();
+  if (c == NULL) {
+    // Nothing to do
+    return;
+  }
+
+  Status status;
+  if (c->IsTrivialMove()) {
+    // Move file to next level
+    assert(c->num_input_files(0) == 1);
+    FileMetaData* f = c->input(0, 0);
+    c->edit()->DeleteFile(c->level(), f->number);
+    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
+                       f->smallest, f->largest);
+    status = versions_->LogAndApply(c->edit(), NULL);
+    Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n",
+        static_cast<unsigned long long>(f->number),
+        c->level() + 1,
+        static_cast<unsigned long long>(f->file_size),
+        status.ToString().c_str());
+  } else {
+    CompactionState* compact = new CompactionState(c);
+    status = DoCompactionWork(compact);
+    CleanupCompaction(compact);
+  }
+  delete c;
+
+  if (status.ok()) {
+    // Done
+  } else if (shutting_down_.Acquire_Load()) {
+    // Ignore compaction errors found during shutting down
+  } else {
+    Log(env_, options_.info_log,
+        "Compaction error: %s", status.ToString().c_str());
+    if (options_.paranoid_checks && bg_error_.ok()) {
+      bg_error_ = status;
+    }
+  }
+}
+
+void DBImpl::CleanupCompaction(CompactionState* compact) {
+  mutex_.AssertHeld();
+  if (compact->builder != NULL) {
+    // May happen if we get a shutdown call in the middle of compaction
+    compact->builder->Abandon();
+    delete compact->builder;
+  } else {
+    assert(compact->outfile == NULL);
+  }
+  delete compact->outfile;
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    pending_outputs_.erase(out.number);
+  }
+  delete compact;
+}
+
+Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+  assert(compact != NULL);
+  assert(compact->builder == NULL);
+  uint64_t file_number;
+  {
+    mutex_.Lock();
+    file_number = versions_->NewFileNumber();
+    pending_outputs_.insert(file_number);
+    CompactionState::Output out;
+    out.number = file_number;
+    out.smallest.Clear();
+    out.largest.Clear();
+    compact->outputs.push_back(out);
+    mutex_.Unlock();
+  }
+
+  // Make the output file
+  std::string fname = TableFileName(dbname_, file_number);
+  Status s = env_->NewWritableFile(fname, &compact->outfile);
+  if (s.ok()) {
+    compact->builder = new TableBuilder(options_, compact->outfile);
+  }
+  return s;
+}
+
+Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
+                                          Iterator* input) {
+  assert(compact != NULL);
+  assert(compact->outfile != NULL);
+  assert(compact->builder != NULL);
+
+  const uint64_t output_number = compact->current_output()->number;
+  assert(output_number != 0);
+
+  // Check for iterator errors
+  Status s = input->status();
+  const uint64_t current_entries = compact->builder->NumEntries();
+  if (s.ok()) {
+    s = compact->builder->Finish();
+  } else {
+    compact->builder->Abandon();
+  }
+  const uint64_t current_bytes = compact->builder->FileSize();
+  compact->current_output()->file_size = current_bytes;
+  compact->total_bytes += current_bytes;
+  delete compact->builder;
+  compact->builder = NULL;
+
+  // Finish and check for file errors
+  if (s.ok()) {
+    s = compact->outfile->Sync();
+  }
+  if (s.ok()) {
+    s = compact->outfile->Close();
+  }
+  delete compact->outfile;
+  compact->outfile = NULL;
+
+  if (s.ok() && current_entries > 0) {
+    // Verify that the table is usable
+    Iterator* iter = table_cache_->NewIterator(ReadOptions(),
+                                               output_number,
+                                               current_bytes);
+    s = iter->status();
+    delete iter;
+    if (s.ok()) {
+      Log(env_, options_.info_log,
+          "Generated table #%llu: %lld keys, %lld bytes",
+          (unsigned long long) output_number,
+          (unsigned long long) current_entries,
+          (unsigned long long) current_bytes);
+    }
+  }
+  return s;
+}
+
+
+Status DBImpl::InstallCompactionResults(CompactionState* compact) {
+  mutex_.AssertHeld();
+  Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->level() + 1,
+      static_cast<long long>(compact->total_bytes));
+
+  // Add compaction outputs
+  compact->compaction->AddInputDeletions(compact->compaction->edit());
+  const int level = compact->compaction->level();
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    compact->compaction->edit()->AddFile(
+        level + 1,
+        out.number, out.file_size, out.smallest, out.largest);
+    pending_outputs_.erase(out.number);
+  }
+  compact->outputs.clear();
+
+  Status s = versions_->LogAndApply(compact->compaction->edit(), NULL);
+  if (s.ok()) {
+    compact->compaction->ReleaseInputs();
+    DeleteObsoleteFiles();
+  } else {
+    // Discard any files we may have created during this failed compaction
+    for (size_t i = 0; i < compact->outputs.size(); i++) {
+      env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number));
+    }
+  }
+  return s;
+}
+
+Status DBImpl::DoCompactionWork(CompactionState* compact) {
+  const uint64_t start_micros = env_->NowMicros();
+  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
+
+  Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->level() + 1);
+
+  assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
+  assert(compact->builder == NULL);
+  assert(compact->outfile == NULL);
+  if (snapshots_.empty()) {
+    compact->smallest_snapshot = versions_->LastSequence();
+  } else {
+    compact->smallest_snapshot = snapshots_.oldest()->number_;
+  }
+
+  // Release mutex while we're actually doing the compaction work
+  compacting_ = true;
+  mutex_.Unlock();
+
+  Iterator* input = versions_->MakeInputIterator(compact->compaction);
+  input->SeekToFirst();
+  Status status;
+  ParsedInternalKey ikey;
+  std::string current_user_key;
+  bool has_current_user_key = false;
+  SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
+  for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
+    // Prioritize immutable compaction work
+    if (has_imm_.NoBarrier_Load() != NULL) {
+      const uint64_t imm_start = env_->NowMicros();
+      mutex_.Lock();
+      if (imm_ != NULL) {
+        CompactMemTable();
+        compacting_cv_.SignalAll();  // Wake up MakeRoomForWrite() if necessary
+      }
+      mutex_.Unlock();
+      imm_micros += (env_->NowMicros() - imm_start);
+    }
+
+    Slice key = input->key();
+    InternalKey tmp_internal_key;
+    tmp_internal_key.DecodeFrom(key);
+    if (compact->compaction->ShouldStopBefore(tmp_internal_key) &&
+        compact->builder != NULL) {
+      status = FinishCompactionOutputFile(compact, input);
+      if (!status.ok()) {
+        break;
+      }
+    }
+
+    // Handle key/value, add to state, etc.
+    bool drop = false;
+    if (!ParseInternalKey(key, &ikey)) {
+      // Do not hide error keys
+      current_user_key.clear();
+      has_current_user_key = false;
+      last_sequence_for_key = kMaxSequenceNumber;
+    } else {
+      if (!has_current_user_key ||
+          user_comparator()->Compare(ikey.user_key,
+                                     Slice(current_user_key)) != 0) {
+        // First occurrence of this user key
+        current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
+        has_current_user_key = true;
+        last_sequence_for_key = kMaxSequenceNumber;
+      }
+
+      if (last_sequence_for_key <= compact->smallest_snapshot) {
+        // Hidden by a newer entry for same user key
+        drop = true;    // (A)
+      } else if (ikey.type == kTypeDeletion &&
+                 ikey.sequence <= compact->smallest_snapshot &&
+                 compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
+        // For this user key:
+        // (1) there is no data in higher levels
+        // (2) data in lower levels will have larger sequence numbers
+        // (3) data in layers that are being compacted here and have
+        //     smaller sequence numbers will be dropped in the next
+        //     few iterations of this loop (by rule (A) above).
+        // Therefore this deletion marker is obsolete and can be dropped.
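+        // Example: with smallest_snapshot == 100 and entries for user key
+        // "foo" arriving newest-first as (foo, seq=90, DEL) then
+        // (foo, seq=80, VALUE), the deletion marker is dropped by this
+        // branch, and the seq=80 value is dropped on the next iteration
+        // by rule (A), since last_sequence_for_key becomes 90 <= 100.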
+        drop = true;
+      }
+
+      last_sequence_for_key = ikey.sequence;
+    }
+#if 0
+    Log(env_, options_.info_log,
+        "  Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
+        "%d smallest_snapshot: %d",
+        ikey.user_key.ToString().c_str(),
+        (int)ikey.sequence, ikey.type, kTypeValue, drop,
+        compact->compaction->IsBaseLevelForKey(ikey.user_key),
+        (int)last_sequence_for_key, (int)compact->smallest_snapshot);
+#endif
+
+    if (!drop) {
+      // Open output file if necessary
+      if (compact->builder == NULL) {
+        status = OpenCompactionOutputFile(compact);
+        if (!status.ok()) {
+          break;
+        }
+      }
+      if (compact->builder->NumEntries() == 0) {
+        compact->current_output()->smallest.DecodeFrom(key);
+      }
+      compact->current_output()->largest.DecodeFrom(key);
+      compact->builder->Add(key, input->value());
+
+      // Close output file if it is big enough
+      if (compact->builder->FileSize() >=
+          compact->compaction->MaxOutputFileSize()) {
+        status = FinishCompactionOutputFile(compact, input);
+        if (!status.ok()) {
+          break;
+        }
+      }
+    }
+
+    input->Next();
+  }
+
+  if (status.ok() && shutting_down_.Acquire_Load()) {
+    status = Status::IOError("Deleting DB during compaction");
+  }
+  if (status.ok() && compact->builder != NULL) {
+    status = FinishCompactionOutputFile(compact, input);
+  }
+  if (status.ok()) {
+    status = input->status();
+  }
+  delete input;
+  input = NULL;
+
+  CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros - imm_micros;
+  for (int which = 0; which < 2; which++) {
+    for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
+      stats.bytes_read += compact->compaction->input(which, i)->file_size;
+    }
+  }
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    stats.bytes_written += compact->outputs[i].file_size;
+  }
+
+  mutex_.Lock();
+  stats_[compact->compaction->level() + 1].Add(stats);
+
+  if (status.ok()) {
+    status = InstallCompactionResults(compact);
+  }
+  compacting_ = false;
+  compacting_cv_.SignalAll();
+  return status;
+}
+
+Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+                                      SequenceNumber* latest_snapshot) {
+  mutex_.Lock();
+  *latest_snapshot = versions_->LastSequence();
+
+  // Collect together all needed child iterators
+  std::vector<Iterator*> list;
+  list.push_back(mem_->NewIterator());
+  if (imm_ != NULL) {
+    list.push_back(imm_->NewIterator());
+  }
+  versions_->current()->AddIterators(options, &list);
+  Iterator* internal_iter =
+      NewMergingIterator(&internal_comparator_, &list[0], list.size());
+  versions_->current()->Ref();
+  internal_iter->RegisterCleanup(&DBImpl::Unref, this, versions_->current());
+
+  mutex_.Unlock();
+  return internal_iter;
+}
+
+Iterator* DBImpl::TEST_NewInternalIterator() {
+  SequenceNumber ignored;
+  return NewInternalIterator(ReadOptions(), &ignored);
+}
+
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
+  MutexLock l(&mutex_);
+  return versions_->MaxNextLevelOverlappingBytes();
+}
+
+Status DBImpl::Get(const ReadOptions& options,
+                   const Slice& key,
+                   std::string* value) {
+  // TODO(opt): faster implementation
+  Iterator* iter = NewIterator(options);
+  iter->Seek(key);
+  bool found = false;
+  if (iter->Valid() && user_comparator()->Compare(key, iter->key()) == 0) {
+    Slice v = iter->value();
+    value->assign(v.data(), v.size());
+    found = true;
+  }
+  // Non-OK iterator status trumps everything else
+  Status result = iter->status();
+  if (result.ok() && !found) {
+    result = Status::NotFound(Slice());  // Use an empty error message for speed
+  }
+  delete iter;
+  return result;
+}
+
+Iterator* DBImpl::NewIterator(const ReadOptions& options) {
+  SequenceNumber latest_snapshot;
+  Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
+  SequenceNumber sequence =
+      (options.snapshot ? options.snapshot->number_ : latest_snapshot);
+  return NewDBIterator(&dbname_, env_,
+                       user_comparator(), internal_iter, sequence);
+}
+
+void DBImpl::Unref(void* arg1, void* arg2) {
+  DBImpl* impl = reinterpret_cast<DBImpl*>(arg1);
+  Version* v = reinterpret_cast<Version*>(arg2);
+  MutexLock l(&impl->mutex_);
+  v->Unref();
+}
+
+const Snapshot* DBImpl::GetSnapshot() {
+  MutexLock l(&mutex_);
+  return snapshots_.New(versions_->LastSequence());
+}
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  MutexLock l(&mutex_);
+  snapshots_.Delete(s);
+}
+
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+  return DB::Put(o, key, val);
+}
+
+Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+  return DB::Delete(options, key);
+}
+
+Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
+  Status status;
+  MutexLock l(&mutex_);
+  status = MakeRoomForWrite(false);  // May temporarily release lock and wait
+  uint64_t last_sequence = versions_->LastSequence();
+  if (status.ok()) {
+    WriteBatchInternal::SetSequence(updates, last_sequence + 1);
+    last_sequence += WriteBatchInternal::Count(updates);
+    versions_->SetLastSequence(last_sequence);
+
+    // Add to log and apply to memtable
+    status = log_->AddRecord(WriteBatchInternal::Contents(updates));
+    if (status.ok() && options.sync) {
+      status = logfile_->Sync();
+    }
+    if (status.ok()) {
+      status = WriteBatchInternal::InsertInto(updates, mem_);
+    }
+  }
+  if (options.post_write_snapshot != NULL) {
+    *options.post_write_snapshot =
+        status.ok() ? snapshots_.New(last_sequence) : NULL;
+  }
+  return status;
+}
+
+Status DBImpl::MakeRoomForWrite(bool force) {
+  mutex_.AssertHeld();
+  Status s;
+  while (true) {
+    if (!bg_error_.ok()) {
+      // Yield previous error
+      s = bg_error_;
+      break;
+    } else if (!force &&
+               (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
+      // There is room in current memtable
+      break;
+    } else if (imm_ != NULL) {
+      // We have filled up the current memtable, but the previous
+      // one is still being compacted, so we wait.
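+      // (CompactMemTable() signals compacting_cv_ once imm_ has been
+      // written out, and DoCompactionWork() services imm_ even in the
+      // middle of a big compaction, so this wait is bounded by roughly
+      // one memtable flush.)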
+      compacting_cv_.Wait();
+    } else {
+      // Attempt to switch to a new memtable and trigger compaction of old
+      assert(versions_->PrevLogNumber() == 0);
+      uint64_t new_log_number = versions_->NewFileNumber();
+      WritableFile* lfile = NULL;
+      s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
+      if (!s.ok()) {
+        break;
+      }
+      VersionEdit edit;
+      edit.SetPrevLogNumber(versions_->LogNumber());
+      edit.SetLogNumber(new_log_number);
+      s = versions_->LogAndApply(&edit, NULL);
+      if (!s.ok()) {
+        delete lfile;
+        env_->DeleteFile(LogFileName(dbname_, new_log_number));
+        break;
+      }
+      delete log_;
+      delete logfile_;
+      logfile_ = lfile;
+      log_ = new log::Writer(lfile);
+      imm_ = mem_;
+      has_imm_.Release_Store(imm_);
+      mem_ = new MemTable(internal_comparator_);
+      force = false;   // Do not force another compaction if we have room
+      MaybeScheduleCompaction();
+    }
+  }
+  return s;
+}
+
+bool DBImpl::GetProperty(const Slice& property, std::string* value) {
+  value->clear();
+
+  MutexLock l(&mutex_);
+  Slice in = property;
+  Slice prefix("leveldb.");
+  if (!in.starts_with(prefix)) return false;
+  in.remove_prefix(prefix.size());
+
+  if (in.starts_with("num-files-at-level")) {
+    in.remove_prefix(strlen("num-files-at-level"));
+    uint64_t level;
+    bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+    if (!ok || level < 0 || level >= config::kNumLevels) {
+      return false;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%d",
+               versions_->NumLevelFiles(static_cast<int>(level)));
+      *value = buf;
+      return true;
+    }
+  } else if (in == "stats") {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             "                               Compactions\n"
+             "Level  Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
+             "--------------------------------------------------\n"
+             );
+    value->append(buf);
+    for (int level = 0; level < config::kNumLevels; level++) {
+      int files = versions_->NumLevelFiles(level);
+      if (stats_[level].micros > 0 || files > 0) {
+        snprintf(
+            buf, sizeof(buf),
+            "%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
+            level,
+            files,
+            versions_->NumLevelBytes(level) / 1048576.0,
+            stats_[level].micros / 1e6,
+            stats_[level].bytes_read / 1048576.0,
+            stats_[level].bytes_written / 1048576.0);
+        value->append(buf);
+      }
+    }
+    return true;
+  }
+
+  return false;
+}
+
+void DBImpl::GetApproximateSizes(
+    const Range* range, int n,
+    uint64_t* sizes) {
+  // TODO(opt): better implementation
+  Version* v;
+  {
+    MutexLock l(&mutex_);
+    versions_->current()->Ref();
+    v = versions_->current();
+  }
+
+  for (int i = 0; i < n; i++) {
+    // Convert user_key into a corresponding internal key.
+    InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+    uint64_t start = versions_->ApproximateOffsetOf(v, k1);
+    uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
+    sizes[i] = (limit >= start ?
limit - start : 0); + } + + { + MutexLock l(&mutex_); + v->Unref(); + } +} + +// Default implementations of convenience methods that subclasses of DB +// can call if they wish +Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { + WriteBatch batch; + batch.Put(key, value); + return Write(opt, &batch); +} + +Status DB::Delete(const WriteOptions& opt, const Slice& key) { + WriteBatch batch; + batch.Delete(key); + return Write(opt, &batch); +} + +DB::~DB() { } + +Status DB::Open(const Options& options, const std::string& dbname, + DB** dbptr) { + *dbptr = NULL; + + DBImpl* impl = new DBImpl(options, dbname); + impl->mutex_.Lock(); + VersionEdit edit; + Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists + if (s.ok()) { + uint64_t new_log_number = impl->versions_->NewFileNumber(); + WritableFile* lfile; + s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), + &lfile); + if (s.ok()) { + edit.SetLogNumber(new_log_number); + impl->logfile_ = lfile; + impl->log_ = new log::Writer(lfile); + s = impl->versions_->LogAndApply(&edit, NULL); + } + if (s.ok()) { + impl->DeleteObsoleteFiles(); + } + } + impl->mutex_.Unlock(); + if (s.ok()) { + *dbptr = impl; + } else { + delete impl; + } + return s; +} + +Status DestroyDB(const std::string& dbname, const Options& options) { + Env* env = options.env; + std::vector filenames; + // Ignore error in case directory does not exist + env->GetChildren(dbname, &filenames); + if (filenames.empty()) { + return Status::OK(); + } + + FileLock* lock; + Status result = env->LockFile(LockFileName(dbname), &lock); + if (result.ok()) { + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + Status del = env->DeleteFile(dbname + "/" + filenames[i]); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + env->UnlockFile(lock); // Ignore error since state is already gone + env->DeleteFile(LockFileName(dbname)); + env->DeleteDir(dbname); // Ignore error in case dir contains other files + } + return result; +} + +} diff --git a/leveldb/db/db_impl.h b/leveldb/db/db_impl.h new file mode 100644 index 0000000..7699d8c --- /dev/null +++ b/leveldb/db/db_impl.h @@ -0,0 +1,184 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
+#define STORAGE_LEVELDB_DB_DB_IMPL_H_
+
+#include <set>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class DBImpl : public DB {
+ public:
+  DBImpl(const Options& options, const std::string& dbname);
+  virtual ~DBImpl();
+
+  // Implementations of the DB interface
+  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+  virtual Status Delete(const WriteOptions&, const Slice& key);
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value);
+  virtual Iterator* NewIterator(const ReadOptions&);
+  virtual const Snapshot* GetSnapshot();
+  virtual void ReleaseSnapshot(const Snapshot* snapshot);
+  virtual bool GetProperty(const Slice& property, std::string* value);
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
+
+  // Extra methods (for testing) that are not in the public DB interface
+
+  // Compact any files in the named level that overlap [begin,end]
+  void TEST_CompactRange(
+      int level,
+      const std::string& begin,
+      const std::string& end);
+
+  // Force current memtable contents to be compacted.
+  Status TEST_CompactMemTable();
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  Iterator* TEST_NewInternalIterator();
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t TEST_MaxNextLevelOverlappingBytes();
+
+ private:
+  friend class DB;
+
+  Iterator* NewInternalIterator(const ReadOptions&,
+                                SequenceNumber* latest_snapshot);
+
+  Status NewDB();
+
+  // Recover the descriptor from persistent storage.  May do a significant
+  // amount of work to recover recently logged updates.  Any changes to
+  // be made to the descriptor are added to *edit.
+  Status Recover(VersionEdit* edit);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+
+  // Called when an iterator over a particular version of the
+  // descriptor goes away.
+  static void Unref(void* arg1, void* arg2);
+
+  // Compact the in-memory write buffer to disk.  Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful.
+  Status CompactMemTable();
+
+  Status RecoverLogFile(uint64_t log_number,
+                        VersionEdit* edit,
+                        SequenceNumber* max_sequence);
+
+  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);
+
+  Status MakeRoomForWrite(bool force /* compact even if there is room? */);
+
+  struct CompactionState;
+
+  void MaybeScheduleCompaction();
+  static void BGWork(void* db);
+  void BackgroundCall();
+  void BackgroundCompaction();
+  void CleanupCompaction(CompactionState* compact);
+  Status DoCompactionWork(CompactionState* compact);
+
+  Status OpenCompactionOutputFile(CompactionState* compact);
+  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+  Status InstallCompactionResults(CompactionState* compact);
+
+  // Constant after construction
+  Env* const env_;
+  const InternalKeyComparator internal_comparator_;
+  const Options options_;  // options_.comparator == &internal_comparator_
+  bool owns_info_log_;
+  bool owns_cache_;
+  const std::string dbname_;
+
+  // table_cache_ provides its own synchronization
+  TableCache* table_cache_;
+
+  // Lock over the persistent DB state.  Non-NULL iff successfully acquired.
+  FileLock* db_lock_;
+
+  // State below is protected by mutex_
+  port::Mutex mutex_;
+  port::AtomicPointer shutting_down_;
+  port::CondVar bg_cv_;          // Signalled when !bg_compaction_scheduled_
+  port::CondVar compacting_cv_;  // Signalled when !compacting_
+  MemTable* mem_;
+  MemTable* imm_;                // Memtable being compacted
+  port::AtomicPointer has_imm_;  // So bg thread can detect non-NULL imm_
+  WritableFile* logfile_;
+  log::Writer* log_;
+  SnapshotList snapshots_;
+
+  // Set of table files to protect from deletion because they are
+  // part of ongoing compactions.
+  std::set<uint64_t> pending_outputs_;
+
+  // Has a background compaction been scheduled or is running?
+  bool bg_compaction_scheduled_;
+
+  // Is there a compaction running?
+  bool compacting_;
+
+  VersionSet* versions_;
+
+  // Have we encountered a background error in paranoid mode?
+  Status bg_error_;
+
+  // Per level compaction stats.  stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    int64_t micros;
+    int64_t bytes_read;
+    int64_t bytes_written;
+
+    CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { }
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->bytes_read += c.bytes_read;
+      this->bytes_written += c.bytes_written;
+    }
+  };
+  CompactionStats stats_[config::kNumLevels];
+
+  // No copying allowed
+  DBImpl(const DBImpl&);
+  void operator=(const DBImpl&);
+
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+};
+
+// Sanitize db options.  The caller should delete result.info_log if
+// it is not equal to src.info_log.
+extern Options SanitizeOptions(const std::string& db,
+                               const InternalKeyComparator* icmp,
+                               const Options& src);
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_DB_IMPL_H_
diff --git a/leveldb/db/db_iter.cc b/leveldb/db/db_iter.cc
new file mode 100644
index 0000000..0be18ff
--- /dev/null
+++ b/leveldb/db/db_iter.cc
@@ -0,0 +1,298 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/db_iter.h" + +#include "db/filename.h" +#include "db/dbformat.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/mutexlock.h" + +namespace leveldb { + +#if 0 +static void DumpInternalIter(Iterator* iter) { + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey k; + if (!ParseInternalKey(iter->key(), &k)) { + fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); + } else { + fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); + } + } +} +#endif + +namespace { + +// Memtables and sstables that make the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter: public Iterator { + public: + // Which direction is the iterator currently moving? + // (1) When moving forward, the internal iterator is positioned at + // the exact entry that yields this->key(), this->value() + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). + enum Direction { + kForward, + kReverse + }; + + DBIter(const std::string* dbname, Env* env, + const Comparator* cmp, Iterator* iter, SequenceNumber s) + : dbname_(dbname), + env_(env), + user_comparator_(cmp), + iter_(iter), + sequence_(s), + direction_(kForward), + valid_(false) { + } + virtual ~DBIter() { + delete iter_; + } + virtual bool Valid() const { return valid_; } + virtual Slice key() const { + assert(valid_); + return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_; + } + virtual Slice value() const { + assert(valid_); + return (direction_ == kForward) ? iter_->value() : saved_value_; + } + virtual Status status() const { + if (status_.ok()) { + return iter_->status(); + } else { + return status_; + } + } + + virtual void Next(); + virtual void Prev(); + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + + private: + void FindNextUserEntry(bool skipping, std::string* skip); + void FindPrevUserEntry(); + bool ParseKey(ParsedInternalKey* key); + + inline void SaveKey(const Slice& k, std::string* dst) { + dst->assign(k.data(), k.size()); + } + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + + const std::string* const dbname_; + Env* const env_; + const Comparator* const user_comparator_; + Iterator* const iter_; + SequenceNumber const sequence_; + + Status status_; + std::string saved_key_; // == current key when direction_==kReverse + std::string saved_value_; // == current raw value when direction_==kReverse + Direction direction_; + bool valid_; + + // No copying allowed + DBIter(const DBIter&); + void operator=(const DBIter&); +}; + +inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { + if (!ParseInternalKey(iter_->key(), ikey)) { + status_ = Status::Corruption("corrupted internal key in DBIter"); + return false; + } else { + return true; + } +} + +void DBIter::Next() { + assert(valid_); + + if (direction_ == kReverse) { // Switch directions? + direction_ = kForward; + // iter_ is pointing just before the entries for this->key(), + // so advance into the range of entries for this->key() and then + // use the normal skipping code below. 
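+  // Example: given internal entries
+  //   ... (j,seq5,VALUE) (k,seq9,VALUE) (k,seq7,VALUE) (m,seq3,VALUE) ...
+  // a reverse iterator yielding "k" has iter_ resting on (j,seq5,VALUE);
+  // the step below moves onto (k,seq9,VALUE), and FindNextUserEntry()
+  // then skips the remaining "k" entries so that "m" is yielded next.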
+ if (!iter_->Valid()) { + iter_->SeekToFirst(); + } else { + iter_->Next(); + } + if (!iter_->Valid()) { + valid_ = false; + saved_key_.clear(); + return; + } + } + + // Temporarily use saved_key_ as storage for key to skip. + std::string* skip = &saved_key_; + SaveKey(ExtractUserKey(iter_->key()), skip); + FindNextUserEntry(true, skip); +} + +void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { + // Loop until we hit an acceptable entry to yield + assert(iter_->Valid()); + assert(direction_ == kForward); + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. + SaveKey(ikey.user_key, skip); + skipping = true; + break; + case kTypeValue: + if (skipping && + user_comparator_->Compare(ikey.user_key, *skip) <= 0) { + // Entry hidden + } else { + valid_ = true; + saved_key_.clear(); + return; + } + break; + } + } + iter_->Next(); + } while (iter_->Valid()); + saved_key_.clear(); + valid_ = false; +} + +void DBIter::Prev() { + assert(valid_); + + if (direction_ == kForward) { // Switch directions? + // iter_ is pointing at the current entry. Scan backwards until + // the key changes so we can use the normal reverse scanning code. + assert(iter_->Valid()); // Otherwise valid_ would have been false + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); + while (true) { + iter_->Prev(); + if (!iter_->Valid()) { + valid_ = false; + saved_key_.clear(); + ClearSavedValue(); + return; + } + if (user_comparator_->Compare(ExtractUserKey(iter_->key()), + saved_key_) < 0) { + break; + } + } + direction_ = kReverse; + } + + FindPrevUserEntry(); +} + +void DBIter::FindPrevUserEntry() { + assert(direction_ == kReverse); + + ValueType value_type = kTypeDeletion; + if (iter_->Valid()) { + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if ((value_type != kTypeDeletion) && + user_comparator_->Compare(ikey.user_key, saved_key_) < 0) { + // We encountered a non-deleted value in entries for previous keys, + break; + } + value_type = ikey.type; + if (value_type == kTypeDeletion) { + ClearSavedValue(); + } else { + Slice raw_value = iter_->value(); + if (saved_value_.capacity() > raw_value.size() + 1048576) { + std::string empty; + swap(empty, saved_value_); + } + saved_value_.assign(raw_value.data(), raw_value.size()); + } + } + iter_->Prev(); + } while (iter_->Valid()); + } + + if (value_type == kTypeDeletion) { + // End + valid_ = false; + saved_key_.clear(); + ClearSavedValue(); + direction_ = kForward; + } else { + valid_ = true; + } +} + +void DBIter::Seek(const Slice& target) { + direction_ = kForward; + ClearSavedValue(); + saved_key_.clear(); + AppendInternalKey( + &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); + iter_->Seek(saved_key_); + if (iter_->Valid()) { + FindNextUserEntry(false, &saved_key_ /* temporary storage */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToFirst() { + direction_ = kForward; + ClearSavedValue(); + iter_->SeekToFirst(); + if (iter_->Valid()) { + FindNextUserEntry(false, &saved_key_ /* temporary storage */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToLast() { + direction_ = kReverse; + ClearSavedValue(); + iter_->SeekToLast(); + FindPrevUserEntry(); +} + +} // anonymous namespace + +Iterator* NewDBIterator( + const std::string* dbname, + Env* env, + 
const Comparator* user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence) {
+  return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
+}
+
+}
diff --git a/leveldb/db/db_iter.h b/leveldb/db/db_iter.h
new file mode 100644
index 0000000..195f3d3
--- /dev/null
+++ b/leveldb/db/db_iter.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_
+#define STORAGE_LEVELDB_DB_DB_ITER_H_
+
+#include <stdint.h>
+#include "leveldb/db.h"
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified "sequence" number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Comparator* user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence);
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_DB_ITER_H_
diff --git a/leveldb/db/db_test.cc b/leveldb/db/db_test.cc
new file mode 100644
index 0000000..f828e3d
--- /dev/null
+++ b/leveldb/db/db_test.cc
@@ -0,0 +1,1030 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+class DBTest {
+ public:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+  Options last_options_;
+
+  DBTest() : env_(Env::Default()) {
+    dbname_ = test::TmpDir() + "/db_test";
+    DestroyDB(dbname_, Options());
+    db_ = NULL;
+    Reopen();
+  }
+
+  ~DBTest() {
+    delete db_;
+    DestroyDB(dbname_, Options());
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void Reopen(Options* options = NULL) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void DestroyAndReopen(Options* options = NULL) {
+    delete db_;
+    db_ = NULL;
+    DestroyDB(dbname_, Options());
+    ASSERT_OK(TryReopen(options));
+  }
+
+  Status TryReopen(Options* options) {
+    delete db_;
+    db_ = NULL;
+    Options opts;
+    if (options != NULL) {
+      opts = *options;
+    } else {
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const std::string& k, const std::string& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  std::string AllEntriesFor(const Slice& user_key) {
+    Iterator* iter = dbfull()->TEST_NewInternalIterator();
+    InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+    iter->Seek(target.Encode());
+    std::string result;
+    if
(!iter->status().ok()) { + result = iter->status().ToString(); + } else { + result = "[ "; + bool first = true; + while (iter->Valid()) { + ParsedInternalKey ikey; + if (!ParseInternalKey(iter->key(), &ikey)) { + result += "CORRUPTED"; + } else { + if (last_options_.comparator->Compare( + ikey.user_key, user_key) != 0) { + break; + } + if (!first) { + result += ", "; + } + first = false; + switch (ikey.type) { + case kTypeValue: + result += iter->value().ToString(); + break; + case kTypeDeletion: + result += "DEL"; + break; + } + } + iter->Next(); + } + if (!first) { + result += " "; + } + result += "]"; + } + delete iter; + return result; + } + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + uint64_t Size(const Slice& start, const Slice& limit) { + Range r(start, limit); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + void Compact(const Slice& start, const Slice& limit) { + dbfull()->TEST_CompactMemTable(); + int max_level_with_files = 1; + for (int level = 1; level < config::kNumLevels; level++) { + if (NumTableFilesAtLevel(level) > 0) { + max_level_with_files = level; + } + } + for (int level = 0; level < max_level_with_files; level++) { + dbfull()->TEST_CompactRange(level, "", "~"); + } + } + + void DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %lld\n", + static_cast( + dbfull()->TEST_MaxNextLevelOverlappingBytes())); + for (int level = 0; level < config::kNumLevels; level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST(DBTest, Empty) { + ASSERT_TRUE(db_ != NULL); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST(DBTest, ReadWrite) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); +} + +TEST(DBTest, PutDeleteGet) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + ASSERT_OK(db_->Delete(WriteOptions(), "foo")); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST(DBTest, IterEmpty) { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterSingle) { + ASSERT_OK(Put("a", "va")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + 
ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterMulti) { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", "vb")); + ASSERT_OK(Put("c", "vc")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + iter->Next(); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Make sure iter stays at snapshot + ASSERT_OK(Put("a", "va2")); + ASSERT_OK(Put("a2", "va3")); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Put("c", "vc2")); + ASSERT_OK(Delete("b")); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterSmallAndLargeMix) { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", std::string(100000, 'b'))); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Put("d", std::string(100000, 'd'))); + ASSERT_OK(Put("e", std::string(100000, 'e'))); + + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + 
iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, Recover) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("baz", "v5")); + + Reopen(); + ASSERT_EQ("v1", Get("foo")); + + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v5", Get("baz")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + + Reopen(); + ASSERT_EQ("v3", Get("foo")); + ASSERT_OK(Put("foo", "v4")); + ASSERT_EQ("v4", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ("v5", Get("baz")); +} + +TEST(DBTest, RecoveryWithEmptyLog) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("foo", "v2")); + Reopen(); + Reopen(); + ASSERT_OK(Put("foo", "v3")); + Reopen(); + ASSERT_EQ("v3", Get("foo")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key%06d", i); + return std::string(buf); +} + +TEST(DBTest, MinorCompactionsHappen) { + Options options; + options.write_buffer_size = 10000; + Reopen(&options); + + const int N = 500; + + int starting_num_tables = NumTableFilesAtLevel(0); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); + } + int ending_num_tables = NumTableFilesAtLevel(0); + ASSERT_GT(ending_num_tables, starting_num_tables); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } + + Reopen(); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } +} + +TEST(DBTest, RecoverWithLargeLog) { + { + Options options; + Reopen(&options); + ASSERT_OK(Put("big1", std::string(200000, '1'))); + ASSERT_OK(Put("big2", std::string(200000, '2'))); + ASSERT_OK(Put("small3", std::string(10, '3'))); + ASSERT_OK(Put("small4", std::string(10, '4'))); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + } + + // Make sure that if we re-open with a small write buffer size that + // we flush table files in the middle of a large log file. + Options options; + options.write_buffer_size = 100000; + Reopen(&options); + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + ASSERT_EQ(std::string(200000, '1'), Get("big1")); + ASSERT_EQ(std::string(200000, '2'), Get("big2")); + ASSERT_EQ(std::string(10, '3'), Get("small3")); + ASSERT_EQ(std::string(10, '4'), Get("small4")); + ASSERT_GT(NumTableFilesAtLevel(0), 1); +} + +TEST(DBTest, CompactionsGenerateMultipleFiles) { + Options options; + options.write_buffer_size = 100000000; // Large write buffer + Reopen(&options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(Key(i), values[i])); + } + + // Reopening moves updates to level-0 + Reopen(&options); + dbfull()->TEST_CompactRange(0, "", Key(100000)); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 1); + for (int i = 0; i < 80; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST(DBTest, SparseMerge) { + Options options; + options.compression = kNoCompression; + Reopen(&options); + + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. 
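+  // (Sizing for the loop below: 100,000 "B" keys with 1000-byte values is
+  // roughly 100MB in total, dwarfing the lone "A" and "C" entries.)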
+ const std::string value(1000, 'x'); + Put("A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + Put("C", "vc"); + Compact("", "z"); + + // Make sparse update + Put("A", "va2"); + Put("B100", "bvalue2"); + Put("C", "vc2"); + dbfull()->TEST_CompactMemTable(); + + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); +} + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +TEST(DBTest, ApproximateSizes) { + Options options; + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + DestroyAndReopen(); + + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + Reopen(&options); + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + const int N = 80; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); + } + + // 0 because GetApproximateSizes() does not account for memtable space + ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); + ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), + 100000 * (i+1), 100000 * (i+1) + 10000)); + ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), + 100000 * 10, 100000 * 10 + 10000)); + } + ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); + ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); + + dbfull()->TEST_CompactRange(0, + Key(compact_start), + Key(compact_start + 9)); + } + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + } +} + +TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { + Options options; + options.compression = kNoCompression; + Reopen(); + + Random rnd(301); + std::string big1 = RandomString(&rnd, 100000); + ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(2), big1)); + ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(4), big1)); + ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); + ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); + ASSERT_TRUE(Between(Size("", 
Key(5)), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); + ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000)); + + ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); + + dbfull()->TEST_CompactRange(0, Key(0), Key(100)); + } +} + +TEST(DBTest, IteratorPinsRef) { + Put("foo", "hello"); + + // Get iterator that will yield the current contents of the DB. + Iterator* iter = db_->NewIterator(ReadOptions()); + + // Write to force compactions + Put("foo", "newvalue1"); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values + } + Put("foo", "newvalue2"); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("hello", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + delete iter; +} + +TEST(DBTest, Snapshot) { + Put("foo", "v1"); + const Snapshot* s1 = db_->GetSnapshot(); + Put("foo", "v2"); + const Snapshot* s2 = db_->GetSnapshot(); + Put("foo", "v3"); + const Snapshot* s3 = db_->GetSnapshot(); + + Put("foo", "v4"); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v3", Get("foo", s3)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s3); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s1); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s2); + ASSERT_EQ("v4", Get("foo")); +} + +TEST(DBTest, HiddenValuesAreRemoved) { + Random rnd(301); + std::string big = RandomString(&rnd, 50000); + Put("foo", big); + Put("pastfoo", "v"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put("foo", "tiny"); + Put("pastfoo2", "v2"); // Advance sequence number one more + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_GT(NumTableFilesAtLevel(0), 0); + + ASSERT_EQ(big, Get("foo", snapshot)); + ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); + dbfull()->TEST_CompactRange(0, "", "x"); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GE(NumTableFilesAtLevel(1), 1); + dbfull()->TEST_CompactRange(1, "", "x"); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + + ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); +} + +TEST(DBTest, DeletionMarkers1) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + dbfull()->TEST_CompactRange(0, "", "z"); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file + Delete("foo"); + Put("foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + dbfull()->TEST_CompactRange(0, "", "z"); + // DEL eliminated, but v1 remains because we aren't compacting that level + // (DEL can be eliminated because v2 hides v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + dbfull()->TEST_CompactRange(1, "", "z"); + // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. + // (as is v1). 
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
+}
+
+TEST(DBTest, DeletionMarkers2) {
+  Put("foo", "v1");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  dbfull()->TEST_CompactRange(0, "", "z");
+  dbfull()->TEST_CompactRange(1, "", "z");
+  ASSERT_EQ(NumTableFilesAtLevel(2), 1);   // foo => v1 is now in level 2 file
+  Delete("foo");
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(0, "", "z");
+  // DEL kept: L2 file overlaps
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(1, "", "z");
+  // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
+  // (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+}
+
+TEST(DBTest, ComparatorCheck) {
+  class NewComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "leveldb.NewComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  NewComparator cmp;
+  Options new_options;
+  new_options.comparator = &cmp;
+  Status s = TryReopen(&new_options);
+  ASSERT_TRUE(!s.ok());
+  ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+      << s.ToString();
+}
+
+TEST(DBTest, DBOpen_Options) {
+  std::string dbname = test::TmpDir() + "/db_options_test";
+  DestroyDB(dbname, Options());
+
+  // Does not exist, and create_if_missing == false: error
+  DB* db = NULL;
+  Options opts;
+  opts.create_if_missing = false;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL);
+  ASSERT_TRUE(db == NULL);
+
+  // Does not exist, and create_if_missing == true: OK
+  opts.create_if_missing = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != NULL);
+
+  delete db;
+  db = NULL;
+
+  // Does exist, and error_if_exists == true: error
+  opts.create_if_missing = false;
+  opts.error_if_exists = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL);
+  ASSERT_TRUE(db == NULL);
+
+  // Does exist, and error_if_exists == false: OK
+  opts.create_if_missing = true;
+  opts.error_if_exists = false;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != NULL);
+
+  delete db;
+  db = NULL;
+}
+
+class ModelDB: public DB {
+ public:
+  explicit ModelDB(const Options& options): options_(options) { }
+  ~ModelDB() { }
+  virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
+    return DB::Put(o, k, v);
+  }
+  virtual Status Delete(const WriteOptions& o, const Slice& key) {
+    return DB::Delete(o, key);
+  }
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key, std::string* value) {
+    assert(false);      // Not implemented
+    return Status::NotFound(key);
+  }
+  virtual Iterator* NewIterator(const ReadOptions& options) {
+    if (options.snapshot == NULL) {
+      KVMap* saved = new KVMap;
+      *saved = map_;
+      return new ModelIter(saved, true);
+    } else {
+      const KVMap* snapshot_state =
+          reinterpret_cast<const KVMap*>(options.snapshot->number_);
+      return new ModelIter(snapshot_state, false);
+    }
+  }
+  virtual const Snapshot* GetSnapshot() {
+    KVMap* saved = new KVMap;
+    *saved = map_;
+    return snapshots_.New(
reinterpret_cast(saved)); + } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) { + const KVMap* saved = reinterpret_cast(snapshot->number_); + delete saved; + snapshots_.Delete(snapshot); + } + virtual Status Write(const WriteOptions& options, WriteBatch* batch) { + assert(options.post_write_snapshot == NULL); // Not supported + for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) { + switch (it.op()) { + case kTypeValue: + map_[it.key().ToString()] = it.value().ToString(); + break; + case kTypeDeletion: + map_.erase(it.key().ToString()); + break; + } + } + return Status::OK(); + } + + virtual bool GetProperty(const Slice& property, std::string* value) { + return false; + } + virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { + for (int i = 0; i < n; i++) { + sizes[i] = 0; + } + } + private: + typedef std::map KVMap; + class ModelIter: public Iterator { + public: + ModelIter(const KVMap* map, bool owned) + : map_(map), owned_(owned), iter_(map_->end()) { + } + ~ModelIter() { + if (owned_) delete map_; + } + virtual bool Valid() const { return iter_ != map_->end(); } + virtual void SeekToFirst() { iter_ = map_->begin(); } + virtual void SeekToLast() { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) { + iter_ = map_->lower_bound(k.ToString()); + } + virtual void Next() { ++iter_; } + virtual void Prev() { --iter_; } + virtual Slice key() const { return iter_->first; } + virtual Slice value() const { return iter_->second; } + virtual Status status() const { return Status::OK(); } + private: + const KVMap* const map_; + const bool owned_; // Do we own map_ + KVMap::const_iterator iter_; + }; + const Options options_; + KVMap map_; + SnapshotList snapshots_; +}; + +static std::string RandomKey(Random* rnd) { + int len = (rnd->OneIn(3) + ? 1 // Short sometimes to encourage collisions + : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); + return test::RandomKey(rnd, len); +} + +static bool CompareIterators(int step, + DB* model, + DB* db, + const Snapshot* model_snap, + const Snapshot* db_snap) { + ReadOptions options; + options.snapshot = model_snap; + Iterator* miter = model->NewIterator(options); + options.snapshot = db_snap; + Iterator* dbiter = db->NewIterator(options); + bool ok = true; + int count = 0; + for (miter->SeekToFirst(), dbiter->SeekToFirst(); + ok && miter->Valid() && dbiter->Valid(); + miter->Next(), dbiter->Next()) { + count++; + if (miter->key().compare(dbiter->key()) != 0) { + fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(dbiter->key()).c_str()); + ok = false; + break; + } + + if (miter->value().compare(dbiter->value()) != 0) { + fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(miter->value()).c_str(), + EscapeString(miter->value()).c_str()); + ok = false; + } + } + + if (ok) { + if (miter->Valid() != dbiter->Valid()) { + fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. 
%d\n", + step, miter->Valid(), dbiter->Valid()); + ok = false; + } + } + fprintf(stderr, "%d entries compared: ok=%d\n", count, ok); + delete miter; + delete dbiter; + return ok; +} + +TEST(DBTest, Randomized) { + Random rnd(test::RandomSeed()); + ModelDB model(last_options_); + const int N = 10000; + const Snapshot* model_snap = NULL; + const Snapshot* db_snap = NULL; + std::string k, v; + for (int step = 0; step < N; step++) { + if (step % 100 == 0) { + fprintf(stderr, "Step %d of %d\n", step, N); + } + int p = rnd.Uniform(100); + if (p < 45) { // Put + k = RandomKey(&rnd); + v = RandomString(&rnd, + rnd.OneIn(20) + ? 100 + rnd.Uniform(100) + : rnd.Uniform(8)); + ASSERT_OK(model.Put(WriteOptions(), k, v)); + ASSERT_OK(db_->Put(WriteOptions(), k, v)); + + } else if (p < 90) { // Delete + k = RandomKey(&rnd); + ASSERT_OK(model.Delete(WriteOptions(), k)); + ASSERT_OK(db_->Delete(WriteOptions(), k)); + + + } else { // Multi-element batch + WriteBatch b; + const int num = rnd.Uniform(8); + for (int i = 0; i < num; i++) { + if (i == 0 || !rnd.OneIn(10)) { + k = RandomKey(&rnd); + } else { + // Periodically re-use the same key from the previous iter, so + // we have multiple entries in the write batch for the same key + } + if (rnd.OneIn(2)) { + v = RandomString(&rnd, rnd.Uniform(10)); + b.Put(k, v); + } else { + b.Delete(k); + } + } + ASSERT_OK(model.Write(WriteOptions(), &b)); + ASSERT_OK(db_->Write(WriteOptions(), &b)); + } + + if ((step % 100) == 0) { + ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + // Save a snapshot from each DB this time that we'll use next + // time we compare things, to make sure the current state is + // preserved with the snapshot + if (model_snap != NULL) model.ReleaseSnapshot(model_snap); + if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); + + Reopen(); + ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); + + model_snap = model.GetSnapshot(); + db_snap = db_->GetSnapshot(); + } + } + if (model_snap != NULL) model.ReleaseSnapshot(model_snap); + if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/db/dbformat.cc b/leveldb/db/dbformat.cc new file mode 100644 index 0000000..c12c138 --- /dev/null +++ b/leveldb/db/dbformat.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
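Before the comparator code itself, a worked example of the ordering these routines implement (an illustrative sketch, not part of the checkin; InternalKeyComparator, InternalKey, kTypeValue, and BytewiseComparator are the names defined by this module):

  #include <assert.h>
  #include "db/dbformat.h"

  void OrderingExample() {
    leveldb::InternalKeyComparator icmp(leveldb::BytewiseComparator());
    leveldb::InternalKey newer("foo", 3, leveldb::kTypeValue);
    leveldb::InternalKey older("foo", 2, leveldb::kTypeValue);
    // Equal user keys: the entry with the larger sequence number sorts first,
    // so a seek at kMaxSequenceNumber lands on the newest entry for a key.
    assert(icmp.Compare(newer, older) < 0);
    // Different user keys: user-key order dominates the sequence number.
    leveldb::InternalKey other("fop", 9, leveldb::kTypeValue);
    assert(icmp.Compare(older, other) < 0);
  }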
+ +#include <stdio.h> +#include "db/dbformat.h" +#include "port/port.h" +#include "util/coding.h" + +namespace leveldb { + +static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(t <= kValueTypeForSeek); + return (seq << 8) | t; +} + +void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { + result->append(key.user_key.data(), key.user_key.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +std::string ParsedInternalKey::DebugString() const { + char buf[50]; + snprintf(buf, sizeof(buf), "' @ %llu : %d", + (unsigned long long) sequence, + int(type)); + std::string result = "'"; + result += user_key.ToString(); + result += buf; + return result; +} + +const char* InternalKeyComparator::Name() const { + return "leveldb.InternalKeyComparator"; +} + +int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + if (r == 0) { + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +void InternalKeyComparator::FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Attempt to shorten the user portion of the key + Slice user_start = ExtractUserKey(*start); + Slice user_limit = ExtractUserKey(limit); + std::string tmp(user_start.data(), user_start.size()); + user_comparator_->FindShortestSeparator(&tmp, user_limit); + if (user_comparator_->Compare(*start, tmp) < 0) { + // User key has become larger. Tack on the earliest possible + // number to the shortened user key. + PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek)); + assert(this->Compare(*start, tmp) < 0); + assert(this->Compare(tmp, limit) < 0); + start->swap(tmp); + } +} + +void InternalKeyComparator::FindShortSuccessor(std::string* key) const { + Slice user_key = ExtractUserKey(*key); + std::string tmp(user_key.data(), user_key.size()); + user_comparator_->FindShortSuccessor(&tmp); + if (user_comparator_->Compare(user_key, tmp) < 0) { + // User key has become larger. Tack on the earliest possible + // number to the shortened user key. + PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek)); + assert(this->Compare(*key, tmp) < 0); + key->swap(tmp); + } +} + +} diff --git a/leveldb/db/dbformat.h b/leveldb/db/dbformat.h new file mode 100644 index 0000000..d583665 --- /dev/null +++ b/leveldb/db/dbformat.h @@ -0,0 +1,155 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ +#define STORAGE_LEVELDB_DB_FORMAT_H_ + +#include <stdio.h> +#include "leveldb/comparator.h" +#include "leveldb/db.h" +#include "leveldb/slice.h" +#include "leveldb/table_builder.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +// Grouping of constants. We may want to make some of these +// parameters set via options. +namespace config { +static const int kNumLevels = 7; +} + +class InternalKey; + +// Value types encoded as the last component of internal keys. +// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk +// data structures. +enum ValueType { + kTypeDeletion = 0x0, + kTypeValue = 0x1, +}; +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +static const ValueType kValueTypeForSeek = kTypeValue; + +typedef uint64_t SequenceNumber; + +// We leave eight bits empty at the bottom so a type and sequence# +// can be packed together into 64-bits. +static const SequenceNumber kMaxSequenceNumber = + ((0x1ull << 56) - 1); + +struct ParsedInternalKey { + Slice user_key; + SequenceNumber sequence; + ValueType type; + + ParsedInternalKey() { } // Intentionally left uninitialized (for speed) + ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) + : user_key(u), sequence(seq), type(t) { } + std::string DebugString() const; +}; + +// Return the length of the encoding of "key". +inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { + return key.user_key.size() + 8; +} + +// Append the serialization of "key" to *result. +extern void AppendInternalKey(std::string* result, + const ParsedInternalKey& key); + +// Attempt to parse an internal key from "internal_key". On success, +// stores the parsed data in "*result", and returns true. +// +// On error, returns false, leaves "*result" in an undefined state. +extern bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result); + +// Returns the user key portion of an internal key. +inline Slice ExtractUserKey(const Slice& internal_key) { + assert(internal_key.size() >= 8); + return Slice(internal_key.data(), internal_key.size() - 8); +} + +inline ValueType ExtractValueType(const Slice& internal_key) { + assert(internal_key.size() >= 8); + const size_t n = internal_key.size(); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + return static_cast<ValueType>(c); +} + +// A comparator for internal keys that uses a specified comparator for +// the user key portion and breaks ties by decreasing sequence number. +class InternalKeyComparator : public Comparator { + private: + const Comparator* user_comparator_; + public: + explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { } + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const InternalKey& a, const InternalKey& b) const; +}; + +// Modules in this directory should keep internal keys wrapped inside +// the following class instead of plain strings so that we do not +// incorrectly use string comparisons instead of an InternalKeyComparator. +class InternalKey { + private: + std::string rep_; + public: + InternalKey() { } // Leave rep_ as empty to indicate it is invalid + InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + } + + void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } + Slice Encode() const { + assert(!rep_.empty()); + return rep_; + } + + Slice user_key() const { return ExtractUserKey(rep_); } + + void SetFrom(const ParsedInternalKey& p) { + rep_.clear(); + AppendInternalKey(&rep_, p); + } + + void Clear() { rep_.clear(); } +}; + +inline int InternalKeyComparator::Compare( + const InternalKey& a, const InternalKey& b) const { + return Compare(a.Encode(), b.Encode()); +} + +inline bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result) { + const size_t n = internal_key.size(); + if (n < 8) return false; + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + result->sequence = num >> 8; + result->type = static_cast<ValueType>(c); + result->user_key = Slice(internal_key.data(), n - 8); + return (c <= static_cast<unsigned char>(kTypeValue)); +} + +} + +#endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/leveldb/db/dbformat_test.cc b/leveldb/db/dbformat_test.cc new file mode 100644 index 0000000..57c5578 --- /dev/null +++ b/leveldb/db/dbformat_test.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/dbformat.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +static std::string IKey(const std::string& user_key, + uint64_t seq, + ValueType vt) { + std::string encoded; + AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); + return encoded; +} + +static std::string Shorten(const std::string& s, const std::string& l) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); + return result; +} + +static std::string ShortSuccessor(const std::string& s) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); + return result; +} + +static void TestKey(const std::string& key, + uint64_t seq, + ValueType vt) { + std::string encoded = IKey(key, seq, vt); + + Slice in(encoded); + ParsedInternalKey decoded("", 0, kTypeValue); + + ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_EQ(key, decoded.user_key.ToString()); + ASSERT_EQ(seq, decoded.sequence); + ASSERT_EQ(vt, decoded.type); + + ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); +} + +class FormatTest { }; + +TEST(FormatTest, InternalKey_EncodeDecode) { + const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; + const uint64_t seq[] = { + 1, 2, 3, + (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, + (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, + (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 + }; + for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { + for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { + TestKey(keys[k], seq[s], kTypeValue); + TestKey("hello", 1, kTypeDeletion); + } + } +} + +TEST(FormatTest, InternalKeyShortSeparator) { + // When user keys are same + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 99, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 101, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeDeletion))); + + // When user keys are misordered + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("bar", 99, kTypeValue))); + + // When user keys are different, but correctly ordered + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("foo", 100, kTypeValue), + IKey("hello", 200, kTypeValue))); + + // When start user key is prefix of limit user key + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foobar", 200, kTypeValue))); + + // When limit user key is prefix of start user key + ASSERT_EQ(IKey("foobar", 100, kTypeValue), + Shorten(IKey("foobar", 100, kTypeValue), + IKey("foo", 200, kTypeValue))); +} + +TEST(FormatTest, InternalKeyShortestSuccessor) { + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + ShortSuccessor(IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), + ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/db/filename.cc b/leveldb/db/filename.cc new file mode 100644 index 0000000..b3a917c --- /dev/null +++ b/leveldb/db/filename.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <ctype.h> +#include <stdio.h> +#include "db/filename.h" +#include "db/dbformat.h" +#include "leveldb/env.h" +#include "util/logging.h" + +namespace leveldb { + +static std::string MakeFileName(const std::string& name, uint64_t number, + const char* suffix) { + char buf[100]; + snprintf(buf, sizeof(buf), "/%06llu.%s", + static_cast<unsigned long long>(number), + suffix); + return name + buf; +} + +std::string LogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "log"); +} + +std::string TableFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "sst"); +} + +std::string DescriptorFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + char buf[100]; + snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", + static_cast<unsigned long long>(number)); + return dbname + buf; +} + +std::string CurrentFileName(const std::string& dbname) { + return dbname + "/CURRENT"; +} + +std::string LockFileName(const std::string& dbname) { + return dbname + "/LOCK"; +} + +std::string TempFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + return MakeFileName(dbname, number, "dbtmp"); +} + +std::string InfoLogFileName(const std::string& dbname) { + return dbname + "/LOG"; +} + +// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname) { + return dbname + "/LOG.old"; +} + + +// Owned filenames have the form: +// dbname/CURRENT +// dbname/LOCK +// dbname/LOG +// dbname/LOG.old +// dbname/MANIFEST-[0-9]+ +// dbname/[0-9]+.(log|sst) +bool ParseFileName(const std::string& fname, + uint64_t* number, + FileType* type) { + Slice rest(fname); + if (rest == "CURRENT") { + *number = 0; + *type = kCurrentFile; + } else if (rest == "LOCK") { + *number = 0; + *type = kDBLockFile; + } else if (rest == "LOG" || rest == "LOG.old") { + *number = 0; + *type = kInfoLogFile; + } else if (rest.starts_with("MANIFEST-")) { + rest.remove_prefix(strlen("MANIFEST-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kDescriptorFile; + *number = num; + } else { + // Avoid strtoull() to keep filename format independent of the + // current locale + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + Slice suffix = rest; + if (suffix == Slice(".log")) { + *type = kLogFile; + } else if (suffix == Slice(".sst")) { + *type = kTableFile; + } else if (suffix == Slice(".dbtmp")) { + *type = kTempFile; + } else { + return false; + } + *number = num; + } + return true; +} + +Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number) { + // Remove leading "dbname/" and add newline to manifest file name + std::string manifest = DescriptorFileName(dbname, descriptor_number); + Slice contents = manifest; + assert(contents.starts_with(dbname + "/")); + contents.remove_prefix(dbname.size() + 1); + std::string tmp = TempFileName(dbname, descriptor_number); + Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp); + if (s.ok()) { + s = env->RenameFile(tmp, CurrentFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +} diff --git a/leveldb/db/filename.h b/leveldb/db/filename.h new file mode 100644 index 0000000..6a99744 --- /dev/null +++ b/leveldb/db/filename.h @@ -0,0 +1,80 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// File names used by DB code + +#ifndef STORAGE_LEVELDB_DB_FILENAME_H_ +#define STORAGE_LEVELDB_DB_FILENAME_H_ + +#include <stdint.h> +#include <string> +#include "leveldb/slice.h" +#include "leveldb/status.h" +#include "port/port.h" + +namespace leveldb { + +class Env; + +enum FileType { + kLogFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one +}; + +// Return the name of the log file with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string LogFileName(const std::string& dbname, uint64_t number); + +// Return the name of the sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string TableFileName(const std::string& dbname, uint64_t number); + +// Return the name of the descriptor file for the db named by +// "dbname" and the specified incarnation number. The result will be +// prefixed with "dbname". +extern std::string DescriptorFileName(const std::string& dbname, + uint64_t number); + +// Return the name of the current file. This file contains the name +// of the current manifest file. The result will be prefixed with +// "dbname". +extern std::string CurrentFileName(const std::string& dbname); + +// Return the name of the lock file for the db named by +// "dbname". The result will be prefixed with "dbname". +extern std::string LockFileName(const std::string& dbname); + +// Return the name of a temporary file owned by the db named "dbname". +// The result will be prefixed with "dbname". +extern std::string TempFileName(const std::string& dbname, uint64_t number); + +// Return the name of the info log file for "dbname". +extern std::string InfoLogFileName(const std::string& dbname); + +// Return the name of the old info log file for "dbname". +extern std::string OldInfoLogFileName(const std::string& dbname); + +// If filename is a leveldb file, store the type of the file in *type. +// The number encoded in the filename is stored in *number. If the +// filename was successfully parsed, returns true. Otherwise returns false. +extern bool ParseFileName(const std::string& filename, + uint64_t* number, + FileType* type); + +// Make the CURRENT file point to the descriptor file with the +// specified number. +extern Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number); + + +} + +#endif // STORAGE_LEVELDB_DB_FILENAME_H_ diff --git a/leveldb/db/filename_test.cc b/leveldb/db/filename_test.cc new file mode 100644 index 0000000..2f61e8d --- /dev/null +++ b/leveldb/db/filename_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/filename.h" + +#include "db/dbformat.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +class FileNameTest { }; + +TEST(FileNameTest, Parse) { + Slice db; + FileType type; + uint64_t number; + + // Successful parses + static struct { + const char* fname; + uint64_t number; + FileType type; + } cases[] = { + { "100.log", 100, kLogFile }, + { "0.log", 0, kLogFile }, + { "0.sst", 0, kTableFile }, + { "CURRENT", 0, kCurrentFile }, + { "LOCK", 0, kDBLockFile }, + { "MANIFEST-2", 2, kDescriptorFile }, + { "MANIFEST-7", 7, kDescriptorFile }, + { "LOG", 0, kInfoLogFile }, + { "LOG.old", 0, kInfoLogFile }, + { "18446744073709551615.log", 18446744073709551615ull, kLogFile }, + }; + for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { + std::string f = cases[i].fname; + ASSERT_TRUE(ParseFileName(f, &number, &type)) << f; + ASSERT_EQ(cases[i].type, type) << f; + ASSERT_EQ(cases[i].number, number) << f; + } + + // Errors + static const char* errors[] = { + "", + "foo", + "foo-dx-100.log", + ".log", + "", + "manifest", + "CURREN", + "CURRENTX", + "MANIFES", + "MANIFEST", + "MANIFEST-", + "XMANIFEST-3", + "MANIFEST-3x", + "LOC", + "LOCKx", + "LO", + "LOGx", + "18446744073709551616.log", + "184467440737095516150.log", + "100", + "100.", + "100.lop" + }; + for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { + std::string f = errors[i]; + ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; + } +} + +TEST(FileNameTest, Construction) { + uint64_t number; + FileType type; + std::string fname; + + fname = CurrentFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0, number); + ASSERT_EQ(kCurrentFile, type); + + fname = LockFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0, number); + ASSERT_EQ(kDBLockFile, type); + + fname = LogFileName("foo", 192); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(192, number); + ASSERT_EQ(kLogFile, type); + + fname = TableFileName("bar", 200); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(200, number); + ASSERT_EQ(kTableFile, type); + + fname = DescriptorFileName("bar", 100); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100, number); + ASSERT_EQ(kDescriptorFile, type); + + fname = TempFileName("tmp", 999); + ASSERT_EQ("tmp/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(999, number); + ASSERT_EQ(kTempFile, type); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/db/log_format.h b/leveldb/db/log_format.h new file mode 100644 index 0000000..137cd4a --- /dev/null +++ b/leveldb/db/log_format.h @@ -0,0 +1,35 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Log format information shared by reader and writer. +// See ../doc/log_format.txt for more detail. + +#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ +#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ + +namespace leveldb { +namespace log { + +enum RecordType { + // Zero is reserved for preallocated files + kZeroType = 0, + + kFullType = 1, + + // For fragments + kFirstType = 2, + kMiddleType = 3, + kLastType = 4, +}; +static const int kMaxRecordType = kLastType; + +static const int kBlockSize = 32768; + +// Header is checksum (4 bytes), length (2 bytes), type (1 byte). +static const int kHeaderSize = 4 + 2 + 1; + +} +} + +#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_ diff --git a/leveldb/db/log_reader.cc b/leveldb/db/log_reader.cc new file mode 100644 index 0000000..75e1d28 --- /dev/null +++ b/leveldb/db/log_reader.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_reader.h" + +#include <stdio.h> +#include "leveldb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace leveldb { +namespace log { + +Reader::Reporter::~Reporter() { +} + +Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum) + : file_(file), + reporter_(reporter), + checksum_(checksum), + backing_store_(new char[kBlockSize]), + buffer_(), + eof_(false) { +} + +Reader::~Reader() { + delete[] backing_store_; +} + +bool Reader::ReadRecord(Slice* record, std::string* scratch) { + scratch->clear(); + record->clear(); + bool in_fragmented_record = false; + + Slice fragment; + while (true) { + switch (ReadPhysicalRecord(&fragment)) { + case kFullType: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "partial record without end"); + } + scratch->clear(); + *record = fragment; + return true; + + case kFirstType: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "partial record without end"); + } + scratch->assign(fragment.data(), fragment.size()); + in_fragmented_record = true; + break; + + case kMiddleType: + if (!in_fragmented_record) { + ReportDrop(fragment.size(), "missing start of fragmented record"); + } else { + scratch->append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + if (!in_fragmented_record) { + ReportDrop(fragment.size(), "missing start of fragmented record"); + } else { + scratch->append(fragment.data(), fragment.size()); + *record = Slice(*scratch); + return true; + } + break; + + case kEof: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "partial record without end"); + scratch->clear(); + } + return false; + + case kBadRecord: + if (in_fragmented_record) { + ReportDrop(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + + default: + ReportDrop( + (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), + "unknown record type"); + in_fragmented_record = false; + scratch->clear(); + break; + } + } + return false; +} + +void Reader::ReportDrop(size_t bytes, const char* reason) { + if (reporter_ != NULL) { + reporter_->Corruption(bytes, Status::Corruption(reason)); + } +} + +unsigned int Reader::ReadPhysicalRecord(Slice* result) { + while (true) { + if (buffer_.size() < kHeaderSize) { + if (!eof_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + if (!status.ok()) { + if (reporter_ != NULL) { + reporter_->Corruption(kBlockSize, status); + } + buffer_.clear(); + eof_ = true; + return kEof; + } else if (buffer_.size() < kBlockSize) { + eof_ = true; + } + continue; + } else if (buffer_.size() == 0) { + // End of file + return kEof; + } else { + ReportDrop(buffer_.size(), "truncated record at end of file"); + buffer_.clear(); + return kEof; + } + } + + // Parse the header + const char* header = buffer_.data(); + const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff; + const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff; + const unsigned int type = header[6]; + const uint32_t length = a | (b << 8); + if (kHeaderSize + length > buffer_.size()) { + ReportDrop(buffer_.size(), "bad record length"); + buffer_.clear(); + return kBadRecord; + } + + // Check crc + if (checksum_) { + if (type == kZeroType && length == 0) { + // Skip zero length record without reporting any drops since + // such records are produced by the mmap based writing code in + // env_posix.cc that preallocates file regions.
+ buffer_.clear(); + return kBadRecord; + } + + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); + if (actual_crc != expected_crc) { + // Drop the rest of the buffer since "length" itself may have + // been corrupted and if we trust it, we could find some + // fragment of a real log record that just happens to look + // like a valid log record. + ReportDrop(buffer_.size(), "checksum mismatch"); + buffer_.clear(); + return kBadRecord; + } + } + + buffer_.remove_prefix(kHeaderSize + length); + *result = Slice(header + kHeaderSize, length); + return type; + } +} + +} +} diff --git a/leveldb/db/log_reader.h b/leveldb/db/log_reader.h new file mode 100644 index 0000000..baf1475 --- /dev/null +++ b/leveldb/db/log_reader.h @@ -0,0 +1,75 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ +#define STORAGE_LEVELDB_DB_LOG_READER_H_ + +#include "db/log_format.h" +#include "leveldb/slice.h" +#include "leveldb/status.h" + +namespace leveldb { + +class SequentialFile; + +namespace log { + +class Reader { + public: + // Interface for reporting errors. + class Reporter { + public: + virtual ~Reporter(); + + // Some corruption was detected. "size" is the approximate number + // of bytes dropped due to the corruption. + virtual void Corruption(size_t bytes, const Status& status) = 0; + }; + + // Create a reader that will return log records from "*file". + // "*file" must remain live while this Reader is in use. + // + // If "reporter" is non-NULL, it is notified whenever some data is + // dropped due to a detected corruption. "*reporter" must remain + // live while this Reader is in use. + // + // If "checksum" is true, verify checksums if available. + Reader(SequentialFile* file, Reporter* reporter, bool checksum); + + ~Reader(); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. May use + // "*scratch" as temporary storage. The contents filled in *record + // will only be valid until the next mutating operation on this + // reader or the next mutation to *scratch. + bool ReadRecord(Slice* record, std::string* scratch); + + private: + SequentialFile* const file_; + Reporter* const reporter_; + bool const checksum_; + char* const backing_store_; + Slice buffer_; + bool eof_; // Last Read() indicated EOF by returning < kBlockSize + + // Extend record types with the following special values + enum { + kEof = kMaxRecordType + 1, + kBadRecord = kMaxRecordType + 2 + }; + + // Return type, or one of the preceding special values + unsigned int ReadPhysicalRecord(Slice* result); + void ReportDrop(size_t bytes, const char* reason); + + // No copying allowed + Reader(const Reader&); + void operator=(const Reader&); +}; + +} +} + +#endif // STORAGE_LEVELDB_DB_LOG_READER_H_ diff --git a/leveldb/db/log_test.cc b/leveldb/db/log_test.cc new file mode 100644 index 0000000..025a5ff --- /dev/null +++ b/leveldb/db/log_test.cc @@ -0,0 +1,361 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "leveldb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace leveldb { +namespace log { + +// Construct a string of the specified length made out of the supplied +// partial string. +static std::string BigString(const std::string& partial_string, size_t n) { + std::string result; + while (result.size() < n) { + result.append(partial_string); + } + result.resize(n); + return result; +} + +// Construct a string from a number +static std::string NumberString(int n) { + char buf[50]; + snprintf(buf, sizeof(buf), "%d.", n); + return std::string(buf); +} + +// Return a skewed potentially long string +static std::string RandomSkewedString(int i, Random* rnd) { + return BigString(NumberString(i), rnd->Skewed(17)); +} + +class LogTest { + private: + class StringDest : public WritableFile { + public: + std::string contents_; + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + virtual Status Append(const Slice& slice) { + contents_.append(slice.data(), slice.size()); + return Status::OK(); + } + }; + + class StringSource : public SequentialFile { + public: + Slice contents_; + bool force_error_; + bool returned_partial_; + StringSource() : force_error_(false), returned_partial_(false) { } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; + ASSERT_EQ(kBlockSize, n); + + if (force_error_) { + force_error_ = false; + returned_partial_ = true; + return Status::Corruption("read error"); + } + + if (contents_.size() < n) { + n = contents_.size(); + returned_partial_ = true; + } + *result = Slice(contents_.data(), n); + contents_.remove_prefix(n); + return Status::OK(); + } + }; + + class ReportCollector : public Reader::Reporter { + public: + size_t dropped_bytes_; + std::string message_; + + ReportCollector() : dropped_bytes_(0) { } + virtual void Corruption(size_t bytes, const Status& status) { + dropped_bytes_ += bytes; + message_.append(status.ToString()); + } + }; + + StringDest dest_; + StringSource source_; + ReportCollector report_; + bool reading_; + Writer writer_; + Reader reader_; + + public: + LogTest() : reading_(false), + writer_(&dest_), + reader_(&source_, &report_, true/*checksum*/) { + } + + void Write(const std::string& msg) { + ASSERT_TRUE(!reading_) << "Write() after starting to read"; + writer_.AddRecord(Slice(msg)); + } + + size_t WrittenBytes() const { + return dest_.contents_.size(); + } + + std::string Read() { + if (!reading_) { + reading_ = true; + source_.contents_ = Slice(dest_.contents_); + } + std::string scratch; + Slice record; + if (reader_.ReadRecord(&record, &scratch)) { + return record.ToString(); + } else { + return "EOF"; + } + } + + void IncrementByte(int offset, int delta) { + dest_.contents_[offset] += delta; + } + + void SetByte(int offset, char new_byte) { + dest_.contents_[offset] = new_byte; + } + + void ShrinkSize(int bytes) { + dest_.contents_.resize(dest_.contents_.size() - bytes); + } + + void FixChecksum(int header_offset, int len) { + // Compute crc of type/len/data + uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len); + crc = crc32c::Mask(crc); + EncodeFixed32(&dest_.contents_[header_offset], crc); + } + + void ForceError() { + source_.force_error_ = true; + } + + size_t DroppedBytes() const { + return 
report_.dropped_bytes_; + } + + // Returns OK iff recorded error message contains "msg" + std::string MatchError(const std::string& msg) const { + if (report_.message_.find(msg) == std::string::npos) { + return report_.message_; + } else { + return "OK"; + } + } +}; + +TEST(LogTest, Empty) { + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ReadWrite) { + Write("foo"); + Write("bar"); + Write(""); + Write("xxxx"); + ASSERT_EQ("foo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("xxxx", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + +TEST(LogTest, ManyBlocks) { + for (int i = 0; i < 100000; i++) { + Write(NumberString(i)); + } + for (int i = 0; i < 100000; i++) { + ASSERT_EQ(NumberString(i), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, Fragmentation) { + Write("small"); + Write(BigString("medium", 50000)); + Write(BigString("large", 100000)); + ASSERT_EQ("small", Read()); + ASSERT_EQ(BigString("medium", 50000), Read()); + ASSERT_EQ(BigString("large", 100000), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, MarginalTrailer) { + // Make a trailer that is exactly the same length as an empty record. + const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ShortTrailer) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, AlignedEof) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, RandomRead) { + const int N = 500; + Random write_rnd(301); + for (int i = 0; i < N; i++) { + Write(RandomSkewedString(i, &write_rnd)); + } + Random read_rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +// Tests of all the error paths in log_reader.cc follow: + +TEST(LogTest, ReadError) { + Write("foo"); + ForceError(); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +TEST(LogTest, BadRecordType) { + Write("foo"); + // Type is stored in header[6] + IncrementByte(6, 100); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("unknown record type")); +} + +TEST(LogTest, TruncatedTrailingRecord) { + Write("foo"); + ShrinkSize(4); // Drop all payload as well as a header byte + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(kHeaderSize - 1, DroppedBytes()); + ASSERT_EQ("OK", MatchError("truncated record at end of file")); +} + +TEST(LogTest, BadLength) { + Write("foo"); + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(kHeaderSize + 2, DroppedBytes()); + ASSERT_EQ("OK", MatchError("bad record length")); +} + +TEST(LogTest, ChecksumMismatch) { + Write("foo"); + IncrementByte(0, 10); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(10, DroppedBytes()); + ASSERT_EQ("OK", MatchError("checksum mismatch")); +} + +TEST(LogTest, UnexpectedMiddleType) { + 
Write("foo"); + SetByte(6, kMiddleType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedLastType) { + Write("foo"); + SetByte(6, kLastType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedFullType) { + Write("foo"); + Write("bar"); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, UnexpectedFirstType) { + Write("foo"); + Write(BigString("bar", 100000)); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ(BigString("bar", 100000), Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, ErrorJoinsRecords) { + // Consider two fragmented records: + // first(R1) last(R1) first(R2) last(R2) + // where the middle two fragments disappear. We do not want + // first(R1),last(R2) to get joined and returned as a valid record. + + // Write records that span two blocks + Write(BigString("foo", kBlockSize)); + Write(BigString("bar", kBlockSize)); + Write("correct"); + + // Wipe the middle block + for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) { + SetByte(offset, 'x'); + } + + ASSERT_EQ("correct", Read()); + ASSERT_EQ("EOF", Read()); + const int dropped = DroppedBytes(); + ASSERT_LE(dropped, 2*kBlockSize + 100); + ASSERT_GE(dropped, 2*kBlockSize); +} + +} +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/db/log_writer.cc b/leveldb/db/log_writer.cc new file mode 100644 index 0000000..1696851 --- /dev/null +++ b/leveldb/db/log_writer.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_writer.h" + +#include <stdint.h> +#include "leveldb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace leveldb { +namespace log { + +Writer::Writer(WritableFile* dest) + : dest_(dest), + block_offset_(0) { + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast<char>(i); + type_crc_[i] = crc32c::Value(&t, 1); + } +} + +Writer::~Writer() { +} + +Status Writer::AddRecord(const Slice& slice) { + const char* ptr = slice.data(); + size_t left = slice.size(); + + // Fragment the record if necessary and emit it. Note that if slice + // is empty, we still want to iterate once to emit a single + // zero-length record + Status s; + do { + const int leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < kHeaderSize) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize being 7) + assert(kHeaderSize == 7); + dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); + } + block_offset_ = 0; + } + + // Invariant: we never leave < kHeaderSize bytes in a block. + assert(kBlockSize - block_offset_ - kHeaderSize >= 0); + + const size_t avail = kBlockSize - block_offset_ - kHeaderSize; + const size_t fragment_length = (left < avail) ? left : avail; + + RecordType type; + const bool begin = (ptr == slice.data()); + const bool end = (left == fragment_length); + if (begin && end) { + type = kFullType; + } else if (begin) { + type = kFirstType; + } else if (end) { + type = kLastType; + } else { + type = kMiddleType; + } + + s = EmitPhysicalRecord(type, ptr, fragment_length); + ptr += fragment_length; + left -= fragment_length; + } while (s.ok() && left > 0); + return s; +} + +Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { + assert(n <= 0xffff); // Must fit in two bytes + assert(block_offset_ + kHeaderSize + n <= kBlockSize); + + // Format the header + char buf[kHeaderSize]; + buf[4] = static_cast<char>(n & 0xff); + buf[5] = static_cast<char>(n >> 8); + buf[6] = static_cast<char>(t); + + // Compute the crc of the record type and the payload. + uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); + crc = crc32c::Mask(crc); // Adjust for storage + EncodeFixed32(buf, crc); + + // Write the header and the payload + Status s = dest_->Append(Slice(buf, kHeaderSize)); + if (s.ok()) { + s = dest_->Append(Slice(ptr, n)); + if (s.ok()) { + s = dest_->Flush(); + } + } + block_offset_ += kHeaderSize + n; + return s; +} + +} +} diff --git a/leveldb/db/log_writer.h b/leveldb/db/log_writer.h new file mode 100644 index 0000000..d3cf27d --- /dev/null +++ b/leveldb/db/log_writer.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_ +#define STORAGE_LEVELDB_DB_LOG_WRITER_H_ + +#include <stdint.h> +#include "db/log_format.h" +#include "leveldb/slice.h" +#include "leveldb/status.h" + +namespace leveldb { + +class WritableFile; + +namespace log { + +class Writer { + public: + // Create a writer that will append data to "*dest". + // "*dest" must be initially empty. + // "*dest" must remain live while this Writer is in use. + explicit Writer(WritableFile* dest); + ~Writer(); + + Status AddRecord(const Slice& slice); + + private: + WritableFile* dest_; + int block_offset_; // Current offset in block + + // crc32c values for all supported record types. These are + // pre-computed to reduce the overhead of computing the crc of the + // record type stored in the header. + uint32_t type_crc_[kMaxRecordType + 1]; + + Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); + + // No copying allowed + Writer(const Writer&); + void operator=(const Writer&); +}; + +} +} + +#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_ diff --git a/leveldb/db/memtable.cc b/leveldb/db/memtable.cc new file mode 100644 index 0000000..a3b618a --- /dev/null +++ b/leveldb/db/memtable.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/memtable.h" +#include "db/dbformat.h" +#include "leveldb/comparator.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "util/coding.h" + +namespace leveldb { + +static Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len; + const char* p = data; + p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted + return Slice(p, len); +} + +MemTable::MemTable(const InternalKeyComparator& cmp) + : comparator_(cmp), + table_(comparator_, &arena_) { +} + +MemTable::~MemTable() { +} + +size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); } + +int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) + const { + // Internal keys are encoded as length-prefixed strings. + Slice a = GetLengthPrefixedSlice(aptr); + Slice b = GetLengthPrefixedSlice(bptr); + return comparator.Compare(a, b); +} + +// Encode a suitable internal key target for "target" and return it. +// Uses *scratch as scratch space, and the returned pointer will point +// into this scratch space. +static const char* EncodeKey(std::string* scratch, const Slice& target) { + scratch->clear(); + PutVarint32(scratch, target.size()); + scratch->append(target.data(), target.size()); + return scratch->data(); +} + +class MemTableIterator: public Iterator { + public: + explicit MemTableIterator(MemTable::Table* table) { + iter_ = new MemTable::Table::Iterator(table); + } + virtual ~MemTableIterator() { delete iter_; } + + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); } + virtual Slice value() const { + Slice key_slice = GetLengthPrefixedSlice(iter_->key()); + return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + } + + virtual Status status() const { return Status::OK(); } + + private: + MemTable::Table::Iterator* iter_; + std::string tmp_; // For passing to EncodeKey + + // No copying allowed + MemTableIterator(const MemTableIterator&); + void operator=(const MemTableIterator&); +}; + +Iterator* MemTable::NewIterator() { + return new MemTableIterator(&table_); +} + +void MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, + const Slice& value) { + // Format of an entry is concatenation of: + // key_size : varint32 of internal_key.size() + // key bytes : char[internal_key.size()] + // value_size : varint32 of value.size() + // value bytes : char[value.size()] + size_t key_size = key.size(); + size_t val_size = value.size(); + size_t internal_key_size = key_size + 8; + const size_t encoded_len = + VarintLength(internal_key_size) + internal_key_size + + VarintLength(val_size) + val_size; + char* buf = arena_.Allocate(encoded_len); + char* p = EncodeVarint32(buf, internal_key_size); + memcpy(p, key.data(), key_size); + p += key_size; + EncodeFixed64(p, (s << 8) | type); + p += 8; + p = EncodeVarint32(p, val_size); + memcpy(p, value.data(), val_size); + assert((p + val_size) - buf == encoded_len); + table_.Insert(buf); +} + +} diff --git a/leveldb/db/memtable.h b/leveldb/db/memtable.h new file mode 100644 index 0000000..45b3342 --- /dev/null +++ b/leveldb/db/memtable.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_ +#define STORAGE_LEVELDB_DB_MEMTABLE_H_ + +#include <string> +#include "leveldb/db.h" +#include "db/dbformat.h" +#include "db/skiplist.h" +#include "util/arena.h" + +namespace leveldb { + +class InternalKeyComparator; +class Mutex; +class MemTableIterator; + +class MemTable { + public: + explicit MemTable(const InternalKeyComparator& comparator); + ~MemTable(); + + // Returns an estimate of the number of bytes of data in use by this + // data structure. + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + size_t ApproximateMemoryUsage(); + + // Return an iterator that yields the contents of the memtable. + // + // The caller must ensure that the underlying MemTable remains live + // while the returned iterator is live. The keys returned by this + // iterator are internal keys encoded by AppendInternalKey in the + // db/format.{h,cc} module. + Iterator* NewIterator(); + + // Add an entry into memtable that maps key to value at the + // specified sequence number and with the specified type. + // Typically value will be empty if type==kTypeDeletion. + void Add(SequenceNumber seq, ValueType type, + const Slice& key, + const Slice& value); + + private: + struct KeyComparator { + const InternalKeyComparator comparator; + explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } + int operator()(const char* a, const char* b) const; + }; + friend class MemTableIterator; + friend class MemTableBackwardIterator; + + typedef SkipList<const char*, KeyComparator> Table; + + KeyComparator comparator_; + Arena arena_; + Table table_; + + // No copying allowed + MemTable(const MemTable&); + void operator=(const MemTable&); +}; + +} + +#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_ diff --git a/leveldb/db/repair.cc b/leveldb/db/repair.cc new file mode 100644 index 0000000..c8e7b9e --- /dev/null +++ b/leveldb/db/repair.cc @@ -0,0 +1,380 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// We recover the contents of the descriptor from the other files we find. +// (1) Any log files are first converted to tables +// (2) We scan every table to compute +// (a) smallest/largest for the table +// (b) largest sequence number in the table +// (3) We generate descriptor contents: +// - log number is set to zero +// - next-file-number is set to 1 + largest file number we found +// - last-sequence-number is set to largest sequence# found across +// all tables (see 2b) +// - compaction pointers are cleared +// - every table file is added at level 0 +// +// Possible optimization 1: +// (a) Compute total size and use to pick appropriate max-level M +// (b) Sort tables by largest sequence# in the table +// (c) For each table: if it overlaps earlier table, place in level-0, +// else place in level-M. +// Possible optimization 2: +// Store per-table metadata (smallest, largest, largest-seq#, ...) +// in the table's meta section to speed up ScanTable.
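For orientation, a minimal sketch of how this recovery path is reached (illustrative only; RepairDB() is the public entry point declared in leveldb/db.h and defined at the bottom of this file, and the database path below is a placeholder):

  #include <stdio.h>
  #include "leveldb/db.h"

  int main() {
    leveldb::Options options;  // default Env, comparator, etc.
    leveldb::Status s = leveldb::RepairDB("/tmp/example_db", options);
    if (!s.ok()) {
      fprintf(stderr, "repair failed: %s\n", s.ToString().c_str());
      return 1;
    }
    return 0;  // a fresh MANIFEST now describes the recovered tables
  }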
+ +#include "db/builder.h" +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "db/write_batch_internal.h" +#include "leveldb/comparator.h" +#include "leveldb/db.h" +#include "leveldb/env.h" + +namespace leveldb { + +namespace { + +class Repairer { + public: + Repairer(const std::string& dbname, const Options& options) + : dbname_(dbname), + env_(options.env), + icmp_(options.comparator), + options_(SanitizeOptions(dbname, &icmp_, options)), + owns_info_log_(options_.info_log != options.info_log), + next_file_number_(1) { + // TableCache can be small since we expect each table to be opened once. + table_cache_ = new TableCache(dbname_, &options_, 10); + } + + ~Repairer() { + delete table_cache_; + if (owns_info_log_) { + delete options_.info_log; + } + } + + Status Run() { + Status status = FindFiles(); + if (status.ok()) { + ConvertLogFilesToTables(); + ExtractMetaData(); + status = WriteDescriptor(); + } + if (status.ok()) { + unsigned long long bytes = 0; + for (size_t i = 0; i < tables_.size(); i++) { + bytes += tables_[i].meta.file_size; + } + Log(env_, options_.info_log, + "**** Repaired leveldb %s; " + "recovered %d files; %llu bytes. " + "Some data may have been lost. " + "****", + dbname_.c_str(), + static_cast(tables_.size()), + bytes); + } + return status; + } + + private: + struct TableInfo { + FileMetaData meta; + SequenceNumber max_sequence; + }; + + std::string const dbname_; + Env* const env_; + InternalKeyComparator const icmp_; + Options const options_; + bool owns_info_log_; + TableCache* table_cache_; + VersionEdit edit_; + + std::vector manifests_; + std::vector table_numbers_; + std::vector logs_; + std::vector tables_; + uint64_t next_file_number_; + + Status FindFiles() { + std::vector filenames; + Status status = env_->GetChildren(dbname_, &filenames); + if (!status.ok()) { + return status; + } + if (filenames.empty()) { + return Status::IOError(dbname_, "repair found no files"); + } + + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + if (type == kDescriptorFile) { + manifests_.push_back(filenames[i]); + } else { + if (number + 1 > next_file_number_) { + next_file_number_ = number + 1; + } + if (type == kLogFile) { + logs_.push_back(number); + } else if (type == kTableFile) { + table_numbers_.push_back(number); + } else { + // Ignore other files + } + } + } + } + return status; + } + + void ConvertLogFilesToTables() { + for (size_t i = 0; i < logs_.size(); i++) { + std::string logname = LogFileName(dbname_, logs_[i]); + Status status = ConvertLogToTable(logs_[i]); + if (!status.ok()) { + Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s", + (unsigned long long) logs_[i], + status.ToString().c_str()); + } + ArchiveFile(logname); + } + } + + Status ConvertLogToTable(uint64_t log) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + WritableFile* info_log; + uint64_t lognum; + virtual void Corruption(size_t bytes, const Status& s) { + // We print error messages for corruption, but continue repairing. 
+        Log(env, info_log, "Log #%llu: dropping %d bytes; %s",
+            (unsigned long long) lognum,
+            static_cast<int>(bytes),
+            s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(dbname_, log);
+    SequentialFile* lfile;
+    Status status = env_->NewSequentialFile(logname, &lfile);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = options_.info_log;
+    reporter.lognum = log;
+    // We intentionally make log::Reader do checksumming so that
+    // corruptions cause entire commits to be skipped instead of
+    // propagating bad information (like overly large sequence
+    // numbers).
+    log::Reader reader(lfile, &reporter, false/*do not checksum*/);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    MemTable mem(icmp_);
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      status = WriteBatchInternal::InsertInto(&batch, &mem);
+      if (status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        Log(env_, options_.info_log, "Log #%llu: ignoring %s",
+            (unsigned long long) log,
+            status.ToString().c_str());
+        status = Status::OK();  // Keep going with rest of file
+      }
+    }
+    delete lfile;
+
+    // We ignore any version edits generated by the conversion to a Table
+    // since ExtractMetaData() will also generate edits.
+    VersionEdit skipped;
+    FileMetaData meta;
+    meta.number = next_file_number_++;
+    Iterator* iter = mem.NewIterator();
+    status = BuildTable(dbname_, env_, options_, table_cache_, iter,
+                        &meta, &skipped);
+    delete iter;
+    if (status.ok()) {
+      if (meta.file_size > 0) {
+        table_numbers_.push_back(meta.number);
+      }
+    }
+    Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+        (unsigned long long) log,
+        counter,
+        (unsigned long long) meta.number,
+        status.ToString().c_str());
+    return status;
+  }
+
+  void ExtractMetaData() {
+    std::vector<TableInfo> kept;
+    for (size_t i = 0; i < table_numbers_.size(); i++) {
+      TableInfo t;
+      t.meta.number = table_numbers_[i];
+      Status status = ScanTable(&t);
+      if (!status.ok()) {
+        std::string fname = TableFileName(dbname_, table_numbers_[i]);
+        Log(env_, options_.info_log, "Table #%llu: ignoring %s",
+            (unsigned long long) table_numbers_[i],
+            status.ToString().c_str());
+        ArchiveFile(fname);
+      } else {
+        tables_.push_back(t);
+      }
+    }
+  }
+
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(dbname_, t->meta.number);
+    int counter = 0;
+    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (status.ok()) {
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(), t->meta.number, t->meta.file_size);
+      bool empty = true;
+      ParsedInternalKey parsed;
+      t->max_sequence = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          Log(env_, options_.info_log, "Table #%llu: unparsable key %s",
+              (unsigned long long) t->meta.number,
+              EscapeString(key).c_str());
+          continue;
+        }
+
+        counter++;
+        if (empty) {
+          empty = false;
+          t->meta.smallest.DecodeFrom(key);
+        }
+        t->meta.largest.DecodeFrom(key);
+        if (parsed.sequence > t->max_sequence) {
+          t->max_sequence = parsed.sequence;
+        }
+      }
+      if (!iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+    }
+    Log(env_, options_.info_log, "Table #%llu: %d entries %s",
+        (unsigned long long) t->meta.number,
+        counter,
+        status.ToString().c_str());
+    return status;
+  }
+
+  Status WriteDescriptor() {
+    std::string tmp = TempFileName(dbname_, 1);
+    WritableFile* file;
+    Status status = env_->NewWritableFile(tmp, &file);
+    if (!status.ok()) {
+      return status;
+    }
+
+    SequenceNumber max_sequence = 0;
+    for (size_t i = 0; i < tables_.size(); i++) {
+      if (max_sequence < tables_[i].max_sequence) {
+        max_sequence = tables_[i].max_sequence;
+      }
+    }
+
+    edit_.SetComparatorName(icmp_.user_comparator()->Name());
+    edit_.SetLogNumber(0);
+    edit_.SetNextFile(next_file_number_);
+    edit_.SetLastSequence(max_sequence);
+
+    for (size_t i = 0; i < tables_.size(); i++) {
+      // TODO(opt): separate out into multiple levels
+      const TableInfo& t = tables_[i];
+      edit_.AddFile(0, t.meta.number, t.meta.file_size,
+                    t.meta.smallest, t.meta.largest);
+    }
+
+    //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
+    {
+      log::Writer log(file);
+      std::string record;
+      edit_.EncodeTo(&record);
+      status = log.AddRecord(record);
+    }
+    if (status.ok()) {
+      status = file->Close();
+    }
+    delete file;
+    file = NULL;
+
+    if (!status.ok()) {
+      env_->DeleteFile(tmp);
+    } else {
+      // Discard older manifests
+      for (size_t i = 0; i < manifests_.size(); i++) {
+        ArchiveFile(dbname_ + "/" + manifests_[i]);
+      }
+
+      // Install new manifest
+      status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
+      if (status.ok()) {
+        status = SetCurrentFile(env_, dbname_, 1);
+      } else {
+        env_->DeleteFile(tmp);
+      }
+    }
+    return status;
+  }
+
+  void ArchiveFile(const std::string& fname) {
+    // Move into another directory.  E.g., for
+    //    dir/foo
+    // rename to
+    //    dir/lost/foo
+    const char* slash = strrchr(fname.c_str(), '/');
+    std::string new_dir;
+    if (slash != NULL) {
+      new_dir.assign(fname.data(), slash - fname.data());
+    }
+    new_dir.append("/lost");
+    env_->CreateDir(new_dir);  // Ignore error
+    std::string new_file = new_dir;
+    new_file.append("/");
+    new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
+    Status s = env_->RenameFile(fname, new_file);
+    Log(env_, options_.info_log, "Archiving %s: %s\n",
+        fname.c_str(), s.ToString().c_str());
+  }
+};
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+  Repairer repairer(dbname, options);
+  return repairer.Run();
+}
+
+}
diff --git a/leveldb/db/skiplist.h b/leveldb/db/skiplist.h
new file mode 100644
index 0000000..be39354
--- /dev/null
+++ b/leveldb/db/skiplist.h
@@ -0,0 +1,378 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress.  Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed.  This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+
+#include <assert.h>
+#include <stdlib.h>
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+class Arena;
+
+template <typename Key, class Comparator>
+class SkipList {
+ private:
+  struct Node;
+
+ public:
+  // Create a new SkipList object that will use "cmp" for comparing keys,
+  // and will allocate memory using "*arena".  Objects allocated in the arena
+  // must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Arena* arena);
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  void Insert(const Key& key);
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const Key& key) const;
+
+  // Iteration over the contents of a skip list
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(const SkipList* list);
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const Key& key() const;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next();
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev();
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Key& target);
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst();
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast();
+
+   private:
+    const SkipList* list_;
+    Node* node_;
+    // Intentionally copyable
+  };
+
+ private:
+  enum { kMaxHeight = 12 };
+
+  // Immutable after construction
+  Comparator const compare_;
+  Arena* const arena_;    // Arena used for allocations of nodes
+
+  Node* const head_;
+
+  // Modified only by Insert().  Read racily by readers, but stale
+  // values are ok.
+  port::AtomicPointer max_height_;   // Height of the entire list
+
+  inline int GetMaxHeight() const {
+    return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load());
+  }
+
+  // Read/written only by Insert().
+  Random rnd_;
+
+  Node* NewNode(const Key& key, int height);
+  int RandomHeight();
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  // Return true if key is greater than the data stored in "n"
+  bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+  // Return the earliest node that comes at or after key.
+  // Return NULL if there is no such node.
+  //
+  // If prev is non-NULL, fills prev[level] with pointer to previous
+  // node at "level" for every level in [0..max_height_-1].
+  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+  // Return the latest node with a key < key.
+  // Return head_ if there is no such node.
+  Node* FindLessThan(const Key& key) const;
+
+  // Return the last node in the list.
+  // Return head_ if list is empty.
+  Node* FindLast() const;
+
+  // No copying allowed
+  SkipList(const SkipList&);
+  void operator=(const SkipList&);
+};
+
+// Implementation details follow
+template <typename Key, class Comparator>
+struct SkipList<Key, Comparator>::Node {
+  explicit Node(const Key& k) : key(k) { }
+
+  Key const key;
+
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+  }
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_[n].Release_Store(x);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+  }
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    next_[n].NoBarrier_Store(x);
+  }
+
+ private:
+  // Array of length equal to the node height.  next_[0] is lowest level link.
+  port::AtomicPointer next_[1];
+};
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::NewNode(const Key& key, int height) {
+  char* mem = arena_->AllocateAligned(
+      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  return new (mem) Node(key);
+}
+
+template <typename Key, class Comparator>
+inline SkipList<Key, Comparator>::Iterator::Iterator(const SkipList* list) {
+  list_ = list;
+  node_ = NULL;
+}
+
+template <typename Key, class Comparator>
+inline bool SkipList<Key, Comparator>::Iterator::Valid() const {
+  return node_ != NULL;
+}
+
+template <typename Key, class Comparator>
+inline const Key& SkipList<Key, Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->key;
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->key);
+  if (node_ == list_->head_) {
+    node_ = NULL;
+  }
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Seek(const Key& target) {
+  node_ = list_->FindGreaterOrEqual(target, NULL);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToFirst() {
+  node_ = list_->head_->Next(0);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = NULL;
+  }
+}
+
+template <typename Key, class Comparator>
+int SkipList<Key, Comparator>::RandomHeight() {
+  // Increase height with probability 1 in kBranching
+  static const unsigned int kBranching = 4;
+  int height = 1;
+  while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight);
+  return height;
+}
+
+template <typename Key, class Comparator>
+bool SkipList<Key, Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // NULL n is considered infinite
+  return (n != NULL) && (compare_(n->key, key) < 0);
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (KeyIsAfterNode(key, next)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      if (prev != NULL) prev[level] = x;
+      if (level == 0) {
+        return next;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindLessThan(const Key& key) const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == NULL || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == NULL) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+SkipList<Key, Comparator>::SkipList(Comparator cmp, Arena* arena)
+    : compare_(cmp),
+      arena_(arena),
+      head_(NewNode(0 /* any key will do */, kMaxHeight)),
+      max_height_(reinterpret_cast<void*>(1)),
+      rnd_(0xdeadbeef) {
+  for (int i = 0; i < kMaxHeight; i++) {
+    head_->SetNext(i, NULL);
+  }
+}
+
+template <typename Key, class Comparator>
+void SkipList<Key, Comparator>::Insert(const Key& key) {
+  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+  // here since Insert() is externally synchronized.
+  Node* prev[kMaxHeight];
+  Node* x = FindGreaterOrEqual(key, prev);
+
+  // Our data structure does not allow duplicate insertion
+  assert(x == NULL || !Equal(key, x->key));
+
+  int height = RandomHeight();
+  if (height > GetMaxHeight()) {
+    for (int i = GetMaxHeight(); i < height; i++) {
+      prev[i] = head_;
+    }
+    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+    // It is ok to mutate max_height_ without any synchronization
+    // with concurrent readers.  A concurrent reader that observes
+    // the new value of max_height_ will see either the old value of
+    // new level pointers from head_ (NULL), or a new value set in
+    // the loop below.  In the former case the reader will
+    // immediately drop to the next level since NULL sorts after all
+    // keys.  In the latter case the reader will use the new node.
+    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+  }
+
+  x = NewNode(key, height);
+  for (int i = 0; i < height; i++) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
+    prev[i]->SetNext(i, x);
+  }
+}
+
+template <typename Key, class Comparator>
+bool SkipList<Key, Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key, NULL);
+  if (x != NULL && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}
diff --git a/leveldb/db/skiplist_test.cc b/leveldb/db/skiplist_test.cc
new file mode 100644
index 0000000..5f9ec0d
--- /dev/null
+++ b/leveldb/db/skiplist_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "leveldb/env.h"
+#include "util/arena.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+typedef uint64_t Key;
+
+struct Comparator {
+  int operator()(const Key& a, const Key& b) const {
+    if (a < b) {
+      return -1;
+    } else if (a > b) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+};
+
+class SkipTest { };
+
+TEST(SkipTest, Empty) {
+  Arena arena;
+  Comparator cmp;
+  SkipList<Key, Comparator> list(cmp, &arena);
+  ASSERT_TRUE(!list.Contains(10));
+
+  SkipList<Key, Comparator>::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  iter.Seek(100);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToLast();
+  ASSERT_TRUE(!iter.Valid());
+}
+
+TEST(SkipTest, InsertAndLookup) {
+  const int N = 2000;
+  const int R = 5000;
+  Random rnd(1000);
+  std::set<Key> keys;
+  Arena arena;
+  Comparator cmp;
+  SkipList<Key, Comparator> list(cmp, &arena);
+  for (int i = 0; i < N; i++) {
+    Key key = rnd.Next() % R;
+    if (keys.insert(key).second) {
+      list.Insert(key);
+    }
+  }
+
+  for (int i = 0; i < R; i++) {
+    if (list.Contains(i)) {
+      ASSERT_EQ(keys.count(i), 1);
+    } else {
+      ASSERT_EQ(keys.count(i), 0);
+    }
+  }
+
+  // Simple iterator tests
+  {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    ASSERT_TRUE(!iter.Valid());
+
+    iter.Seek(0);
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToFirst();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToLast();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), iter.key());
+  }
+
+  // Forward iteration test
+  for (int i = 0; i < R; i++) {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    iter.Seek(i);
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.lower_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.end()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*model_iter, iter.key());
+        ++model_iter;
+        iter.Next();
+      }
+    }
+  }
+
+  // Backward iteration test
+  {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    iter.SeekToLast();
+
+    // Compare against model iterator
+    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
+         model_iter != keys.rend();
+         ++model_iter) {
+      ASSERT_TRUE(iter.Valid());
+      ASSERT_EQ(*model_iter, iter.key());
+      iter.Prev();
+    }
+    ASSERT_TRUE(!iter.Valid());
+  }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructed.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
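
The bit layout described above can be checked in isolation (an illustrative sketch, not part of the patch; PackKey is a stand-in for ConcurrentTest::MakeKey below, with the hash field passed in directly rather than computed):

    #include <assert.h>
    #include <stdint.h>

    // key: top 24 bits, gen: middle 32 bits, hash: low 8 bits.
    static uint64_t PackKey(uint64_t k, uint64_t g, uint64_t h) {
      return (k << 40) | (g << 8) | (h & 0xff);
    }

    int main() {
      uint64_t v = PackKey(3, 5, 0xab);
      assert((v >> 40) == 3);                  // key()
      assert(((v >> 8) & 0xffffffffu) == 5);   // gen()
      assert((v & 0xff) == 0xab);              // hash()
      return 0;
    }
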
+class ConcurrentTest {
+ private:
+  static const uint32_t K = 4;
+
+  static uint64_t key(Key key) { return (key >> 40); }
+  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+  static uint64_t hash(Key key) { return key & 0xff; }
+
+  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+    uint64_t data[2] = { k, g };
+    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+  }
+
+  static Key MakeKey(uint64_t k, uint64_t g) {
+    assert(sizeof(Key) == sizeof(uint64_t));
+    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
+    assert(g <= 0xffffffffu);
+    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+  }
+
+  static bool IsValidKey(Key k) {
+    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+  }
+
+  static Key RandomTarget(Random* rnd) {
+    switch (rnd->Next() % 10) {
+      case 0:
+        // Seek to beginning
+        return MakeKey(0, 0);
+      case 1:
+        // Seek to end
+        return MakeKey(K, 0);
+      default:
+        // Seek to middle
+        return MakeKey(rnd->Next() % K, 0);
+    }
+  }
+
+  // Per-key generation
+  struct State {
+    port::AtomicPointer generation[K];
+    void Set(int k, intptr_t v) {
+      generation[k].Release_Store(reinterpret_cast<void*>(v));
+    }
+    intptr_t Get(int k) {
+      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+    }
+
+    State() {
+      for (int k = 0; k < K; k++) {
+        Set(k, 0);
+      }
+    }
+  };
+
+  // Current state of the test
+  State current_;
+
+  Arena arena_;
+
+  // SkipList is not protected by mu_.  We just use a single writer
+  // thread to modify it.
+  SkipList<Key, Comparator> list_;
+
+ public:
+  ConcurrentTest() : list_(Comparator(), &arena_) { }
+
+  // REQUIRES: External synchronization
+  void WriteStep(Random* rnd) {
+    const uint32_t k = rnd->Next() % K;
+    const intptr_t g = current_.Get(k) + 1;
+    const Key key = MakeKey(k, g);
+    list_.Insert(key);
+    current_.Set(k, g);
+  }
+
+  void ReadStep(Random* rnd) {
+    // Remember the initial committed state of the skiplist.
+    State initial_state;
+    for (int k = 0; k < K; k++) {
+      initial_state.Set(k, current_.Get(k));
+    }
+
+    Key pos = RandomTarget(rnd);
+    SkipList<Key, Comparator>::Iterator iter(&list_);
+    iter.Seek(pos);
+    while (true) {
+      Key current;
+      if (!iter.Valid()) {
+        current = MakeKey(K, 0);
+      } else {
+        current = iter.key();
+        ASSERT_TRUE(IsValidKey(current)) << std::hex << current;
+      }
+      ASSERT_LE(pos, current) << "should not go backwards";
+
+      // Verify that everything in [pos,current) was not present in
+      // initial_state.
+      while (pos < current) {
+        ASSERT_LT(key(pos), K) << std::hex << pos;
+
+        // Note that generation 0 is never inserted, so it is ok if
+        // <*,0,*> is missing.
+        ASSERT_TRUE((gen(pos) == 0) ||
+                    (gen(pos) > initial_state.Get(key(pos)))
+                    ) << "key: " << key(pos)
+                      << "; gen: " << gen(pos)
+                      << "; initgen: "
+                      << initial_state.Get(key(pos));
+
+        // Advance to next key in the valid key space
+        if (key(pos) < key(current)) {
+          pos = MakeKey(key(pos) + 1, 0);
+        } else {
+          pos = MakeKey(key(pos), gen(pos) + 1);
+        }
+      }
+
+      if (!iter.Valid()) {
+        break;
+      }
+
+      if (rnd->Next() % 2) {
+        iter.Next();
+        pos = MakeKey(key(pos), gen(pos) + 1);
+      } else {
+        Key new_target = RandomTarget(rnd);
+        if (new_target > pos) {
+          pos = new_target;
+          iter.Seek(new_target);
+        }
+      }
+    }
+  }
+};
+const uint32_t ConcurrentTest::K;
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST(SkipTest, ConcurrentWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 10000; i++) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+class TestState {
+ public:
+  ConcurrentTest t_;
+  int seed_;
+  port::AtomicPointer quit_flag_;
+
+  enum ReaderState {
+    STARTING,
+    RUNNING,
+    DONE
+  };
+
+  explicit TestState(int s)
+      : seed_(s),
+        quit_flag_(NULL),
+        state_(STARTING),
+        state_cv_(&mu_) {}
+
+  void Wait(ReaderState s) {
+    mu_.Lock();
+    while (state_ != s) {
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+  void Change(ReaderState s) {
+    mu_.Lock();
+    state_ = s;
+    state_cv_.Signal();
+    mu_.Unlock();
+  }
+
+ private:
+  port::Mutex mu_;
+  ReaderState state_;
+  port::CondVar state_cv_;
+};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int i = 0; i < kSize; i++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-NULL arg will do
+    state.Wait(TestState::DONE);
+  }
+}
+
+TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
+TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/leveldb/db/snapshot.h b/leveldb/db/snapshot.h
new file mode 100644
index 0000000..9a90756
--- /dev/null
+++ b/leveldb/db/snapshot.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
+#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
+
+#include "leveldb/db.h"
+
+namespace leveldb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each Snapshot corresponds to a particular sequence number.
+class Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+
+ private:
+  friend class SnapshotList;
+
+  // Snapshot is kept in a doubly-linked circular list
+  Snapshot* prev_;
+  Snapshot* next_;
+
+  SnapshotList* list_;                 // just for sanity checks
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  Snapshot* oldest() const { assert(!empty()); return list_.next_; }
+  Snapshot* newest() const { assert(!empty()); return list_.prev_; }
+
+  const Snapshot* New(SequenceNumber seq) {
+    Snapshot* s = new Snapshot;
+    s->number_ = seq;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    return s;
+  }
+
+  void Delete(const Snapshot* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    delete s;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  Snapshot list_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_SNAPSHOT_H_
diff --git a/leveldb/db/table_cache.cc b/leveldb/db/table_cache.cc
new file mode 100644
index 0000000..325d707
--- /dev/null
+++ b/leveldb/db/table_cache.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+struct TableAndFile {
+  RandomAccessFile* file;
+  Table* table;
+};
+
+static void DeleteEntry(const Slice& key, void* value) {
+  TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
+  delete tf->table;
+  delete tf->file;
+  delete tf;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+TableCache::TableCache(const std::string& dbname,
+                       const Options* options,
+                       int entries)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      cache_(NewLRUCache(entries)) {
+}
+
+TableCache::~TableCache() {
+  delete cache_;
+}
+
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  uint64_t file_number,
+                                  uint64_t file_size,
+                                  Table** tableptr) {
+  if (tableptr != NULL) {
+    *tableptr = NULL;
+  }
+
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  Slice key(buf, sizeof(buf));
+  Cache::Handle* handle = cache_->Lookup(key);
+  if (handle == NULL) {
+    std::string fname = TableFileName(dbname_, file_number);
+    RandomAccessFile* file = NULL;
+    Table* table = NULL;
+    Status s = env_->NewRandomAccessFile(fname, &file);
+    if (s.ok()) {
+      s = Table::Open(*options_, file, file_size, &table);
+    }
+
+    if (!s.ok()) {
+      assert(table == NULL);
+      delete file;
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+      return NewErrorIterator(s);
+    }
+
+    TableAndFile* tf = new TableAndFile;
+    tf->file = file;
+    tf->table = table;
+    handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+  }
+
+  Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
+  Iterator* result = table->NewIterator(options);
+  result->RegisterCleanup(&UnrefEntry, cache_, handle);
+  if (tableptr != NULL) {
+    *tableptr = table;
+  }
+  return result;
+}
+
+void TableCache::Evict(uint64_t file_number) {
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  cache_->Erase(Slice(buf, sizeof(buf)));
+}
+
+}
diff --git a/leveldb/db/table_cache.h b/leveldb/db/table_cache.h
new file mode 100644
index 0000000..5376194
--- /dev/null
+++ b/leveldb/db/table_cache.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+
+#include <string>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "leveldb/cache.h"
+#include "leveldb/table.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+
+class TableCache {
+ public:
+  TableCache(const std::string& dbname, const Options* options, int entries);
+  ~TableCache();
+
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes).  If "tableptr" is
+  // non-NULL, also sets "*tableptr" to point to the Table object
+  // underlying the returned iterator, or NULL if no Table object underlies
+  // the returned iterator.  The returned "*tableptr" object is owned by
+  // the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
+  Iterator* NewIterator(const ReadOptions& options,
+                        uint64_t file_number,
+                        uint64_t file_size,
+                        Table** tableptr = NULL);
+
+  // Evict any entry for the specified file number
+  void Evict(uint64_t file_number);
+
+ private:
+  Env* const env_;
+  const std::string dbname_;
+  const Options* options_;
+  Cache* cache_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
diff --git a/leveldb/db/version_edit.cc b/leveldb/db/version_edit.cc
new file mode 100644
index 0000000..3941271
--- /dev/null
+++ b/leveldb/db/version_edit.cc
@@ -0,0 +1,268 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Tag numbers for serialized VersionEdit.  These numbers are written to
+// disk and should not be changed.
+enum Tag {
+  kComparator     = 1,
+  kLogNumber      = 2,
+  kNextFileNumber = 3,
+  kLastSequence   = 4,
+  kCompactPointer = 5,
+  kDeletedFile    = 6,
+  kNewFile        = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber  = 9,
+};
+
+void VersionEdit::Clear() {
+  comparator_.clear();
+  log_number_ = 0;
+  prev_log_number_ = 0;
+  last_sequence_ = 0;
+  next_file_number_ = 0;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_prev_log_number_ = false;
+  has_next_file_number_ = false;
+  has_last_sequence_ = false;
+  deleted_files_.clear();
+  new_files_.clear();
+}
+
+void VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32(dst, kLogNumber);
+    PutVarint64(dst, log_number_);
+  }
+  if (has_prev_log_number_) {
+    PutVarint32(dst, kPrevLogNumber);
+    PutVarint64(dst, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32(dst, kNextFileNumber);
+    PutVarint64(dst, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32(dst, kLastSequence);
+    PutVarint64(dst, last_sequence_);
+  }
+
+  for (size_t i = 0; i < compact_pointers_.size(); i++) {
+    PutVarint32(dst, kCompactPointer);
+    PutVarint32(dst, compact_pointers_[i].first);  // level
+    PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
+  }
+
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    PutVarint32(dst, kDeletedFile);
+    PutVarint32(dst, iter->first);   // level
+    PutVarint64(dst, iter->second);  // file number
+  }
+
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    PutVarint32(dst, kNewFile);
+    PutVarint32(dst, new_files_[i].first);  // level
+    PutVarint64(dst, f.number);
+    PutVarint64(dst, f.file_size);
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+  }
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice str;
+  if (GetLengthPrefixedSlice(input, &str)) {
+    dst->DecodeFrom(str);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+static bool GetLevel(Slice* input, int* level) {
+  uint32_t v;
+  if (GetVarint32(input, &v) &&
+      v < config::kNumLevels) {
+    *level = v;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+  Slice input = src;
+  const char* msg = NULL;
+  uint32_t tag;
+
+  // Temporary storage for parsing
+  int level;
+  uint64_t number;
+  FileMetaData f;
+  Slice str;
+  InternalKey key;
+
+  while (msg == NULL && GetVarint32(&input, &tag)) {
+    switch (tag) {
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kPrevLogNumber:
+        if (GetVarint64(&input, &prev_log_number_)) {
+          has_prev_log_number_ = true;
+        } else {
+          msg = "previous log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kCompactPointer:
+        if (GetLevel(&input, &level) &&
+            GetInternalKey(&input, &key)) {
+          compact_pointers_.push_back(std::make_pair(level, key));
+        } else {
+          msg = "compaction pointer";
+        }
+        break;
+
+      case kDeletedFile:
+        if (GetLevel(&input, &level) &&
+            GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          msg = "deleted file";
+        }
+        break;
+
+      case kNewFile:
+        if (GetLevel(&input, &level) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          msg = "new-file entry";
+        }
+        break;
+
+      default:
+        msg = "unknown tag";
+        break;
+    }
+  }
+
+  if (msg == NULL && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != NULL) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
+
+std::string VersionEdit::DebugString() const {
+  std::string r;
+  r.append("VersionEdit {");
+  if (has_comparator_) {
+    r.append("\n  Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n  LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_prev_log_number_) {
+    r.append("\n  PrevLogNumber: ");
+    AppendNumberTo(&r, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n  NextFile: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n  LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+  for (size_t i = 0; i < compact_pointers_.size(); i++) {
+    r.append("\n  CompactPointer: ");
+    AppendNumberTo(&r, compact_pointers_[i].first);
+    r.append(" '");
+    AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode());
+    r.append("'");
+  }
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    r.append("\n  DeleteFile: ");
+    AppendNumberTo(&r, iter->first);
+    r.append(" ");
+    AppendNumberTo(&r, iter->second);
+  }
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    r.append("\n  AddFile: ");
+    AppendNumberTo(&r, new_files_[i].first);
+    r.append(" ");
+    AppendNumberTo(&r, f.number);
+    r.append(" ");
+    AppendNumberTo(&r, f.file_size);
+    r.append(" '");
+    AppendEscapedStringTo(&r, f.smallest.Encode());
+    r.append("' .. '");
+    AppendEscapedStringTo(&r, f.largest.Encode());
+    r.append("'");
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+}
diff --git a/leveldb/db/version_edit.h b/leveldb/db/version_edit.h
new file mode 100644
index 0000000..ab874da
--- /dev/null
+++ b/leveldb/db/version_edit.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
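
To make the record format above concrete (an illustrative sketch, not part of the patch): EncodeTo() emits a sequence of varint32 tags, each followed by its tag-specific payload, so a VersionEdit carrying only a log number of 9 encodes to the two bytes 0x02 0x09 (tag kLogNumber = 2, then varint64(9)). The helper below is hypothetical and uses the PutVarint32/PutVarint64 routines from util/coding.h:

    #include <string>
    #include "util/coding.h"

    // Builds the same bytes EncodeTo() would produce for an edit that
    // only has a log number set.
    static std::string EncodeLogNumberOnly(uint64_t log_number) {
      std::string dst;
      leveldb::PutVarint32(&dst, 2);           // tag kLogNumber
      leveldb::PutVarint64(&dst, log_number);  // payload
      return dst;                              // "\x02\x09" when log_number == 9
    }
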
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+
+#include <set>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+class VersionSet;
+
+struct FileMetaData {
+  int refs;
+  uint64_t number;
+  uint64_t file_size;         // File size in bytes
+  InternalKey smallest;       // Smallest internal key served by table
+  InternalKey largest;        // Largest internal key served by table
+
+  FileMetaData() : refs(0), file_size(0) { }
+};
+
+class VersionEdit {
+ public:
+  VersionEdit() { Clear(); }
+  ~VersionEdit() { }
+
+  void Clear();
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  void SetPrevLogNumber(uint64_t num) {
+    has_prev_log_number_ = true;
+    prev_log_number_ = num;
+  }
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  void SetCompactPointer(int level, const InternalKey& key) {
+    compact_pointers_.push_back(std::make_pair(level, key));
+  }
+
+  // Add the specified file at the specified number.
+  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  void AddFile(int level, uint64_t file,
+               uint64_t file_size,
+               const InternalKey& smallest,
+               const InternalKey& largest) {
+    FileMetaData f;
+    f.number = file;
+    f.file_size = file_size;
+    f.smallest = smallest;
+    f.largest = largest;
+    new_files_.push_back(std::make_pair(level, f));
+  }
+
+  // Delete the specified "file" from the specified "level".
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.insert(std::make_pair(level, file));
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString() const;
+
+ private:
+  friend class VersionSet;
+
+  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+
+  std::string comparator_;
+  uint64_t log_number_;
+  uint64_t prev_log_number_;
+  uint64_t next_file_number_;
+  SequenceNumber last_sequence_;
+  bool has_comparator_;
+  bool has_log_number_;
+  bool has_prev_log_number_;
+  bool has_next_file_number_;
+  bool has_last_sequence_;
+
+  std::vector< std::pair<int, InternalKey> > compact_pointers_;
+  DeletedFileSet deleted_files_;
+  std::vector< std::pair<int, FileMetaData> > new_files_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_VERSION_EDIT_H_
diff --git a/leveldb/db/version_edit_test.cc b/leveldb/db/version_edit_test.cc
new file mode 100644
index 0000000..67959f7
--- /dev/null
+++ b/leveldb/db/version_edit_test.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+  std::string encoded, encoded2;
+  edit.EncodeTo(&encoded);
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  parsed.EncodeTo(&encoded2);
+  ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest { };
+
+TEST(VersionEditTest, EncodeDecode) {
+  static const uint64_t kBig = 1ull << 50;
+
+  VersionEdit edit;
+  for (int i = 0; i < 4; i++) {
+    TestEncodeDecode(edit);
+    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
+                 InternalKey("foo", kBig + 500 + i, kTypeValue),
+                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
+    edit.DeleteFile(4, kBig + 700 + i);
+    edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/leveldb/db/version_set.cc b/leveldb/db/version_set.cc
new file mode 100644
index 0000000..c439f49
--- /dev/null
+++ b/leveldb/db/version_set.cc
@@ -0,0 +1,1027 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <stdio.h>
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "leveldb/env.h"
+#include "leveldb/table_builder.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+static const int kTargetFileSize = 2 * 1048576;
+
+// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
+// stop building a single file in a level->level+1 compaction.
+static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize;
+
+static double MaxBytesForLevel(int level) {
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  double result = 10 * 1048576.0;  // Result for both level-0 and level-1
+  while (level > 1) {
+    result *= 10;
+    level--;
+  }
+  return result;
+}
+
+static uint64_t MaxFileSizeForLevel(int level) {
+  return kTargetFileSize;  // We could vary per level to reduce number of files?
+}
+
+namespace {
+std::string IntSetToString(const std::set<uint64_t>& s) {
+  std::string result = "{";
+  for (std::set<uint64_t>::const_iterator it = s.begin();
+       it != s.end();
+       ++it) {
+    result += (result.size() > 1) ? "," : "";
+    result += NumberToString(*it);
+  }
+  result += "}";
+  return result;
+}
+}
+
+Version::~Version() {
+  assert(refs_ == 0);
+  for (int level = 0; level < config::kNumLevels; level++) {
+    for (size_t i = 0; i < files_[level].size(); i++) {
+      FileMetaData* f = files_[level][i];
+      assert(f->refs >= 0);
+      f->refs--;
+      if (f->refs <= 0) {
+        delete f;
+      }
+    }
+  }
+  delete cleanup_mem_;
+}
+
+// An internal iterator.  For a given version/level pair, yields
+// information about the files in the level.  For a given entry, key()
+// is the largest key that occurs in the file, and value() is a
+// 16-byte value containing the file number and file size, both
+// encoded using EncodeFixed64.
+class Version::LevelFileNumIterator : public Iterator {
+ public:
+  LevelFileNumIterator(const Version* version,
+                       const std::vector<FileMetaData*>* flist)
+      : icmp_(version->vset_->icmp_.user_comparator()),
+        flist_(flist),
+        index_(flist->size()) {        // Marks as invalid
+  }
+  virtual bool Valid() const {
+    return index_ < flist_->size();
+  }
+  virtual void Seek(const Slice& target) {
+    uint32_t left = 0;
+    uint32_t right = flist_->size() - 1;
+    while (left < right) {
+      uint32_t mid = (left + right) / 2;
+      int cmp = icmp_.Compare((*flist_)[mid]->largest.Encode(), target);
+      if (cmp < 0) {
+        // Key at "mid.largest" is < than "target".  Therefore all
+        // files at or before "mid" are uninteresting.
+        left = mid + 1;
+      } else {
+        // Key at "mid.largest" is >= "target".  Therefore all files
+        // after "mid" are uninteresting.
+        right = mid;
+      }
+    }
+    index_ = left;
+  }
+  virtual void SeekToFirst() { index_ = 0; }
+  virtual void SeekToLast() {
+    index_ = flist_->empty() ? 0 : flist_->size() - 1;
+  }
+  virtual void Next() {
+    assert(Valid());
+    index_++;
+  }
+  virtual void Prev() {
+    assert(Valid());
+    if (index_ == 0) {
+      index_ = flist_->size();  // Marks as invalid
+    } else {
+      index_--;
+    }
+  }
+  Slice key() const {
+    assert(Valid());
+    return (*flist_)[index_]->largest.Encode();
+  }
+  Slice value() const {
+    assert(Valid());
+    EncodeFixed64(value_buf_, (*flist_)[index_]->number);
+    EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size);
+    return Slice(value_buf_, sizeof(value_buf_));
+  }
+  virtual Status status() const { return Status::OK(); }
+ private:
+  const InternalKeyComparator icmp_;
+  const std::vector<FileMetaData*>* const flist_;
+  uint32_t index_;
+
+  // Backing store for value().  Holds the file number and size.
+  mutable char value_buf_[16];
+};
+
+static Iterator* GetFileIterator(void* arg,
+                                 const ReadOptions& options,
+                                 const Slice& file_value) {
+  TableCache* cache = reinterpret_cast<TableCache*>(arg);
+  if (file_value.size() != 16) {
+    return NewErrorIterator(
+        Status::Corruption("FileReader invoked with unexpected value"));
+  } else {
+    return cache->NewIterator(options,
+                              DecodeFixed64(file_value.data()),
+                              DecodeFixed64(file_value.data() + 8));
+  }
+}
+
+Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
+                                            int level) const {
+  return NewTwoLevelIterator(
+      new LevelFileNumIterator(this, &files_[level]),
+      &GetFileIterator, vset_->table_cache_, options);
+}
+
+void Version::AddIterators(const ReadOptions& options,
+                           std::vector<Iterator*>* iters) {
+  // Merge all level zero files together since they may overlap
+  for (size_t i = 0; i < files_[0].size(); i++) {
+    iters->push_back(
+        vset_->table_cache_->NewIterator(
+            options, files_[0][i]->number, files_[0][i]->file_size));
+  }
+
+  // For levels > 0, we can use a concatenating iterator that sequentially
+  // walks through the non-overlapping files in the level, opening them
+  // lazily.
+  for (int level = 1; level < config::kNumLevels; level++) {
+    if (!files_[level].empty()) {
+      iters->push_back(NewConcatenatingIterator(options, level));
+    }
+  }
+}
+
+void Version::Ref() {
+  ++refs_;
+}
+
+void Version::Unref() {
+  assert(refs_ >= 1);
+  --refs_;
+  if (refs_ == 0) {
+    vset_->MaybeDeleteOldVersions();
+    // TODO: try to delete obsolete files
+  }
+}
+
+std::string Version::DebugString() const {
+  std::string r;
+  for (int level = 0; level < config::kNumLevels; level++) {
+    // E.g., level 1: 17:123['a' .. 'd'] 20:43['e' .. 'g']
+    r.append("level ");
+    AppendNumberTo(&r, level);
+    r.push_back(':');
+    const std::vector<FileMetaData*>& files = files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      r.push_back(' ');
+      AppendNumberTo(&r, files[i]->number);
+      r.push_back(':');
+      AppendNumberTo(&r, files[i]->file_size);
+      r.append("['");
+      AppendEscapedStringTo(&r, files[i]->smallest.Encode());
+      r.append("' .. '");
+      AppendEscapedStringTo(&r, files[i]->largest.Encode());
+      r.append("']");
+    }
+    r.push_back('\n');
+  }
+  return r;
+}
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionSet::Builder {
+ private:
+  typedef std::map<uint64_t, FileMetaData*> FileMap;
+  VersionSet* vset_;
+  FileMap files_[config::kNumLevels];
+
+ public:
+  // Initialize a builder with the files from *base and other info from *vset
+  Builder(VersionSet* vset, Version* base)
+      : vset_(vset) {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const std::vector<FileMetaData*>& files = base->files_[level];
+      for (size_t i = 0; i < files.size(); i++) {
+        FileMetaData* f = files[i];
+        f->refs++;
+        files_[level].insert(std::make_pair(f->number, f));
+      }
+    }
+  }
+
+  ~Builder() {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const FileMap& fmap = files_[level];
+      for (FileMap::const_iterator iter = fmap.begin();
+           iter != fmap.end();
+           ++iter) {
+        FileMetaData* f = iter->second;
+        f->refs--;
+        if (f->refs <= 0) {
+          delete f;
+        }
+      }
+    }
+  }
+
+  // Apply all of the edits in *edit to the current state.
+  void Apply(VersionEdit* edit) {
+    // Update compaction pointers
+    for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
+      const int level = edit->compact_pointers_[i].first;
+      vset_->compact_pointer_[level] =
+          edit->compact_pointers_[i].second.Encode().ToString();
+    }
+
+    // Delete files
+    const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
+    for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
+         iter != del.end();
+         ++iter) {
+      const int level = iter->first;
+      const uint64_t number = iter->second;
+      FileMap::iterator fiter = files_[level].find(number);
+      assert(fiter != files_[level].end());  // Sanity check for debug mode
+      if (fiter != files_[level].end()) {
+        FileMetaData* f = fiter->second;
+        f->refs--;
+        if (f->refs <= 0) {
+          delete f;
+        }
+        files_[level].erase(fiter);
+      }
+    }
+
+    // Add new files
+    for (size_t i = 0; i < edit->new_files_.size(); i++) {
+      const int level = edit->new_files_[i].first;
+      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
+      f->refs = 1;
+      assert(files_[level].count(f->number) == 0);
+      files_[level].insert(std::make_pair(f->number, f));
+    }
+  }
+
+  // Save the current state in *v.
+  void SaveTo(Version* v) {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const FileMap& fmap = files_[level];
+      for (FileMap::const_iterator iter = fmap.begin();
+           iter != fmap.end();
+           ++iter) {
+        FileMetaData* f = iter->second;
+        f->refs++;
+        v->files_[level].push_back(f);
+      }
+    }
+  }
+};
+
+VersionSet::VersionSet(const std::string& dbname,
+                       const Options* options,
+                       TableCache* table_cache,
+                       const InternalKeyComparator* cmp)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      table_cache_(table_cache),
+      icmp_(*cmp),
+      next_file_number_(2),
+      manifest_file_number_(0),  // Filled by Recover()
+      last_sequence_(0),
+      log_number_(0),
+      prev_log_number_(0),
+      descriptor_file_(NULL),
+      descriptor_log_(NULL),
+      current_(new Version(this)),
+      oldest_(current_) {
+}
+
+VersionSet::~VersionSet() {
+  for (Version* v = oldest_; v != NULL; ) {
+    Version* next = v->next_;
+    assert(v->refs_ == 0);
+    delete v;
+    v = next;
+  }
+  delete descriptor_log_;
+  delete descriptor_file_;
+}
+
+Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
+  if (edit->has_log_number_) {
+    assert(edit->log_number_ >= log_number_);
+    assert(edit->log_number_ < next_file_number_);
+  } else {
+    edit->SetLogNumber(log_number_);
+  }
+
+  if (!edit->has_prev_log_number_) {
+    edit->SetPrevLogNumber(prev_log_number_);
+  }
+
+  edit->SetNextFile(next_file_number_);
+  edit->SetLastSequence(last_sequence_);
+
+  Version* v = new Version(this);
+  {
+    Builder builder(this, current_);
+    builder.Apply(edit);
+    builder.SaveTo(v);
+  }
+
+  std::string new_manifest_file;
+  Status s = Finalize(v);
+
+  // Initialize new descriptor log file if necessary by creating
+  // a temporary file that contains a snapshot of the current version.
+  if (s.ok()) {
+    if (descriptor_log_ == NULL) {
+      assert(descriptor_file_ == NULL);
+      new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
+      edit->SetNextFile(next_file_number_);
+      s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
+      if (s.ok()) {
+        descriptor_log_ = new log::Writer(descriptor_file_);
+        s = WriteSnapshot(descriptor_log_);
+      }
+    }
+  }
+
+  // Write new record to MANIFEST log
+  if (s.ok()) {
+    std::string record;
+    edit->EncodeTo(&record);
+    s = descriptor_log_->AddRecord(record);
+    if (s.ok()) {
+      s = descriptor_file_->Sync();
+    }
+  }
+
+  // If we just created a new descriptor file, install it by writing a
+  // new CURRENT file that points to it.
+  if (s.ok() && !new_manifest_file.empty()) {
+    s = SetCurrentFile(env_, dbname_, manifest_file_number_);
+  }
+
+  // Install the new version
+  if (s.ok()) {
+    assert(current_->next_ == NULL);
+    assert(current_->cleanup_mem_ == NULL);
+    current_->cleanup_mem_ = cleanup_mem;
+    v->next_ = NULL;
+    current_->next_ = v;
+    current_ = v;
+    log_number_ = edit->log_number_;
+    prev_log_number_ = edit->prev_log_number_;
+  } else {
+    delete v;
+    if (!new_manifest_file.empty()) {
+      delete descriptor_log_;
+      delete descriptor_file_;
+      descriptor_log_ = NULL;
+      descriptor_file_ = NULL;
+      env_->DeleteFile(new_manifest_file);
+    }
+  }
+
+  return s;
+}
+
+Status VersionSet::Recover() {
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    virtual void Corruption(size_t bytes, const Status& s) {
+      if (this->status->ok()) *this->status = s;
+    }
+  };
+
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string current;
+  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
+  if (!s.ok()) {
+    return s;
+  }
+  if (current.empty() || current[current.size()-1] != '\n') {
+    return Status::Corruption("CURRENT file does not end with newline");
+  }
+  current.resize(current.size() - 1);
+
+  std::string dscname = dbname_ + "/" + current;
+  SequentialFile* file;
+  s = env_->NewSequentialFile(dscname, &file);
+  if (!s.ok()) {
+    return s;
+  }
+
+  bool have_log_number = false;
+  bool have_prev_log_number = false;
+  bool have_next_file = false;
+  bool have_last_sequence = false;
+  uint64_t next_file = 0;
+  uint64_t last_sequence = 0;
+  uint64_t log_number = 0;
+  uint64_t prev_log_number = 0;
+  Builder builder(this, current_);
+
+  {
+    LogReporter reporter;
+    reporter.status = &s;
+    log::Reader reader(file, &reporter, true/*checksum*/);
+    Slice record;
+    std::string scratch;
+    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+      VersionEdit edit;
+      s = edit.DecodeFrom(record);
+      if (s.ok()) {
+        if (edit.has_comparator_ &&
+            edit.comparator_ != icmp_.user_comparator()->Name()) {
+          s = Status::InvalidArgument(
+              edit.comparator_ + " does not match existing comparator ",
+              icmp_.user_comparator()->Name());
+        }
+      }
+
+      if (s.ok()) {
+        builder.Apply(&edit);
+      }
+
+      if (edit.has_log_number_) {
+        log_number = edit.log_number_;
+        have_log_number = true;
+      }
+
+      if (edit.has_prev_log_number_) {
+        prev_log_number = edit.prev_log_number_;
+        have_prev_log_number = true;
+      }
+
+      if (edit.has_next_file_number_) {
+        next_file = edit.next_file_number_;
+        have_next_file = true;
+      }
+
+      if (edit.has_last_sequence_) {
+        last_sequence = edit.last_sequence_;
+        have_last_sequence = true;
+      }
+    }
+  }
+  delete file;
+  file = NULL;
+
+  if (s.ok()) {
+    if (!have_next_file) {
+      s = Status::Corruption("no meta-nextfile entry in descriptor");
+    } else if (!have_log_number) {
+      s = Status::Corruption("no meta-lognumber entry in descriptor");
+    } else if (!have_last_sequence) {
+      s = Status::Corruption("no last-sequence-number entry in descriptor");
+    }
+
+    if (!have_prev_log_number) {
+      prev_log_number = 0;
+    }
+  }
+
+  if (s.ok()) {
+    Version* v = new Version(this);
+    builder.SaveTo(v);
+    s = Finalize(v);
+    if (!s.ok()) {
+      delete v;
+    } else {
+      // Install recovered version
+      v->next_ = NULL;
+      current_->next_ = v;
+      current_ = v;
+      manifest_file_number_ = next_file;
+      next_file_number_ = next_file + 1;
+      last_sequence_ = last_sequence;
+      log_number_ = log_number;
+      prev_log_number_ = prev_log_number;
+    }
+  }
+
+  return s;
+}
+
+static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  int64_t sum = 0;
+  for (size_t i = 0; i < files.size(); i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
+Status VersionSet::Finalize(Version* v) {
+  // Precomputed best level for next compaction
+  int best_level = -1;
+  double best_score = -1;
+
+  Status s;
+  for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) {
+    s = SortLevel(v, level);
+
+    double score;
+    if (level == 0) {
+      // We treat level-0 specially by bounding the number of files
+      // instead of number of bytes for two reasons:
+      //
+      // (1) With larger write-buffer sizes, it is nice not to do too
+      // many level-0 compactions.
+      //
+      // (2) The files in level-0 are merged on every read and
+      // therefore we wish to avoid too many files when the individual
+      // file size is small (perhaps because of a small write-buffer
+      // setting, or very high compression ratios, or lots of
+      // overwrites/deletions).
+      score = v->files_[level].size() / 4.0;
+    } else {
+      // Compute the ratio of current size to size limit.
+      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
+      score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
+    }
+
+    if (score > best_score) {
+      best_level = level;
+      best_score = score;
+    }
+  }
+
+  v->compaction_level_ = best_level;
+  v->compaction_score_ = best_score;
+  return s;
+}
+
+Status VersionSet::WriteSnapshot(log::Writer* log) {
+  // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+  // Save metadata
+  VersionEdit edit;
+  edit.SetComparatorName(icmp_.user_comparator()->Name());
+
+  // Save compaction pointers
+  for (int level = 0; level < config::kNumLevels; level++) {
+    if (!compact_pointer_[level].empty()) {
+      InternalKey key;
+      key.DecodeFrom(compact_pointer_[level]);
+      edit.SetCompactPointer(level, key);
+    }
+  }
+
+  // Save files
+  for (int level = 0; level < config::kNumLevels; level++) {
+    const std::vector<FileMetaData*>& files = current_->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      const FileMetaData* f = files[i];
+      edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
+    }
+  }
+
+  std::string record;
+  edit.EncodeTo(&record);
+  return log->AddRecord(record);
+}
+
+// Helper to sort by tables_[file_number].smallest
+struct VersionSet::BySmallestKey {
+  const InternalKeyComparator* internal_comparator;
+
+  bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+    return internal_comparator->Compare(f1->smallest, f2->smallest) < 0;
+  }
+};
+
+Status VersionSet::SortLevel(Version* v, uint64_t level) {
+  Status result;
+  BySmallestKey cmp;
+  cmp.internal_comparator = &icmp_;
+  std::sort(v->files_[level].begin(), v->files_[level].end(), cmp);
+
+  if (result.ok() && level > 0) {
+    // There should be no overlap
+    for (size_t i = 1; i < v->files_[level].size(); i++) {
+      const InternalKey& prev_end = v->files_[level][i-1]->largest;
+      const InternalKey& this_begin = v->files_[level][i]->smallest;
+      if (icmp_.Compare(prev_end, this_begin) >= 0) {
+        result = Status::Corruption(
+            "overlapping ranges in same level",
+            (EscapeString(prev_end.Encode()) + " vs. " +
+             EscapeString(this_begin.Encode())));
+        break;
+      }
+    }
+  }
+  return result;
+}
+
+int VersionSet::NumLevelFiles(int level) const {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  return current_->files_[level].size();
+}
+
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
+  uint64_t result = 0;
+  for (int level = 0; level < config::kNumLevels; level++) {
+    const std::vector<FileMetaData*>& files = v->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
+        // Entire file is before "ikey", so just add the file size
+        result += files[i]->file_size;
+      } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
+        // Entire file is after "ikey", so ignore
+        if (level > 0) {
+          // Files other than level 0 are sorted by meta->smallest, so
+          // no further files in this level will contain data for
+          // "ikey".
+          break;
+        }
+      } else {
+        // "ikey" falls in the range for this table.  Add the
+        // approximate offset of "ikey" within the table.
+        Table* tableptr;
+        Iterator* iter = table_cache_->NewIterator(
+            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
+        if (tableptr != NULL) {
+          result += tableptr->ApproximateOffsetOf(ikey.Encode());
+        }
+        delete iter;
+      }
+    }
+  }
+  return result;
+}
+
+void VersionSet::MaybeDeleteOldVersions() {
+  // Note: it is important to delete versions in order since a newer
+  // version with zero refs may be holding a pointer to a memtable
+  // that is used by somebody who has a ref on an older version.
+  while (oldest_ != current_ && oldest_->refs_ == 0) {
+    Version* next = oldest_->next_;
+    delete oldest_;
+    oldest_ = next;
+  }
+}
+
+void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
+  for (Version* v = oldest_; v != NULL; v = v->next_) {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const std::vector<FileMetaData*>& files = v->files_[level];
+      for (size_t i = 0; i < files.size(); i++) {
+        live->insert(files[i]->number);
+      }
+    }
+  }
+}
+
+int64_t VersionSet::NumLevelBytes(int level) const {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  return TotalFileSize(current_->files_[level]);
+}
+
+int64_t VersionSet::MaxNextLevelOverlappingBytes() {
+  int64_t result = 0;
+  std::vector<FileMetaData*> overlaps;
+  for (int level = 0; level < config::kNumLevels - 1; level++) {
+    for (size_t i = 0; i < current_->files_[level].size(); i++) {
+      const FileMetaData* f = current_->files_[level][i];
+      GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps);
+      const int64_t sum = TotalFileSize(overlaps);
+      if (sum > result) {
+        result = sum;
+      }
+    }
+  }
+  return result;
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+void VersionSet::GetOverlappingInputs(
+    int level,
+    const InternalKey& begin,
+    const InternalKey& end,
+    std::vector<FileMetaData*>* inputs) {
+  inputs->clear();
+  Slice user_begin = begin.user_key();
+  Slice user_end = end.user_key();
+  const Comparator* user_cmp = icmp_.user_comparator();
+  for (size_t i = 0; i < current_->files_[level].size(); i++) {
+    FileMetaData* f = current_->files_[level][i];
+    if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 ||
+        user_cmp->Compare(f->smallest.user_key(), user_end) > 0) {
+      // Either completely before or after range; skip it
+    } else {
+      inputs->push_back(f);
+    }
+  }
+}
+
+// Stores the minimal range that covers all entries in inputs in
+// *smallest, *largest.
+// REQUIRES: inputs is not empty +void VersionSet::GetRange(const std::vector& inputs, + InternalKey* smallest, + InternalKey* largest) { + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_.Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_.Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } +} + +// Stores the minimal range that covers all entries in inputs1 and inputs2 +// in *smallest, *largest. +// REQUIRES: inputs is not empty +void VersionSet::GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest) { + std::vector all = inputs1; + all.insert(all.end(), inputs2.begin(), inputs2.end()); + GetRange(all, smallest, largest); +} + +Iterator* VersionSet::MakeInputIterator(Compaction* c) { + ReadOptions options; + options.verify_checksums = options_->paranoid_checks; + options.fill_cache = false; + + // Level-0 files have to be merged together. For other levels, + // we will make a concatenating iterator per level. + // TODO(opt): use concatenating iterator for level-0 if there is no overlap + const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2); + Iterator** list = new Iterator*[space]; + int num = 0; + for (int which = 0; which < 2; which++) { + if (!c->inputs_[which].empty()) { + if (c->level() + which == 0) { + const std::vector& files = c->inputs_[which]; + for (size_t i = 0; i < files.size(); i++) { + list[num++] = table_cache_->NewIterator( + options, files[i]->number, files[i]->file_size); + } + } else { + // Create concatenating iterator for the files from this level + list[num++] = NewTwoLevelIterator( + new Version::LevelFileNumIterator( + c->input_version_, &c->inputs_[which]), + &GetFileIterator, table_cache_, options); + } + } + } + assert(num <= space); + Iterator* result = NewMergingIterator(&icmp_, list, num); + delete[] list; + return result; +} + +Compaction* VersionSet::PickCompaction() { + if (!NeedsCompaction()) { + return NULL; + } + const int level = current_->compaction_level_; + assert(level >= 0); + assert(level+1 < config::kNumLevels); + + Compaction* c = new Compaction(level); + c->input_version_ = current_; + c->input_version_->Ref(); + + // Pick the first file that comes after compact_pointer_[level] + for (size_t i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (compact_pointer_[level].empty() || + icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { + c->inputs_[0].push_back(f); + break; + } + } + if (c->inputs_[0].empty()) { + // Wrap-around to the beginning of the key space + c->inputs_[0].push_back(current_->files_[level][0]); + } + + // Files in level 0 may overlap each other, so pick up all overlapping ones + if (level == 0) { + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. 
+ GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); + assert(!c->inputs_[0].empty()); + } + + SetupOtherInputs(c); + + return c; +} + +void VersionSet::SetupOtherInputs(Compaction* c) { + const int level = c->level(); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + + GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); + + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + + // See if we can grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. + if (!c->inputs_[1].empty()) { + std::vector expanded0; + GetOverlappingInputs(level, all_start, all_limit, &expanded0); + if (expanded0.size() > c->inputs_[0].size()) { + InternalKey new_start, new_limit; + GetRange(expanded0, &new_start, &new_limit); + std::vector expanded1; + GetOverlappingInputs(level+1, new_start, new_limit, &expanded1); + if (expanded1.size() == c->inputs_[1].size()) { + Log(env_, options_->info_log, + "Expanding@%d %d+%d to %d+%d\n", + level, + int(c->inputs_[0].size()), + int(c->inputs_[1].size()), + int(expanded0.size()), + int(expanded1.size())); + smallest = new_start; + largest = new_limit; + c->inputs_[0] = expanded0; + c->inputs_[1] = expanded1; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + } + } + } + + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < config::kNumLevels) { + GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); + } + + if (false) { + Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", + level, + EscapeString(smallest.Encode()).c_str(), + EscapeString(largest.Encode()).c_str()); + } + + // Update the place where we will do the next compaction for this level. + // We update this immediately instead of waiting for the VersionEdit + // to be applied so that if the compaction fails, we will try a different + // key range next time. + compact_pointer_[level] = largest.Encode().ToString(); + c->edit_.SetCompactPointer(level, largest); +} + +Compaction* VersionSet::CompactRange( + int level, + const InternalKey& begin, + const InternalKey& end) { + std::vector inputs; + GetOverlappingInputs(level, begin, end, &inputs); + if (inputs.empty()) { + return NULL; + } + + Compaction* c = new Compaction(level); + c->input_version_ = current_; + c->input_version_->Ref(); + c->inputs_[0] = inputs; + SetupOtherInputs(c); + return c; +} + +Compaction::Compaction(int level) + : level_(level), + max_output_file_size_(MaxFileSizeForLevel(level)), + input_version_(NULL), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0) { + for (int i = 0; i < config::kNumLevels; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + if (input_version_ != NULL) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. 
+ return (num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const InternalKey& key) { + // Scan to find earliest grandparent file that contains key. + const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != NULL) { + input_version_->Unref(); + input_version_ = NULL; + } +} + +} diff --git a/leveldb/db/version_set.h b/leveldb/db/version_set.h new file mode 100644 index 0000000..e377513 --- /dev/null +++ b/leveldb/db/version_set.h @@ -0,0 +1,308 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The representation of a DBImpl consists of a set of Versions. The +// newest version is called "current". Older versions may be kept +// around to provide a consistent view to live iterators. +// +// Each Version keeps track of a set of Table files per level. The +// entire set of versions is maintained in a VersionSet. +// +// Version,VersionSet are thread-compatible, but require external +// synchronization on all accesses. + +#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ +#define STORAGE_LEVELDB_DB_VERSION_SET_H_ + +#include +#include +#include +#include "db/dbformat.h" +#include "db/version_edit.h" +#include "port/port.h" + +namespace leveldb { + +namespace log { class Writer; } + +class Compaction; +class Iterator; +class MemTable; +class TableBuilder; +class TableCache; +class Version; +class VersionSet; +class WritableFile; + +class Version { + public: + // Append to *iters a sequence of iterators that will + // yield the contents of this Version when merged together. 
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo) + void AddIterators(const ReadOptions&, std::vector* iters); + + // Reference count management (so Versions do not disappear out from + // under live iterators) + void Ref(); + void Unref(); + + // Return a human readable string that describes this version's contents. + std::string DebugString() const; + + private: + friend class Compaction; + friend class VersionSet; + + class LevelFileNumIterator; + Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; + + VersionSet* vset_; // VersionSet to which this Version belongs + Version* next_; // Next version in linked list + int refs_; // Number of live refs to this version + MemTable* cleanup_mem_; // NULL, or table to delete when version dropped + + // List of files per level + std::vector files_[config::kNumLevels]; + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by Finalize(). + double compaction_score_; + int compaction_level_; + + explicit Version(VersionSet* vset) + : vset_(vset), next_(NULL), refs_(0), + cleanup_mem_(NULL), + compaction_score_(-1), + compaction_level_(-1) { + } + + ~Version(); + + // No copying allowed + Version(const Version&); + void operator=(const Version&); +}; + +class VersionSet { + public: + VersionSet(const std::string& dbname, + const Options* options, + TableCache* table_cache, + const InternalKeyComparator*); + ~VersionSet(); + + // Apply *edit to the current version to form a new descriptor that + // is both saved to persistent state and installed as the new + // current version. Iff Apply() returns OK, arrange to delete + // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed + // by older versions. + Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem); + + // Recover the last saved descriptor from persistent storage. + Status Recover(); + + // Save current contents to *log + Status WriteSnapshot(log::Writer* log); + + // Return the current version. + Version* current() const { return current_; } + + // Return the current manifest file number + uint64_t ManifestFileNumber() const { return manifest_file_number_; } + + // Allocate and return a new file number + uint64_t NewFileNumber() { return next_file_number_++; } + + // Return the number of Table files at the specified level. + int NumLevelFiles(int level) const; + + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return the last sequence number. + uint64_t LastSequence() const { return last_sequence_; } + + // Set the last sequence number to s. + void SetLastSequence(uint64_t s) { + assert(s >= last_sequence_); + last_sequence_ = s; + } + + // Return the current log file number. + uint64_t LogNumber() const { return log_number_; } + + // Return the log file number for the log file that is currently + // being compacted, or zero if there is no such log file. + uint64_t PrevLogNumber() const { return prev_log_number_; } + + // Pick level and inputs for a new compaction. + // Returns NULL if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + Compaction* PickCompaction(); + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns NULL if there is nothing in that + // level that overlaps the specified range. 
Caller should delete + // the result. + Compaction* CompactRange( + int level, + const InternalKey& begin, + const InternalKey& end); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t MaxNextLevelOverlappingBytes(); + + // Create an iterator that reads over the compaction inputs for "*c". + // The caller should delete the iterator when no longer needed. + Iterator* MakeInputIterator(Compaction* c); + + // Returns true iff some level needs a compaction. + bool NeedsCompaction() const { return current_->compaction_score_ >= 1; } + + // Add all files listed in any live version to *live. + // May also mutate some internal state. + void AddLiveFiles(std::set* live); + + // Return the approximate offset in the database of the data for + // "key" as of version "v". + uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); + + private: + class Builder; + + friend class Compaction; + friend class Version; + + Status Finalize(Version* v); + + // Delete any old versions that are no longer needed. + void MaybeDeleteOldVersions(); + + struct BySmallestKey; + Status SortLevel(Version* v, uint64_t level); + + void GetOverlappingInputs( + int level, + const InternalKey& begin, + const InternalKey& end, + std::vector* inputs); + + void GetRange(const std::vector& inputs, + InternalKey* smallest, + InternalKey* largest); + + void GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest); + + void SetupOtherInputs(Compaction* c); + + Env* const env_; + const std::string dbname_; + const Options* const options_; + TableCache* const table_cache_; + const InternalKeyComparator icmp_; + uint64_t next_file_number_; + uint64_t manifest_file_number_; + uint64_t last_sequence_; + uint64_t log_number_; + uint64_t prev_log_number_; // 0 or backing store for memtable being compacted + + // Opened lazily + WritableFile* descriptor_file_; + log::Writer* descriptor_log_; + + // Versions are kept in a singly linked list that is never empty + Version* current_; // Pointer to the last (newest) list entry + Version* oldest_; // Pointer to the first (oldest) list entry + + // Per-level key at which the next compaction at that level should start. + // Either an empty string, or a valid InternalKey. + std::string compact_pointer_[config::kNumLevels]; + + // No copying allowed + VersionSet(const VersionSet&); + void operator=(const VersionSet&); +}; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // and "level+1" will be merged to produce a set of "level+1" files. + int level() const { return level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return &edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. 
+ void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "key". + bool ShouldStopBefore(const InternalKey& key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + private: + friend class Version; + friend class VersionSet; + + explicit Compaction(int level); + + int level_; + uint64_t max_output_file_size_; + Version* input_version_; + VersionEdit edit_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + int64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + + // State for implementing IsBaseLevelForKey + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). + size_t level_ptrs_[config::kNumLevels]; +}; + +} + +#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ diff --git a/leveldb/db/write_batch.cc b/leveldb/db/write_batch.cc new file mode 100644 index 0000000..d561528 --- /dev/null +++ b/leveldb/db/write_batch.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring | +// kTypeDeletion varstring +// varstring := +// len: varint32 +// data: uint8[len] + +#include "leveldb/write_batch.h" + +#include "leveldb/db.h" +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "util/coding.h" + +namespace leveldb { + +WriteBatch::WriteBatch() { + Clear(); +} + +WriteBatch::~WriteBatch() { } + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(12); +} + +int WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, int n) { + EncodeFixed32(&b->rep_[8], n); +} + +SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { + return SequenceNumber(DecodeFixed64(b->rep_.data())); +} + +void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { + EncodeFixed64(&b->rep_[0], seq); +} + +void WriteBatch::Put(const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeValue)); + PutLengthPrefixedSlice(&rep_, key); + PutLengthPrefixedSlice(&rep_, value); +} + +void WriteBatch::Delete(const Slice& key) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeDeletion)); + PutLengthPrefixedSlice(&rep_, key); +} + +Status WriteBatchInternal::InsertInto(const WriteBatch* b, + MemTable* memtable) { + const int count = WriteBatchInternal::Count(b); + int found = 0; + Iterator it(*b); + for (; !it.Done(); it.Next()) { + switch (it.op()) { + case kTypeDeletion: + memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice()); + break; + case kTypeValue: + memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); + break; + } + found++; + } + if (!it.status().ok()) { + return it.status(); + } else if (found != count) { + return Status::Corruption("wrong count in WriteBatch"); + } + return Status::OK(); +} + +void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { + assert(contents.size() >= 12); + b->rep_.assign(contents.data(), contents.size()); +} + +WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch) + : input_(WriteBatchInternal::Contents(&batch)), + done_(false) { + if (input_.size() < 12) { + done_ = true; + } else { + seq_ = WriteBatchInternal::Sequence(&batch), + input_.remove_prefix(12); + GetNextEntry(); + } +} + +void WriteBatchInternal::Iterator::Next() { + assert(!done_); + seq_++; + GetNextEntry(); +} + +void WriteBatchInternal::Iterator::GetNextEntry() { + if (input_.empty()) { + done_ = true; + return; + } + char tag = input_[0]; + input_.remove_prefix(1); + switch (tag) { + case kTypeValue: + if (GetLengthPrefixedSlice(&input_, &key_) && + GetLengthPrefixedSlice(&input_, &value_)) { + op_ = static_cast(tag); + } else { + status_ = Status::Corruption("bad WriteBatch Put"); + done_ = true; + input_.clear(); + } + break; + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input_, &key_)) { + op_ = kTypeDeletion; + } else { + status_ = Status::Corruption("bad WriteBatch Delete"); + done_ = true; + input_.clear(); + } + break; + default: + status_ = Status::Corruption("unknown WriteBatch tag"); + done_ = true; + input_.clear(); + break; + } +} + +} diff --git a/leveldb/db/write_batch_internal.h b/leveldb/db/write_batch_internal.h new file mode 100644 index 0000000..ab0a823 --- 
/dev/null +++ b/leveldb/db/write_batch_internal.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ +#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ + +#include "leveldb/write_batch.h" + +namespace leveldb { + +// WriteBatchInternal provides static methods for manipulating a +// WriteBatch that we don't want in the public WriteBatch interface. +class WriteBatchInternal { + public: + // Return the number of entries in the batch. + static int Count(const WriteBatch* batch); + + // Set the count for the number of entries in the batch. + static void SetCount(WriteBatch* batch, int n); + + // Return the seqeunce number for the start of this batch. + static SequenceNumber Sequence(const WriteBatch* batch); + + // Store the specified number as the seqeunce number for the start of + // this batch. + static void SetSequence(WriteBatch* batch, SequenceNumber seq); + + static Slice Contents(const WriteBatch* batch) { + return Slice(batch->rep_); + } + + static size_t ByteSize(const WriteBatch* batch) { + return batch->rep_.size(); + } + + static void SetContents(WriteBatch* batch, const Slice& contents); + + static Status InsertInto(const WriteBatch* batch, MemTable* memtable); + + // Iterate over the contents of a write batch. + class Iterator { + public: + explicit Iterator(const WriteBatch& batch); + bool Done() const { return done_; } + void Next(); + ValueType op() const { return op_; } + const Slice& key() const { return key_; } + const Slice& value() const { return value_; } + SequenceNumber sequence_number() const { return seq_; } + Status status() const { return status_; } + + private: + void GetNextEntry(); + + Slice input_; + bool done_; + ValueType op_; + Slice key_; + Slice value_; + SequenceNumber seq_; + Status status_; + }; +}; + +} + + +#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ diff --git a/leveldb/db/write_batch_test.cc b/leveldb/db/write_batch_test.cc new file mode 100644 index 0000000..2bf1134 --- /dev/null +++ b/leveldb/db/write_batch_test.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "leveldb/db.h" + +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "leveldb/env.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +static std::string PrintContents(WriteBatch* b) { + InternalKeyComparator cmp(BytewiseComparator()); + MemTable mem(cmp); + std::string state; + Status s = WriteBatchInternal::InsertInto(b, &mem); + Iterator* iter = mem.NewIterator(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + switch (ikey.type) { + case kTypeValue: + state.append("Put("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + break; + case kTypeDeletion: + state.append("Delete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + break; + } + state.append("@"); + state.append(NumberToString(ikey.sequence)); + } + delete iter; + if (!s.ok()) { + state.append("ParseError()"); + } + return state; +} + +class WriteBatchTest { }; + +TEST(WriteBatchTest, Empty) { + WriteBatch batch; + ASSERT_EQ("", PrintContents(&batch)); + ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); +} + +TEST(WriteBatchTest, Multiple) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + batch.Put(Slice("baz"), Slice("boo")); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("Put(baz, boo)@102" + "Delete(box)@101" + "Put(foo, bar)@100", + PrintContents(&batch)); +} + +TEST(WriteBatchTest, Corruption) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + WriteBatchInternal::SetSequence(&batch, 200); + Slice contents = WriteBatchInternal::Contents(&batch); + WriteBatchInternal::SetContents(&batch, + Slice(contents.data(),contents.size()-1)); + ASSERT_EQ("Put(foo, bar)@200" + "ParseError()", + PrintContents(&batch)); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/doc/doc.css b/leveldb/doc/doc.css new file mode 100644 index 0000000..700c564 --- /dev/null +++ b/leveldb/doc/doc.css @@ -0,0 +1,89 @@ +body { + margin-left: 0.5in; + margin-right: 0.5in; + background: white; + color: black; +} + +h1 { + margin-left: -0.2in; + font-size: 14pt; +} +h2 { + margin-left: -0in; + font-size: 12pt; +} +h3 { + margin-left: -0in; +} +h4 { + margin-left: -0in; +} +hr { + margin-left: -0in; +} + +/* Definition lists: definition term bold */ +dt { + font-weight: bold; +} + +address { + text-align: center; +} +code,samp,var { + color: blue; +} +kbd { + color: #600000; +} +div.note p { + float: right; + width: 3in; + margin-right: 0%; + padding: 1px; + border: 2px solid #6060a0; + background-color: #fffff0; +} + +ul { + margin-top: -0em; + margin-bottom: -0em; +} + +ol { + margin-top: -0em; + margin-bottom: -0em; +} + +UL.nobullets { + list-style-type: none; + list-style-image: none; + margin-left: -1em; +} + +p { + margin: 1em 0 1em 0; + padding: 0 0 0 0; +} + +pre { + line-height: 1.3em; + padding: 0.4em 0 0.8em 0; + margin: 0 0 0 0; + border: 0 0 0 0; + color: blue; +} + +.datatable { + margin-left: auto; + margin-right: auto; + margin-top: 2em; + margin-bottom: 2em; + border: 1px solid; +} + +.datatable td,th { + padding: 0 0.5em 0 0.5em; + text-align: right; +} diff --git a/leveldb/doc/impl.html b/leveldb/doc/impl.html new file mode 
100644 index 0000000..dd09fea --- /dev/null +++ b/leveldb/doc/impl.html @@ -0,0 +1,217 @@ + + + + +Leveldb file layout and compactions + + + + +

Files

+ +The implementation of leveldb is similar in spirit to the +representation of a single + +Bigtable tablet (section 5.3). +However the organization of the files that make up the representation +is somewhat different and is explained below. + +

+Each database is represented by a set of files stored in a directory.
+There are several different types of files, as documented below:

+

Log files

+

+A log file (*.log) stores a sequence of recent updates. Each update +is appended to the current log file. When the log file reaches a +pre-determined size (approximately 1MB by default), it is converted +to a sorted table (see below) and a new log file is created for future +updates. +

+A copy of the current log file is kept in an in-memory structure (the +memtable). This copy is consulted on every read so that read +operations reflect all logged updates. +
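In code terms, the write path this implies can be sketched as follows. This is an illustrative fragment assembled from the log::Writer, WriteBatch, and MemTable types in this checkin; it is not the actual DBImpl write path:

  // Sketch: apply one update by (1) appending it to the current log file
  // and (2) mirroring it into the memtable.
  Status ApplyUpdate(log::Writer* log, MemTable* mem, SequenceNumber seq,
                     const Slice& key, const Slice& value) {
    WriteBatch batch;                              // shared encoding for the
    batch.Put(key, value);                         // log record and memtable
    WriteBatchInternal::SetSequence(&batch, seq);
    Status s = log->AddRecord(WriteBatchInternal::Contents(&batch));
    if (s.ok()) {
      s = WriteBatchInternal::InsertInto(&batch, mem);
    }
    return s;
  }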

+

Sorted tables

+

+A sorted table (*.sst) stores a sequence of entries sorted by key. +Each entry is either a value for the key, or a deletion marker for the +key. (Deletion markers are kept around to hide obsolete values +present in older sorted tables). +

+The set of sorted tables is organized into a sequence of levels. The
+sorted table generated from a log file is placed in a special young
+level (also called level-0). When the number of young files exceeds a
+certain threshold (currently four), all of the young files are merged
+together with all of the overlapping level-1 files to produce a
+sequence of new level-1 files (we create a new level-1 file for every
+2MB of data).

+Files in the young level may contain overlapping keys. However, files
+in other levels have distinct non-overlapping key ranges. Consider
+level number L where L >= 1. When the combined size of files in
+level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
+...), one file in level-L and all of the overlapping files in
+level-(L+1) are merged to form a set of new files for level-(L+1).
+These merges have the effect of gradually migrating new updates from
+the young level to the largest level using only bulk reads and writes
+(i.e., minimizing expensive seeks).
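A minimal sketch of that size rule; the real limit is computed by MaxBytesForLevel() in db/version_set.cc, and the 10MB base below comes from the level-1 figure above:

  static double MaxBytesForLevelSketch(int level) {
    double result = 10 * 1048576.0;   // level-1 is allowed 10MB
    while (level > 1) {
      result *= 10;                   // each deeper level gets 10x more
      level--;
    }
    return result;                    // level-2 -> 100MB, level-3 -> 1000MB, ...
  }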

Manifest

+

+A MANIFEST file lists the set of sorted tables that make up each +level, the corresponding key ranges, and other important metadata. +A new MANIFEST file (with a new number embedded in the file name) +is created whenever the database is reopened. The MANIFEST file is +formatted as a log, and changes made to the serving state (as files +are added or removed) are appended to this log. +

+

Current

+

+CURRENT is a simple text file that contains the name of the latest +MANIFEST file. +

+

Info logs

+

+Informational messages are printed to files named LOG and LOG.old. +

+

Others

+

+Other files used for miscellaneous purposes may also be present +(LOCK, *.dbtmp). + +

Level 0

+When the log file grows above a certain size (1MB by default): +
    +
  • Write the contents of the current memtable to an sstable +
  • Replace the current memtable by a brand new empty memtable +
  • Switch to a new log file +
  • Delete the old log file and the old memtable +
+Experimental measurements show that generating an sstable from a 1MB +log file takes ~12ms, which seems like an acceptable latency hiccup to +add infrequently to a log write. + +

+The new sstable is added to the special young level (level-0). Level-0
+contains a set of files (up to four by default). However, unlike other
+levels, these files do not cover disjoint ranges, but may overlap each
+other.

Compactions

+ +

+When the size of level L exceeds its limit, we compact it in a +background thread. The compaction picks a file from level L and all +overlapping files from the next level L+1. Note that if a level-L +file overlaps only part of a level-(L+1) file, the entire file at +level-(L+1) is used as an input to the compaction and will be +discarded after the compaction. Aside: because level-0 is special +(files in it may overlap each other), we treat compactions from +level-0 to level-1 specially: a level-0 compaction may pick more than +one level-0 file in case some of these files overlap each other. + +

+A compaction merges the contents of the picked files to produce a
+sequence of level-(L+1) files. We switch to producing a new
+level-(L+1) file after the current output file has reached the target
+file size (2MB). We also switch to a new output file when the key
+range of the current output file has grown enough to overlap more than
+ten level-(L+2) files. This last rule ensures that a later compaction
+of a level-(L+1) file will not pick up too much data from level-(L+2).

+The old files are discarded and the new files are added to the serving +state. + +

+Compactions for a particular level rotate through the key space. In +more detail, for each level L, we remember the ending key of the last +compaction at level L. The next compaction for level L will pick the +first file that starts after this key (wrapping around to the +beginning of the key space if there is no such file). + +

+Compactions drop overwritten values. They also drop deletion markers +if there are no higher numbered levels that contain a file whose range +overlaps the current key. + +

Timing

+ +Level-0 compactions will read up to four 1MB files from level-0, and +at worst all the level-1 files (10MB). I.e., we will read 14MB and +write 14MB. + +

+Other than the special level-0 compactions, we will pick one 2MB file +from level L. In the worst case, this will overlap ~ 12 files from +level L+1 (10 because level-(L+1) is ten times the size of level-L, +and another two at the boundaries since the file ranges at level-L +will usually not be aligned with the file ranges at level-L+1). The +compaction will therefore read 26MB and write 26MB. Assuming a disk +IO rate of 100MB/s (ballpark range for modern drives), the worst +compaction cost will be approximately 0.5 second. + +
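Spelling out the arithmetic above as a small C++ fragment (every constant is an assumption taken from the surrounding text):

  const double kOutputFileMB = 2.0;        // compaction output file size
  const double kInputFiles = 1 + 10 + 2;   // picked file + 10x size ratio + 2 boundary files
  const double kReadMB = kOutputFileMB * kInputFiles;   // ~26MB read
  const double kWriteMB = kReadMB;                      // ~26MB written back
  const double kDiskRateMBps = 100.0;      // assumed sequential IO rate
  const double kSeconds = (kReadMB + kWriteMB) / kDiskRateMBps;   // ~0.52s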

+If we throttle the background writing to something small, say 10% of
+the full 100MB/s speed, a compaction may take up to 5 seconds. If the
+user is writing at 10MB/s, we might build up lots of level-0 files
+(~50 to hold the 5*10MB). This may significantly increase the cost of
+reads due to the overhead of merging more files together on every
+read.

+Solution 1: To reduce this problem, we might want to increase the log
+switching threshold when the number of level-0 files is large. The
+downside is that the larger this threshold, the larger the delay
+added to write latency when a write triggers a log switch.

+Solution 2: We might want to decrease write rate artificially when the +number of level-0 files goes up. + +

+Solution 3: We work on reducing the cost of very wide merges. +Perhaps most of the level-0 files will have their blocks sitting +uncompressed in the cache and we will only need to worry about the +O(N) complexity in the merging iterator. + +

Number of files

+ +Instead of always making 2MB files, we could make larger files for +larger levels to reduce the total file count, though at the expense of +more bursty compactions. Alternatively, we could shard the set of +files into multiple directories. + +

+An experiment on an ext3 filesystem on Feb 04, 2011 shows +the following timings to do 100K file opens in directories with +varying number of files: + + + + + +
Files in directory | Microseconds to open a file
              1000 | 9
             10000 | 10
            100000 | 16
+So maybe even the sharding is not necessary on modern filesystems? + +

Recovery

+ +
    +
  • Read CURRENT to find name of the latest committed MANIFEST (a code sketch of this step appears after this list)
  • Read the named MANIFEST file +
  • Clean up stale files +
  • We could open all sstables here, but it is probably better to be lazy... +
  • Convert log chunk to a new level-0 sstable +
  • Start directing new writes to a new log file with recovered sequence# +
+ +
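A minimal sketch of that first step, using the Env and filename helpers from this checkin (error handling elided; see VersionSet::Recover in db/version_set.cc for the real code):

  std::string current;
  Status s = ReadFileToString(env, CurrentFileName(dbname), &current);
  if (s.ok() && !current.empty() && current[current.size()-1] == '\n') {
    current.resize(current.size() - 1);        // strip trailing newline
    std::string manifest = dbname + "/" + current;
    // ... open "manifest" and replay the VersionEdit records it contains ...
  }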

Garbage collection of files

+ +DeleteObsoleteFiles() is called at the end of every +compaction and at the end of recovery. It finds the names of all +files in the database. It deletes all log files that are not the +current log file. It deletes all table files that are not referenced +from some level and are not the output of an active compaction. + + + diff --git a/leveldb/doc/index.html b/leveldb/doc/index.html new file mode 100644 index 0000000..c2312b7 --- /dev/null +++ b/leveldb/doc/index.html @@ -0,0 +1,498 @@ + + + + +Leveldb + + + +

Leveldb

+
Jeff Dean, Sanjay Ghemawat
+

+The leveldb library provides a persistent key value store. Keys and +values are arbitrary byte arrays. The keys are ordered within the key +value store according to a user-specified comparator function. + +

+

Opening A Database

+

+A leveldb database has a name which corresponds to a file system
+directory. All of the contents of the database are stored in this
+directory. The following example shows how to open a database,
+creating it if necessary:

+

+  #include <cassert>
+  #include "leveldb/include/db.h"
+
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+  assert(status.ok());
+  ...
+
+If you want to raise an error if the database already exists, add +the following line before the leveldb::DB::Open call: +
+  options.error_if_exists = true;
+
+

Status

+

+You may have noticed the leveldb::Status type above. Values of this +type are returned by most functions in leveldb that may encounter an +error. You can check if such a result is ok, and also print an +associated error message: +

+

+   leveldb::Status s = ...;
+   if (!s.ok()) cerr << s.ToString() << endl;
+
+

Closing A Database

+

+When you are done with a database, just delete the database object. +Example: +

+

+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+
+

Reads And Writes

+

+The database provides Put, Delete, and Get methods to +modify/query the database. For example, the following code +moves the value stored under key1 to key2. +

+  std::string value;
+  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+  if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
+  if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
+
+ +

Atomic Updates

+

+Note that if the process dies after the Put of key2 but before the +delete of key1, the same value may be left stored under multiple keys. +Such problems can be avoided by using the WriteBatch class to +atomically apply a set of updates: +

+

+  #include "leveldb/include/write_batch.h"
+  ...
+  std::string value;
+  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+  if (s.ok()) {
+    leveldb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db->Write(leveldb::WriteOptions(), &batch);
+  }
+
+The WriteBatch holds a sequence of edits to be made to the database, +and these edits within the batch are applied in order. Note that we +called Delete before Put so that if key1 is identical to key2, +we do not end up erroneously dropping the value entirely. +

+Apart from its atomicity benefits, WriteBatch may also be used to +speed up bulk updates by placing lots of individual mutations into the +same batch. + +

Synchronous Writes

+By default, each write to leveldb is asynchronous: it +returns after pushing the write from the process into the operating +system. The transfer from operating system memory to the underlying +persistent storage happens asynchronously. The sync flag +can be turned on for a particular write to make the write operation +not return until the data being written has been pushed all the way to +persistent storage. (On Posix systems, this is implemented by calling +either fsync(...) or fdatasync(...) or +msync(..., MS_SYNC) before the write operation returns.) +
+  leveldb::WriteOptions write_options;
+  write_options.sync = true;
+  db->Put(write_options, ...);
+
+Asynchronous writes are often more than a thousand times as fast as +synchronous writes. The downside of asynchronous writes is that a +crash of the machine may cause the last few updates to be lost. Note +that a crash of just the writing process (i.e., not a reboot) will not +cause any loss since even when sync is false, an update +is pushed from the process memory into the operating system before it +is considered done. + +

+Asynchronous writes can often be used safely. For example, when +loading a large amount of data into the database you can handle lost +updates by restarting the bulk load after a crash. A hybrid scheme is +also possible where every Nth write is synchronous, and in the event +of a crash, the bulk load is restarted just after the last synchronous +write finished by the previous run. (The synchronous write can update +a marker that describes where to restart on a crash.) + +
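A hedged sketch of that hybrid scheme; kSyncEvery and the records container are illustrative names, not part of the leveldb API:

  const size_t kSyncEvery = 1000;    // make every 1000th write synchronous
  for (size_t i = 0; i < records.size(); i++) {
    leveldb::WriteOptions options;
    options.sync = ((i + 1) % kSyncEvery == 0);
    leveldb::Status s = db->Put(options, records[i].key, records[i].value);
    if (!s.ok()) break;   // after a crash, restart just past the last synced write
  }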

+WriteBatch provides an alternative to asynchronous writes. +Multiple updates may be placed in the same WriteBatch and +applied together using a synchronous write (i.e., +write_options.sync is set to true). The extra cost of +the synchronous write will be amortized across all of the writes in +the batch. + +

+

Concurrency

+

+A database may only be opened by one process at a time. The leveldb +implementation acquires a lock from the operating system to prevent +misuse. Within a single process, the same leveldb::DB object may +be safely used by multiple concurrent threads. +

+

Iteration

+

+The following example demonstrates how to print all key,value pairs +in a database. +

+

+  leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+  delete it;
+
+The following variation shows how to process just the keys in the +range [start,limit): +

+

+  for (it->Seek(start);
+       it->Valid() && it->key().ToString() < limit;
+       it->Next()) {
+    ...
+  }
+
+You can also process entries in reverse order. (Caveat: reverse +iteration may be somewhat slower than forward iteration.) +

+

+  for (it->SeekToLast(); it->Valid(); it->Prev()) {
+    ...
+  }
+
+

Snapshots

+

+Snapshots provide consistent read-only views over the entire state of +the key-value store. ReadOptions::snapshot may be non-NULL to indicate +that a read should operate on a particular version of the DB state. +If ReadOptions::snapshot is NULL, the read will operate on an +implicit snapshot of the current state. +

+Snapshots typically are created by the DB::GetSnapshot() method: +

+

+  leveldb::ReadOptions options;
+  options.snapshot = db->GetSnapshot();
+  ... apply some updates to db ...
+  leveldb::Iterator* iter = db->NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db->ReleaseSnapshot(options.snapshot);
+
+Note that when a snapshot is no longer needed, it should be released +using the DB::ReleaseSnapshot interface. This allows the +implementation to get rid of state that was being maintained just to +support reading as of that snapshot. +

+A Write operation can also return a snapshot that +represents the state of the database just after applying a particular +set of updates: +

+

+  leveldb::Snapshot* snapshot;
+  leveldb::WriteOptions write_options;
+  write_options.post_write_snapshot = &snapshot;
+  leveldb::Status status = db->Write(write_options, ...);
+  ... perform other mutations to db ...
+
+  leveldb::ReadOptions read_options;
+  read_options.snapshot = snapshot;
+  leveldb::Iterator* iter = db->NewIterator(read_options);
+  ... read as of the state just after the Write call returned ...
+  delete iter;
+
+  db->ReleaseSnapshot(snapshot);
+
+

Slice

+

+The return value of the it->key() and it->value() calls above +are instances of the leveldb::Slice type. Slice is a simple +structure that contains a length and a pointer to an external byte +array. Returning a Slice is a cheaper alternative to returning a +std::string since we do not need to copy potentially large keys and +values. In addition, leveldb methods do not return null-terminated +C-style strings since leveldb keys and values are allowed to +contain '\0' bytes. +

+C++ strings and null-terminated C-style strings can be easily converted +to a Slice: +

+

+   leveldb::Slice s1 = "hello";
+
+   std::string str("world");
+   leveldb::Slice s2 = str;
+
+A Slice can be easily converted back to a C++ string: +
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+
+Be careful when using Slices since it is up to the caller to ensure that +the external byte array into which the Slice points remains live while +the Slice is in use. For example, the following is buggy: +

+

+   leveldb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+
+When the if statement goes out of scope, str will be destroyed and the +backing storage for slice will disappear. +

+

Comparators

+

+The preceding examples used the default ordering function for keys,
+which orders bytes lexicographically. You can, however, supply a custom
+comparator when opening a database. For example, suppose each
+database key consists of two numbers where we sort by the first
+number, breaking ties by the second number. First, define a proper
+subclass of leveldb::Comparator that expresses these rules:

+

+  class TwoPartComparator : public leveldb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a < b: negative result
+    //   if a > b: positive result
+    //   else: zero result
+    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &a1, &a2);
+      ParseKey(b, &b1, &b2);
+      if (a1 < b1) return -1;
+      if (a1 > b1) return +1;
+      if (a2 < b2) return -1;
+      if (a2 > b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() const { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+
+Now create a database using this custom comparator: +

+

+  TwoPartComparator cmp;
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &cmp;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+  ...
+
+

Backwards compatibility

+

+The result of the comparator's Name method is attached to the +database when it is created, and is checked on every subsequent +database open. If the name changes, the leveldb::DB::Open call will +fail. Therefore, change the name if and only if the new key format +and comparison function are incompatible with existing databases, and +it is ok to discard the contents of all existing databases. +

+You can, however, still gradually evolve your key format over time with
+a little bit of pre-planning. For example, you could store a version
+number at the end of each key (one byte should suffice for most uses).
+When you wish to switch to a new key format (e.g., adding an optional
+third part to the keys processed by TwoPartComparator),
+(a) keep the same comparator name, (b) increment the version number
+for new keys, and (c) change the comparator function so it uses the
+version numbers found in the keys to decide how to interpret them.
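A hedged sketch of that evolution; the trailing version byte and the Parse helper are illustrative assumptions, not leveldb API. Note that Name() deliberately stays the same:

  class EvolvingTwoPartComparator : public leveldb::Comparator {
   public:
    const char* Name() const { return "TwoPartComparator"; }  // unchanged on purpose
    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
      int a1, a2, a3, b1, b2, b3;
      Parse(a, &a1, &a2, &a3);
      Parse(b, &b1, &b2, &b3);
      if (a1 != b1) return a1 < b1 ? -1 : +1;
      if (a2 != b2) return a2 < b2 ? -1 : +1;
      if (a3 != b3) return a3 < b3 ? -1 : +1;
      return 0;
    }
    void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
    void FindShortSuccessor(std::string*) const { }

   private:
    // Hypothetical decoding: the last byte of every key is its format
    // version; version-1 keys have two parts, version-2 keys add a third.
    static void Parse(const leveldb::Slice& s, int* p1, int* p2, int* p3) {
      unsigned char version = s[s.size() - 1];
      std::string body(s.data(), s.size() - 1);
      *p3 = 0;                       // default for old (two-part) keys
      if (version >= 2) {
        sscanf(body.c_str(), "%d:%d:%d", p1, p2, p3);
      } else {
        sscanf(body.c_str(), "%d:%d", p1, p2);
      }
    }
  };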

+

Performance

+

+Performance can be tuned by changing the default values of the +types defined in leveldb/include/options.h. + +

+

Block size

+

+leveldb groups adjacent keys together into the same block and such a +block is the unit of transfer to and from persistent storage. The +default block size is approximately 4096 uncompressed bytes. +Applications that mostly do bulk scans over the contents of the +database may wish to increase this size. Applications that do a lot +of point reads of small values may wish to switch to a smaller block +size if performance measurements indicate an improvement. There isn't +much benefit in using blocks smaller than one kilobyte, or larger than +a few megabytes. Also note that compression will be more effective +with larger block sizes. +
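For example (options.block_size lives in leveldb/include/options.h; the 64KB value is just an assumed choice for a scan-heavy workload):

  leveldb::Options options;
  options.block_size = 64 * 1024;   // bigger blocks than the ~4K default
  options.create_if_missing = true;
  leveldb::DB* db;
  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);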

+

Compression

+

+Each block is individually compressed before being written to +persistent storage. Compression is on by default since the default +compression method is very fast, and is automatically disabled for +uncompressible data. In rare cases, applications may want to disable +compression entirely, but should only do so if benchmarks show a +performance improvement: +

+

+  leveldb::Options options;
+  options.compression = leveldb::kNoCompression;
+  ... leveldb::DB::Open(options, name, ...) ....
+
+

Cache

+

+The contents of the database are stored in a set of files in the +filesystem and each file stores a sequence of compressed blocks. If +options.cache is non-NULL, it is used to cache frequently used +uncompressed block contents. +

+

+  #include "leveldb/include/cache.h"
+
+  leveldb::Options options;
+  options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
+  leveldb::DB* db;
+  leveldb::DB::Open(options, name, &db);
+  ... use the db ...
+  delete db;
+  delete options.cache;
+
+Note that the cache holds uncompressed data, and therefore it should +be sized according to application level data sizes, without any +reduction from compression. (Caching of compressed blocks is left to +the operating system buffer cache, or any custom Env +implementation provided by the client.) +

+When performing a bulk read, the application may wish to disable +caching so that the data processed by the bulk read does not end up +displacing most of the cached contents. A per-iterator option can be +used to achieve this: +

+

+  leveldb::ReadOptions options;
+  options.fill_cache = false;
+  leveldb::Iterator* it = db->NewIterator(options);
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ...
+  }
+
+

Key Layout

+

+Note that the unit of disk transfer and caching is a block. Adjacent +keys (according to the database sort order) will usually be placed in +the same block. Therefore the application can improve its performance +by placing keys that are accessed together near each other and placing +infrequently used keys in a separate region of the key space. +

+For example, suppose we are implementing a simple file system on top +of leveldb. The types of entries we might wish to store are: +

+

+   filename -> permission-bits, length, list of file_block_ids
+   file_block_id -> data
+
+We might want to prefix filename keys with one letter (say '/') and the +file_block_id keys with a different letter (say '0') so that scans +over just the metadata do not force us to fetch and cache bulky file +contents. +

+

Checksums

+

+leveldb associates checksums with all data it stores in the file system. +There are two separate controls provided over how aggressively these +checksums are verified: +

+

    +
  • ReadOptions::verify_checksums may be set to true to force + checksum verification of all data that is read from the file system on + behalf of a particular read. By default, no such verification is + done. +

    +

  • Options::paranoid_checks may be set to true before opening a + database to make the database implementation raise an error as soon as + it detects an internal corruption. Depending on which portion of the + database has been corrupted, the error may be raised when the database + is opened, or later by another database operation. By default, + paranoid checking is off so that the database can be used even if + parts of its persistent storage have been corrupted. +

    + If a database is corrupted (perhaps it cannot be opened when
+ paranoid checking is turned on), the leveldb::RepairDB function
+ may be used to recover as much of the data as possible. A combined
+ example of both checksum options appears after this list.

    +

+

Approximate Sizes

+

+The GetApproximateSizes method can be used to get the approximate
+number of bytes of file system space used by one or more key ranges.

+

+   leveldb::Range ranges[2];
+   ranges[0] = leveldb::Range("a", "c");
+   ranges[1] = leveldb::Range("x", "z");
+   uint64_t sizes[2];
+   leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
+
+The preceding call will set sizes[0] to the approximate number of +bytes of file system space used by the key range [a..c) and +sizes[1] to the approximate number of bytes used by the key range +[x..z). +

+

Environment

+

+All file operations (and other operating system calls) issued by the +leveldb implementation are routed through a leveldb::Env object. +Sophisticated clients may wish to provide their own Env +implementation to get better control. For example, an application may +introduce artificial delays in the file IO paths to limit the impact +of leveldb on other activities in the system. +

+

+  class SlowEnv : public leveldb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  leveldb::Options options;
+  options.env = &env;
+  Status s = leveldb::DB::Open(options, ...);
+
+

Porting

+

+leveldb may be ported to a new platform by providing platform +specific implementations of the types/methods/functions exported by +leveldb/port/port.h. See leveldb/port/port_example.h for more +details. +

+In addition, the new platform may need a new default leveldb::Env
+implementation. See leveldb/util/env_posix.cc for an example.
+
+
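+If the new platform is close enough to an existing one that most
+operations can be borrowed, one option is to subclass the EnvWrapper
+helper declared in include/leveldb/env.h, which forwards every call to
+a wrapped Env. A sketch (the class name and the choice of overridden
+method are illustrative):
+
+  class MyPortEnv : public leveldb::EnvWrapper {
+   public:
+    MyPortEnv() : leveldb::EnvWrapper(leveldb::Env::Default()) { }
+    // Override only the calls that need platform-specific behavior.
+    virtual leveldb::Status NewWritableFile(const std::string& fname,
+                                            leveldb::WritableFile** result) {
+      // ... platform-specific handling could go here ...
+      return target()->NewWritableFile(fname, result);
+    }
+  };
+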

Other Information

+ +

+Details about the leveldb implementation may be found in +the following documents: +

+  • Implementation notes: doc/impl.html
+  • Format of an immutable Table file: doc/table_format.txt
+  • Format of a log file: doc/log_format.txt
diff --git a/leveldb/doc/log_format.txt b/leveldb/doc/log_format.txt
new file mode 100644
index 0000000..3a0414b
--- /dev/null
+++ b/leveldb/doc/log_format.txt
@@ -0,0 +1,75 @@
+The log file contents are a sequence of 32KB blocks.  The only
+exception is that the tail of the file may contain a partial block.
+
+Each block consists of a sequence of records:
+   block := record* trailer?
+   record :=
+      checksum: uint32    // crc32c of type and data[]
+      length: uint16
+      type: uint8         // One of FULL, FIRST, MIDDLE, LAST
+      data: uint8[length]
+
+A record never starts within the last six bytes of a block (since it
+won't fit).  Any leftover bytes here form the trailer, which must
+consist entirely of zero bytes and must be skipped by readers.
+
+Aside: if exactly seven bytes are left in the current block, and a new
+non-zero length record is added, the writer must emit a FIRST record
+(which contains zero bytes of user data) to fill up the trailing seven
+bytes of the block and then emit all of the user data in subsequent
+blocks.
+
+More types may be added in the future.  Some readers may skip record
+types they do not understand; others may report that some data was
+skipped.
+
+FULL == 1
+FIRST == 2
+MIDDLE == 3
+LAST == 4
+
+The FULL record contains the contents of an entire user record.
+
+FIRST, MIDDLE, LAST are types used for user records that have been
+split into multiple fragments (typically because of block boundaries).
+FIRST is the type of the first fragment of a user record, LAST is the
+type of the last fragment of a user record, and MIDDLE is the type of
+all interior fragments of a user record.
+
+Example: consider a sequence of user records:
+   A: length 1000
+   B: length 97270
+   C: length 8000
+A will be stored as a FULL record in the first block.
+
+B will be split into three fragments: the first fragment occupies the
+rest of the first block, the second fragment occupies the entirety of
+the second block, and the third fragment occupies a prefix of the
+third block.  This will leave six bytes free in the third block, which
+will be left empty as the trailer.
+
+C will be stored as a FULL record in the fourth block.
+
+===================
+
+Some benefits over the recordio format:
+
+(1) We do not need any heuristics for resyncing - just go to the next
+block boundary and scan.  If there is a corruption, skip to the next
+block.  As a side-benefit, we do not get confused when part of the
+contents of one log file are embedded as a record inside another log
+file.
+
+(2) Splitting at approximate boundaries (e.g., for mapreduce) is
+simple: find the next block boundary and skip records until we
+hit a FULL or FIRST record.
+
+(3) We do not need extra buffering for large records.
+
+Some downsides compared to the recordio format:
+
+(1) No packing of tiny records.  This could be fixed by adding a new
+record type, so it is a shortcoming of the current implementation,
+not necessarily the format.
+
+(2) No compression.  Again, this could be fixed by adding new record types.
diff --git a/leveldb/doc/table_format.txt b/leveldb/doc/table_format.txt
new file mode 100644
index 0000000..ad5aa4b
--- /dev/null
+++ b/leveldb/doc/table_format.txt
@@ -0,0 +1,61 @@
+File format
+===========
+
+  <beginning_of_file>
+  [data block 1]
+  [data block 2]
+  ...
+  [data block N]
+  [meta block 1]
+  ...
+  [meta block K]
+  [metaindex block]
+  [index block]
+  [Footer]        (fixed size; starts at file_size - sizeof(Footer))
+  <end_of_file>
+
+The file contains internal pointers.
+Each such pointer is called a BlockHandle and contains the following
+information:
+   offset:   varint64
+   size:     varint64
+
+(1) The sequence of key/value pairs in the file is stored in sorted
+order and partitioned into a sequence of data blocks.  These blocks
+come one after another at the beginning of the file.  Each data block
+is formatted according to the code in block_builder.cc, and then
+optionally compressed.
+
+(2) After the data blocks we store a bunch of meta blocks.  The
+supported meta block types are described below.  More meta block types
+may be added in the future.  Each meta block is again formatted using
+block_builder.cc and then optionally compressed.
+
+(3) A "metaindex" block.  It contains one entry for every other meta
+block, where the key is the name of the meta block and the value is a
+BlockHandle pointing to that meta block.
+
+(4) An "index" block.  This block contains one entry per data block,
+where the key is a string >= the last key in that data block and
+before the first key in the successive data block.  The value is the
+BlockHandle for the data block.
+
+(5) At the very end of the file is a fixed length footer that contains
+the BlockHandle of the metaindex and index blocks as well as a magic number.
+     metaindex_handle: char[p];       // Block handle for metaindex
+     index_handle:     char[q];       // Block handle for index
+     padding:          char[40-p-q];  // 0 bytes to make fixed length
+                                      // (40==2*BlockHandle::kMaxEncodedLength)
+     magic:            fixed64;       // == 0xdb4775248b80fb57
+
+"stats" Meta Block
+------------------
+
+This meta block contains a bunch of stats.  The key is the name
+of the statistic.  The value contains the statistic.
+TODO(postrelease): record the following stats.
+  data size
+  index size
+  key size (uncompressed)
+  value size (uncompressed)
+  number of entries
+  number of data blocks
diff --git a/leveldb/include/leveldb/cache.h b/leveldb/include/leveldb/cache.h
new file mode 100644
index 0000000..79196d1
--- /dev/null
+++ b/leveldb/include/leveldb/cache.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_
+#define STORAGE_LEVELDB_INCLUDE_CACHE_H_
+
+#include <stdint.h>
+#include "leveldb/slice.h"
+
+namespace leveldb {
+
+class Cache;
+
+// Create a new cache with a fixed size capacity.  This implementation
+// of Cache uses a least-recently-used eviction policy.
+extern Cache* NewLRUCache(size_t capacity);
+
+class Cache {
+ public:
+  Cache() { }
+
+  // Destroys all existing entries by calling the "deleter"
+  // function that was passed to Insert() for each entry.
+  virtual ~Cache();
+
+  // Opaque handle to an entry stored in the cache.
+ struct Handle { }; + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // + // Returns a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) = 0; + + // If the cache has no mapping for "key", returns NULL. + // + // Else return a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + virtual Handle* Lookup(const Slice& key) = 0; + + // Release a mapping returned by a previous Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void Release(Handle* handle) = 0; + + // Return the value encapsulated in a handle returned by a + // successful Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void* Value(Handle* handle) = 0; + + // If the cache contains entry for key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + + // Return a new numeric id. May be used by multiple clients who are + // sharing the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. + virtual uint64_t NewId() = 0; + + private: + void LRU_Remove(Handle* e); + void LRU_Append(Handle* e); + void Unref(Handle* e); + + struct Rep; + Rep* rep_; + + // No copying allowed + Cache(const Cache&); + void operator=(const Cache&); +}; + +} + +#endif // STORAGE_LEVELDB_UTIL_CACHE_H_ diff --git a/leveldb/include/leveldb/comparator.h b/leveldb/include/leveldb/comparator.h new file mode 100644 index 0000000..4e00e4d --- /dev/null +++ b/leveldb/include/leveldb/comparator.h @@ -0,0 +1,61 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ +#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ + +#include + +namespace leveldb { + +class Slice; + +// A Comparator object provides a total order across slices that are +// used as keys in an sstable or a database. +class Comparator { + public: + virtual ~Comparator(); + + // Three-way comparison. Returns value: + // < 0 iff "a" < "b", + // == 0 iff "a" == "b", + // > 0 iff "a" > "b" + virtual int Compare(const Slice& a, const Slice& b) const = 0; + + // The name of the comparator. Used to check for comparator + // mismatches (i.e., a DB created with one comparator is + // accessed using a different comparator. + // + // The client of this package should switch to a new name whenever + // the comparator implementation changes in a way that will cause + // the relative ordering of any two keys to change. + // + // Names starting with "leveldb." are reserved and should not be used + // by any clients of this package. 
+ virtual const char* Name() const = 0; + + // Advanced functions: these are used to reduce the space requirements + // for internal data structures like index blocks. + + // If *start < limit, changes *start to a short string in [start,limit). + // Simple comparator implementations may return with *start unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const = 0; + + // Changes *key to a short string >= *key. + // Simple comparator implementations may return with *key unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortSuccessor(std::string* key) const = 0; +}; + +// Return a builtin comparator that uses lexicographic byte-wise +// ordering. The result remains the property of this module and +// must not be deleted. +extern const Comparator* BytewiseComparator(); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ diff --git a/leveldb/include/leveldb/db.h b/leveldb/include/leveldb/db.h new file mode 100644 index 0000000..f18ded3 --- /dev/null +++ b/leveldb/include/leveldb/db.h @@ -0,0 +1,142 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_ +#define STORAGE_LEVELDB_INCLUDE_DB_H_ + +#include +#include +#include "leveldb/iterator.h" +#include "leveldb/options.h" + +namespace leveldb { + +static const int kMajorVersion = 1; +static const int kMinorVersion = 1; + +struct Options; +struct ReadOptions; +struct WriteOptions; + +class Snapshot; +class WriteBatch; + +// Some internal types. Clients should ignore. +class WriteBatchInternal; + +struct Range { + Slice start; + Slice limit; + + Range(const Slice& s, const Slice& l) : start(s), limit(l) { } +}; + +// A DB is a persistent ordered map from keys to values. +class DB { + public: + // Open the database with the specified "name". + // Stores a pointer to a heap-allocated database in *dbptr and returns + // OK on success. + // Stores NULL in *dbptr and returns a non-OK status on error. + // Caller should delete *dbptr when it is no longer needed. + static Status Open(const Options& options, + const std::string& name, + DB** dbptr); + + DB() { } + virtual ~DB(); + + // Set the database entry for "key" to "value". Returns OK on success, + // and a non-OK status on error. + // Note: consider setting options.sync = true. + virtual Status Put(const WriteOptions& options, + const Slice& key, + const Slice& value) = 0; + + // Remove the database entry (if any) for "key". Returns OK on + // success, and a non-OK status on error. It is not an error if "key" + // did not exist in the database. + // Note: consider setting options.sync = true. + virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; + + // Apply the specified updates to the database. + // Returns OK on success, non-OK on failure. + // Note: consider setting options.sync = true. + virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; + + // If the database contains an entry for "key" store the + // corresponding value in *value and return OK. + // + // If there is no entry for "key" leave *value unchanged and return + // a status for which Status::IsNotFound() returns true. + // + // May return some other Status on an error. 
+ virtual Status Get(const ReadOptions& options, + const Slice& key, std::string* value) = 0; + + // Return a heap-allocated iterator over the contents of the database. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // + // Caller should delete the iterator when it is no longer needed. + // The returned iterator should be deleted before this db is deleted. + virtual Iterator* NewIterator(const ReadOptions& options) = 0; + + // Return a handle to the current DB state. Iterators created with + // this handle will all observe a stable snapshot of the current DB + // state. The caller must call ReleaseSnapshot(result) when the + // snapshot is no longer needed. + virtual const Snapshot* GetSnapshot() = 0; + + // Release a previously acquired snapshot. The caller must not + // use "snapshot" after this call. + virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; + + // DB implementations can export properties about their state + // via this method. If "property" is a valid property understood by this + // DB implementation, fills "*value" with its current value and returns + // true. Otherwise returns false. + // + // + // Valid property names include: + // + // "leveldb.num-files-at-level" - return the number of files at level , + // where is an ASCII representation of a level number (e.g. "0"). + // "leveldb.stats" - returns a multi-line string that describes statistics + // about the internal operation of the DB. + virtual bool GetProperty(const Slice& property, std::string* value) = 0; + + // For each i in [0,n-1], store in "sizes[i]", the approximate + // file system space used by keys in "[range[i].start .. range[i].limit)". + // + // Note that the returned sizes measure file system space usage, so + // if the user data compresses by a factor of ten, the returned + // sizes will be one-tenth the size of the corresponding user data size. + // + // The results may not include the sizes of recently written data. + virtual void GetApproximateSizes(const Range* range, int n, + uint64_t* sizes) = 0; + + // Possible extensions: + // (1) Add a method to compact a range of keys + + private: + // No copying allowed + DB(const DB&); + void operator=(const DB&); +}; + +// Destroy the contents of the specified database. +// Be very careful using this method. +Status DestroyDB(const std::string& name, const Options& options); + +// If a DB cannot be opened, you may attempt to call this method to +// resurrect as much of the contents of the database as possible. +// Some data may be lost, so be careful when calling this function +// on a database that contains important information. +Status RepairDB(const std::string& dbname, const Options& options); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_DB_H_ diff --git a/leveldb/include/leveldb/env.h b/leveldb/include/leveldb/env.h new file mode 100644 index 0000000..4b6e712 --- /dev/null +++ b/leveldb/include/leveldb/env.h @@ -0,0 +1,290 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An Env is an interface used by the leveldb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. 
+ +#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ +#define STORAGE_LEVELDB_INCLUDE_ENV_H_ + +#include +#include +#include +#include +#include "leveldb/status.h" + +namespace leveldb { + +class FileLock; +class RandomAccessFile; +class SequentialFile; +class Slice; +class WritableFile; + +class Env { + public: + Env() { } + virtual ~Env(); + + // Return a default environment suitable for the current operating + // system. Sophisticated users may wish to provide their own Env + // implementation instead of relying on this default environment. + // + // The result of Default() belongs to leveldb and must never be deleted. + static Env* Default(); + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores NULL in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result) = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores NULL in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) = 0; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores NULL in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) = 0; + + // Returns true iff the named file exists. + virtual bool FileExists(const std::string& fname) = 0; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + virtual Status GetChildren(const std::string& dir, + std::vector* result) = 0; + + // Delete the named file. + virtual Status DeleteFile(const std::string& fname) = 0; + + // Create the specified directory. + virtual Status CreateDir(const std::string& dirname) = 0; + + // Delete the specified directory. + virtual Status DeleteDir(const std::string& dirname) = 0; + + // Store the size of fname in *file_size. + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0; + + // Rename file src to target. + virtual Status RenameFile(const std::string& src, + const std::string& target) = 0; + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores NULL in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. 
+ virtual Status LockFile(const std::string& fname, FileLock** lock) = 0; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + virtual Status UnlockFile(FileLock* lock) = 0; + + // Arrange to run "(*function)(arg)" once in a background thread. + // + // "function" may run in an unspecified thread. Multiple functions + // added to the same Env may run concurrently in different threads. + // I.e., the caller may not assume that background work items are + // serialized. + virtual void Schedule( + void (*function)(void* arg), + void* arg) = 0; + + // Start a new thread, invoking "function(arg)" within the new thread. + // When "function(arg)" returns, the thread will be destroyed. + virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + + // *path is set to a temporary directory that can be used for testing. It may + // or many not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + virtual Status GetTestDirectory(std::string* path) = 0; + + // Write an entry to the log file with the specified format. + virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0; + + // Returns the number of micro-seconds since some fixed point in time. Only + // useful for computing deltas of time. + virtual uint64_t NowMicros() = 0; + + // Sleep/delay the thread for the perscribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + private: + // No copying allowed + Env(const Env&); + void operator=(const Env&); +}; + +// A file abstraction for reading sequentially through a file +class SequentialFile { + public: + SequentialFile() { } + virtual ~SequentialFile(); + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + virtual Status Read(size_t n, Slice* result, char* scratch) = 0; +}; + +// A file abstraction for randomly reading the contents of a file. +class RandomAccessFile { + public: + RandomAccessFile() { } + virtual ~RandomAccessFile(); + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). If an error was encountered, returns a + // non-OK status. + // + // Safe for concurrent use by multiple threads. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class WritableFile { + public: + WritableFile() { } + virtual ~WritableFile(); + + virtual Status Append(const Slice& data) = 0; + virtual Status Close() = 0; + virtual Status Flush() = 0; + virtual Status Sync() = 0; + + private: + // No copying allowed + WritableFile(const WritableFile&); + void operator=(const WritableFile&); +}; + +// Identifies a locked file. 
+class FileLock { + public: + FileLock() { } + virtual ~FileLock(); + private: + // No copying allowed + FileLock(const FileLock&); + void operator=(const FileLock&); +}; + +// Log the specified data to *info_log if info_log is non-NULL. +extern void Log(Env* env, WritableFile* info_log, const char* format, ...) +# if defined(__GNUC__) || defined(__clang__) + __attribute__((__format__ (__printf__, 3, 4))) +# endif + ; + +// A utility routine: write "data" to the named file. +extern Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname); + +// A utility routine: read contents of named file into *data +extern Status ReadFileToString(Env* env, const std::string& fname, + std::string* data); + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. +class EnvWrapper : public Env { + public: + // Initialize an EnvWrapper that delegates all calls to *target + explicit EnvWrapper(Env* target) : target_(target) { } + virtual ~EnvWrapper(); + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, SequentialFile** r) { + return target_->NewSequentialFile(f, r); + } + Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { + return target_->NewRandomAccessFile(f, r); + } + Status NewWritableFile(const std::string& f, WritableFile** r) { + return target_->NewWritableFile(f, r); + } + bool FileExists(const std::string& f) { return target_->FileExists(f); } + Status GetChildren(const std::string& dir, std::vector* r) { + return target_->GetChildren(dir, r); + } + Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); } + Status CreateDir(const std::string& d) { return target_->CreateDir(d); } + Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); } + Status GetFileSize(const std::string& f, uint64_t* s) { + return target_->GetFileSize(f, s); + } + Status RenameFile(const std::string& s, const std::string& t) { + return target_->RenameFile(s, t); + } + Status LockFile(const std::string& f, FileLock** l) { + return target_->LockFile(f, l); + } + Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); } + void Schedule(void (*f)(void*), void* a) { + return target_->Schedule(f, a); + } + void StartThread(void (*f)(void*), void* a) { + return target_->StartThread(f, a); + } + virtual Status GetTestDirectory(std::string* path) { + return target_->GetTestDirectory(path); + } + virtual void Logv(WritableFile* log, const char* format, va_list ap) { + return target_->Logv(log, format, ap); + } + uint64_t NowMicros() { + return target_->NowMicros(); + } + void SleepForMicroseconds(int micros) { + target_->SleepForMicroseconds(micros); + } + private: + Env* target_; +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_ diff --git a/leveldb/include/leveldb/iterator.h b/leveldb/include/leveldb/iterator.h new file mode 100644 index 0000000..1866fb5 --- /dev/null +++ b/leveldb/include/leveldb/iterator.h @@ -0,0 +1,95 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An iterator yields a sequence of key/value pairs from a source. 
+// The following class defines the interface. Multiple implementations +// are provided by this library. In particular, iterators are provided +// to access the contents of a Table or a DB. + +#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ +#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ + +#include "leveldb/slice.h" +#include "leveldb/status.h" + +namespace leveldb { + +class Iterator { + public: + Iterator(); + virtual ~Iterator(); + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. + virtual void SeekToLast() = 0; + + // Position at the first key in the source that at or past target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + virtual void Seek(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the previous entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the first entry in source. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: !AtEnd() && !AtStart() + virtual Slice value() const = 0; + + // If an error has occurred, return it. Else return an ok status. + virtual Status status() const = 0; + + // Clients are allowed to register function/arg1/arg2 triples that + // will be invoked when this iterator is destroyed. + // + // Note that unlike all of the preceding methods, this method is + // not abstract and therefore clients should not override it. + typedef void (*CleanupFunction)(void* arg1, void* arg2); + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + + private: + struct Cleanup { + CleanupFunction function; + void* arg1; + void* arg2; + Cleanup* next; + }; + Cleanup cleanup_; + + // No copying allowed + Iterator(const Iterator&); + void operator=(const Iterator&); +}; + +// Return an empty iterator (yields nothing). +extern Iterator* NewEmptyIterator(); + +// Return an empty iterator with the specified status. +extern Iterator* NewErrorIterator(const Status& status); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ diff --git a/leveldb/include/leveldb/options.h b/leveldb/include/leveldb/options.h new file mode 100644 index 0000000..a94651f --- /dev/null +++ b/leveldb/include/leveldb/options.h @@ -0,0 +1,198 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ +#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ + +#include + +namespace leveldb { + +class Cache; +class Comparator; +class Env; +class Snapshot; +class WritableFile; + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. +enum CompressionType { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. + kNoCompression = 0x0, + kSnappyCompression = 0x1, +}; + +// Options to control the behavior of a database (passed to DB::Open) +struct Options { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // Default: false + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be to written to info_log if it is non-NULL, or to a file stored + // in the same directory as the DB contents if info_log is NULL. + // Default: NULL + WritableFile* info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to two write buffers may be held in memory at the same time, + // so you may wish to adjust this parameter to control memory usage. + // + // Default: 4MB + size_t write_buffer_size; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). + + // If non-NULL, use the specified cache for blocks. + // If NULL, leveldb will automatically create and use an 8MB internal cache. + // Default: NULL + Cache* block_cache; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. 
+ // + // Default: 4K + size_t block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // Create an Options object with default values for all fields. + Options(); +}; + +// Options that control read operations +struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: false + bool verify_checksums; + + // Should the data read for this iteration be cached in memory? + // Callers may wish to set this field to false for bulk scans. + // Default: true + bool fill_cache; + + // If "snapshot" is non-NULL, read as of the supplied snapshot + // (which must belong to the DB that is being read and which must + // not have been released). If "snapshot" is NULL, use an impliicit + // snapshot of the state at the beginning of this read operation. + // Default: NULL + const Snapshot* snapshot; + + ReadOptions() + : verify_checksums(false), + fill_cache(true), + snapshot(NULL) { + } +}; + +// Options that control write operations +struct WriteOptions { + // If true, the write will be flushed from the operating system + // buffer cache (by calling WritableFile::Sync()) before the write + // is considered complete. If this flag is true, writes will be + // slower. + // + // If this flag is false, and the machine crashes, some recent + // writes may be lost. Note that if it is just the process that + // crashes (i.e., the machine does not reboot), no writes will be + // lost even if sync==false. + // + // In other words, a DB write with sync==false has similar + // crash semantics as the "write()" system call. A DB write + // with sync==true has similar crash semantics to a "write()" + // system call followed by "fsync()". + // + // Default: false + bool sync; + + // If "post_write_snapshot" is non-NULL, and the write succeeds, + // *post_write_snapshot will be modified to point to a snapshot of + // the DB state immediately after this write. The caller must call + // DB::ReleaseSnapshot(*post_write_snapshotsnapshot) when the + // snapshot is no longer needed. + // + // If "post_write_snapshot" is non-NULL, and the write fails, + // *post_write_snapshot will be set to NULL. + // + // Default: NULL + const Snapshot** post_write_snapshot; + + WriteOptions() + : sync(false), + post_write_snapshot(NULL) { + } +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ diff --git a/leveldb/include/leveldb/slice.h b/leveldb/include/leveldb/slice.h new file mode 100644 index 0000000..62cb894 --- /dev/null +++ b/leveldb/include/leveldb/slice.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. + +#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ +#define STORAGE_LEVELDB_INCLUDE_SLICE_H_ + +#include +#include +#include +#include + +namespace leveldb { + +class Slice { + public: + // Create an empty slice. + Slice() : data_(""), size_(0) { } + + // Create a slice that refers to data[0,n-1]. + Slice(const char* data, size_t n) : data_(data), size_(n) { } + + // Create a slice that refers to the contents of "s" + Slice(const std::string& s) : data_(s.data()), size_(s.size()) { } + + // Create a slice that refers to s[0,strlen(s)-1] + Slice(const char* s) : data_(s), size_(strlen(s)) { } + + // Return a pointer to the beginning of the referenced data + const char* data() const { return data_; } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + char operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { data_ = ""; size_ = 0; } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + // Return a string that contains the copy of the referenced data. + std::string ToString() const { return std::string(data_, size_); } + + // Three-way comparison. Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_, x.data_, x.size_) == 0)); + } + + private: + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (memcmp(x.data(), y.data(), x.size()) == 0)); +} + +inline bool operator!=(const Slice& x, const Slice& y) { + return !(x == y); +} + +inline int Slice::compare(const Slice& b) const { + const int min_len = (size_ < b.size_) ? size_ : b.size_; + int r = memcmp(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) r = -1; + else if (size_ > b.size_) r = +1; + } + return r; +} + +} + + +#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_ diff --git a/leveldb/include/leveldb/status.h b/leveldb/include/leveldb/status.h new file mode 100644 index 0000000..47e3edf --- /dev/null +++ b/leveldb/include/leveldb/status.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. 
+ +#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ +#define STORAGE_LEVELDB_INCLUDE_STATUS_H_ + +#include +#include +#include "leveldb/slice.h" + +namespace leveldb { + +class Status { + public: + // Create a success status. + Status() : state_(NULL) { } + ~Status() { delete state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. + static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotFound, msg, Slice()); + } + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kCorruption, msg, msg2); + } + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotSupported, msg, msg2); + } + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, msg, msg2); + } + static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, msg, msg2); + } + + // Returns true iff the status indicates success. + bool ok() const { return (state_ == NULL); } + + // Returns true iff the status indicates a NotFound error. + bool IsNotFound() const { return code() == kNotFound; } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + private: + enum Code { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5, + }; + Code code() const { return (state_ == NULL) ? kOk : state_->first; } + + Status(Code code, const Slice& msg, const Slice& msg2); + + typedef std::pair State; + State* state_; +}; + +inline Status::Status(const Status& s) { + state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); +} +inline void Status::operator=(const Status& s) { + if (this != &s) { + delete state_; + state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); + } +} + +} + +#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_ diff --git a/leveldb/include/leveldb/table.h b/leveldb/include/leveldb/table.h new file mode 100644 index 0000000..bd99176 --- /dev/null +++ b/leveldb/include/leveldb/table.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_ +#define STORAGE_LEVELDB_INCLUDE_TABLE_H_ + +#include +#include "leveldb/iterator.h" + +namespace leveldb { + +class Block; +class BlockHandle; +struct Options; +class RandomAccessFile; +struct ReadOptions; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. +class Table { + public: + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table" to the newly opened + // table. The client should delete "*table" when no longer needed. + // If there was an error while initializing the table, sets "*table" + // to NULL and returns a non-ok status. Does not take ownership of + // "*source", but the client must ensure that "source" remains live + // for the duration of the returned table's lifetime. + // + // *file must remain live while this Table is in use. 
+ static Status Open(const Options& options, + RandomAccessFile* file, + uint64_t file_size, + Table** table); + + ~Table(); + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + Iterator* NewIterator(const ReadOptions&) const; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + uint64_t ApproximateOffsetOf(const Slice& key) const; + + private: + struct Rep; + Rep* rep_; + + explicit Table(Rep* rep) { rep_ = rep; } + static Iterator* BlockReader(void*, const ReadOptions&, const Slice&); + + // No copying allowed + Table(const Table&); + void operator=(const Table&); +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_TABLE_H_ diff --git a/leveldb/include/leveldb/table_builder.h b/leveldb/include/leveldb/table_builder.h new file mode 100644 index 0000000..49d2d51 --- /dev/null +++ b/leveldb/include/leveldb/table_builder.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). + +#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ +#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ + +#include +#include "leveldb/options.h" +#include "leveldb/status.h" + +namespace leveldb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; + +class TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). + TableBuilder(const Options& options, WritableFile* file); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~TableBuilder(); + + // Change the options used by this builder. Note: only some of the + // option fields can be changed after construction. If a field is + // not allowed to change dynamically and its value in the structure + // passed to the constructor is different from its value in the + // structure passed to this method, this method will return an error + // without changing any fields. + Status ChangeOptions(const Options& options); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value); + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // Return non-ok iff some error has been detected. + Status status() const; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. 
+ // REQUIRES: Finish(), Abandon() have not been called + Status Finish(); + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon(); + + // Number of calls to Add() so far. + uint64_t NumEntries() const; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const; + + private: + bool ok() const { return status().ok(); } + void WriteBlock(BlockBuilder* block, BlockHandle* handle); + + struct Rep; + Rep* rep_; + + // No copying allowed + TableBuilder(const TableBuilder&); + void operator=(const TableBuilder&); +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ diff --git a/leveldb/include/leveldb/write_batch.h b/leveldb/include/leveldb/write_batch.h new file mode 100644 index 0000000..3411952 --- /dev/null +++ b/leveldb/include/leveldb/write_batch.h @@ -0,0 +1,49 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); + +#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ +#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ + +#include + +namespace leveldb { + +class Slice; + +class WriteBatch { + public: + WriteBatch(); + ~WriteBatch(); + + // Store the mapping "key->value" in the database. + void Put(const Slice& key, const Slice& value); + + // If the database contains a mapping for "key", erase it. Else do nothing. + void Delete(const Slice& key); + + // Clear all updates buffered in this batch. + void Clear(); + + private: + friend class WriteBatchInternal; + + std::string rep_; // See comment in write_batch.cc for the format of rep_ + + // Intentionally copyable +}; + +} + +#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ diff --git a/leveldb/leveldb.gyp b/leveldb/leveldb.gyp new file mode 100644 index 0000000..20d1b1d --- /dev/null +++ b/leveldb/leveldb.gyp @@ -0,0 +1,315 @@ +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. + +{ + 'variables': { + 'use_snappy%': 0, + }, + 'target_defaults': { + 'defines': [ + 'LEVELDB_PLATFORM_CHROMIUM=1', + ], + 'include_dirs': [ + '.', + 'include/', + ], + 'conditions': [ + ['OS == "win"', { + 'include_dirs': [ + 'port/win', + ], + }], + ['use_snappy', { + 'defines': [ + 'USE_SNAPPY=1', + ], + }], + ], + }, + 'targets': [ + { + 'target_name': 'leveldb', + 'type': '<(library)', + 'dependencies': [ + # The base libary is a lightweight abstraction layer for things like + # threads and IO. 
http://src.chromium.org/viewvc/chrome/trunk/src/base/ + '../../base/base.gyp:base', + ], + 'conditions': [ + ['use_snappy', { + 'dependencies': [ + '../../third_party/snappy/snappy.gyp:snappy', + ], + }], + ], + 'direct_dependent_settings': { + 'include_dirs': [ + 'include/', + ], + }, + 'sources': [ + # Include and then exclude so that all files show up in IDEs, even if + # they don't build. + 'db/builder.cc', + 'db/builder.h', + 'db/db_impl.cc', + 'db/db_impl.h', + 'db/db_iter.cc', + 'db/db_iter.h', + 'db/filename.cc', + 'db/filename.h', + 'db/dbformat.cc', + 'db/dbformat.h', + 'db/log_format.h', + 'db/log_reader.cc', + 'db/log_reader.h', + 'db/log_writer.cc', + 'db/log_writer.h', + 'db/memtable.cc', + 'db/memtable.h', + 'db/repair.cc', + 'db/skiplist.h', + 'db/snapshot.h', + 'db/table_cache.cc', + 'db/table_cache.h', + 'db/version_edit.cc', + 'db/version_edit.h', + 'db/version_set.cc', + 'db/version_set.h', + 'db/write_batch.cc', + 'db/write_batch_internal.h', + 'include/leveldb/cache.h', + 'include/leveldb/comparator.h', + 'include/leveldb/db.h', + 'include/leveldb/env.h', + 'include/leveldb/iterator.h', + 'include/leveldb/options.h', + 'include/leveldb/slice.h', + 'include/leveldb/status.h', + 'include/leveldb/table.h', + 'include/leveldb/table_builder.h', + 'include/leveldb/write_batch.h', + 'port/port.h', + 'port/port_chromium.cc', + 'port/port_chromium.h', + 'port/port_example.h', + 'port/port_posix.cc', + 'port/port_posix.h', + 'table/block.cc', + 'table/block.h', + 'table/block_builder.cc', + 'table/block_builder.h', + 'table/format.cc', + 'table/format.h', + 'table/iterator.cc', + 'table/iterator_wrapper.h', + 'table/merger.cc', + 'table/merger.h', + 'table/table.cc', + 'table/table_builder.cc', + 'table/two_level_iterator.cc', + 'table/two_level_iterator.h', + 'util/arena.cc', + 'util/arena.h', + 'util/cache.cc', + 'util/coding.cc', + 'util/coding.h', + 'util/comparator.cc', + 'util/crc32c.cc', + 'util/crc32c.h', + 'util/env.cc', + 'util/env_chromium.cc', + 'util/env_posix.cc', + 'util/hash.cc', + 'util/hash.h', + 'util/logging.cc', + 'util/logging.h', + 'util/mutexlock.h', + 'util/options.cc', + 'util/random.h', + 'util/status.cc', + ], + 'sources/': [ + ['exclude', '_(android|example|portable|posix)\\.cc$'], + ], + }, + { + 'target_name': 'leveldb_testutil', + 'type': '<(library)', + 'dependencies': [ + '../../base/base.gyp:base', + 'leveldb', + ], + 'export_dependent_settings': [ + # The tests use include directories from these projects. 
+ '../../base/base.gyp:base', + 'leveldb', + ], + 'sources': [ + 'util/histogram.cc', + 'util/histogram.h', + 'util/testharness.cc', + 'util/testharness.h', + 'util/testutil.cc', + 'util/testutil.h', + ], + }, + { + 'target_name': 'leveldb_arena_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/arena_test.cc', + ], + }, + { + 'target_name': 'leveldb_cache_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/cache_test.cc', + ], + }, + { + 'target_name': 'leveldb_coding_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/coding_test.cc', + ], + }, + { + 'target_name': 'leveldb_corruption_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/corruption_test.cc', + ], + }, + { + 'target_name': 'leveldb_crc32c_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/crc32c_test.cc', + ], + }, + { + 'target_name': 'leveldb_db_bench', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/db_bench.cc', + ], + }, + { + 'target_name': 'leveldb_db_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/db_test.cc', + ], + }, + { + 'target_name': 'leveldb_dbformat_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/dbformat_test.cc', + ], + }, + { + 'target_name': 'leveldb_env_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/env_test.cc', + ], + }, + { + 'target_name': 'leveldb_filename_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/filename_test.cc', + ], + }, + { + 'target_name': 'leveldb_log_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/log_test.cc', + ], + }, + { + 'target_name': 'leveldb_skiplist_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/skiplist_test.cc', + ], + }, + { + 'target_name': 'leveldb_table_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'table/table_test.cc', + ], + }, + { + 'target_name': 'leveldb_version_edit_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/version_edit_test.cc', + ], + }, + { + 'target_name': 'leveldb_write_batch_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/write_batch_test.cc', + ], + }, + ], +} + +# Local Variables: +# tab-width:2 +# indent-tabs-mode:nil +# End: +# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/leveldb/port/README b/leveldb/port/README new file mode 100644 index 0000000..422563e --- /dev/null +++ b/leveldb/port/README @@ -0,0 +1,10 @@ +This directory contains interfaces and implementations that isolate the +rest of the package from platform details. + +Code in the rest of the package includes "port.h" from this directory. +"port.h" in turn includes a platform specific "port_.h" file +that provides the platform specific implementation. + +See port_posix.h for an example of what must be provided in a platform +specific header file. 
+
diff --git a/leveldb/port/port.h b/leveldb/port/port.h
new file mode 100644
index 0000000..816826b
--- /dev/null
+++ b/leveldb/port/port.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_H_
+#define STORAGE_LEVELDB_PORT_PORT_H_
+
+#include <string.h>
+
+// Include the appropriate platform specific file below. If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+#if defined(LEVELDB_PLATFORM_POSIX)
+# include "port/port_posix.h"
+#elif defined(LEVELDB_PLATFORM_CHROMIUM)
+# include "port/port_chromium.h"
+#elif defined(LEVELDB_PLATFORM_ANDROID)
+# include "port/port_android.h"
+#endif
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_H_
diff --git a/leveldb/port/port_android.cc b/leveldb/port/port_android.cc
new file mode 100644
index 0000000..240e9ca
--- /dev/null
+++ b/leveldb/port/port_android.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/port_android.h"
+
+#include <cstdlib>
+
+extern "C" {
+size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) {
+  return fread(a, b, c, d);
+}
+
+size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) {
+  return fwrite(a, b, c, d);
+}
+
+int fflush_unlocked(FILE *f) {
+  return fflush(f);
+}
+
+int fdatasync(int fd) {
+  return fsync(fd);
+}
+}
+
+namespace leveldb {
+namespace port {
+
+static void PthreadCall(const char* label, int result) {
+  if (result != 0) {
+    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+    abort();
+  }
+}
+
+Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }
+Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
+void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
+void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
+
+CondVar::CondVar(Mutex* mu)
+    : mu_(mu) {
+  PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
+}
+
+CondVar::~CondVar() {
+  PthreadCall("destroy cv", pthread_cond_destroy(&cv_));
+}
+
+void CondVar::Wait() {
+  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
+}
+
+void CondVar::Signal() {
+  PthreadCall("signal", pthread_cond_signal(&cv_));
+}
+
+void CondVar::SignalAll() {
+  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
+}
+
+}
+}
diff --git a/leveldb/port/port_android.h b/leveldb/port/port_android.h
new file mode 100644
index 0000000..13df9c9
--- /dev/null
+++ b/leveldb/port/port_android.h
@@ -0,0 +1,150 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
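+// As a rough usage sketch (not itself part of the port interface), the
+// Mutex/CondVar pair declared below supports the usual monitor idiom;
+// util/mutexlock.h wraps Lock()/Unlock() in a scoped MutexLock guard:
+//
+//   port::Mutex mu;
+//   port::CondVar cv(&mu);
+//   bool ready = false;
+//
+//   // signaling thread:          // waiting thread:
+//   mu.Lock();                    mu.Lock();
+//   ready = true;                 while (!ready) cv.Wait();
+//   cv.Signal();                  mu.Unlock();
+//   mu.Unlock();
+//
+// Wait() atomically releases the mutex while blocked and re-acquires it
+// before returning, so the while-loop re-check is safe.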
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
+#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
+
+#include <endian.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <string>
+
+// Collapse the plethora of ARM flavors available to an easier to manage set
+// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto
+#if defined(__ARM_ARCH_6__) || \
+    defined(__ARM_ARCH_6J__) || \
+    defined(__ARM_ARCH_6K__) || \
+    defined(__ARM_ARCH_6Z__) || \
+    defined(__ARM_ARCH_6T2__) || \
+    defined(__ARM_ARCH_6ZK__) || \
+    defined(__ARM_ARCH_7__) || \
+    defined(__ARM_ARCH_7R__) || \
+    defined(__ARM_ARCH_7A__)
+#define ARMV6_OR_7 1
+#endif
+
+extern "C" {
+  size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d);
+  size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d);
+  int fflush_unlocked(FILE *f);
+  int fdatasync(int fd);
+}
+
+namespace leveldb {
+namespace port {
+
+static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN;
+
+class CondVar;
+
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+
+  void Lock();
+  void Unlock();
+  void AssertHeld() {
+    //TODO(gabor): How can I implement this?
+  }
+
+ private:
+  friend class CondVar;
+  pthread_mutex_t mu_;
+
+  // No copying
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  void Signal();
+  void SignalAll();
+ private:
+  Mutex* mu_;
+  pthread_cond_t cv_;
+};
+
+#ifndef ARMV6_OR_7
+// On ARM chipsets <V6, 0xffff0fa0 is the hard coded address of a
+// memory barrier function provided by the kernel.
+typedef void (*LinuxKernelMemoryBarrierFunc)(void);
+LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier __attribute__((weak)) =
+    (LinuxKernelMemoryBarrierFunc) 0xffff0fa0;
+#endif
+
+// Storage for a lock-free pointer
+class AtomicPointer {
+ private:
+  void* rep_;
+
+  inline void MemoryBarrier() const {
+    // On ARM chipsets >= V6 the dmb instruction is available directly;
+    // on older chipsets we call through the kernel helper above.
+#ifdef ARMV6_OR_7
+    __asm__ __volatile__("dmb" : : : "memory");
+#else
+    pLinuxKernelMemoryBarrier();
+#endif
+  }
+
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* v) : rep_(v) { }
+  inline void* Acquire_Load() const {
+    void* r = rep_;
+    MemoryBarrier();
+    return r;
+  }
+  inline void Release_Store(void* v) {
+    MemoryBarrier();
+    rep_ = v;
+  }
+  inline void* NoBarrier_Load() const {
+    void* r = rep_;
+    return r;
+  }
+  inline void NoBarrier_Store(void* v) {
+    rep_ = v;
+  }
+};
+
+// TODO(gabor): Implement compress
+inline bool Snappy_Compress(
+    const char* input,
+    size_t input_length,
+    std::string* output) {
+  return false;
+}
+
+// TODO(gabor): Implement uncompress
+inline bool Snappy_Uncompress(
+    const char* input_data,
+    size_t input_length,
+    std::string* output) {
+  return false;
+}
+
+inline uint64_t ThreadIdentifier() {
+  pthread_t tid = pthread_self();
+  uint64_t r = 0;
+  memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? sizeof(r) : sizeof(tid));
+  return r;
+}
+
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+  return false;
+}
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
diff --git a/leveldb/port/port_chromium.cc b/leveldb/port/port_chromium.cc
new file mode 100644
index 0000000..2ab49b9
--- /dev/null
+++ b/leveldb/port/port_chromium.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
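+// The Snappy_* functions below follow the contract spelled out in
+// port_example.h: returning false means "not supported" or "invalid
+// input", so callers must be prepared to fall back to raw bytes. A
+// caller-side sketch (table_builder.cc's WriteBlock does essentially
+// this, requiring at least a 12.5% size reduction):
+//
+//   std::string compressed;
+//   Slice contents;
+//   if (port::Snappy_Compress(raw.data(), raw.size(), &compressed) &&
+//       compressed.size() < raw.size()) {  // keep only if it shrank
+//     contents = compressed;
+//   } else {
+//     contents = raw;                      // store uncompressed
+//   }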
+ +#include "port/port_chromium.h" + +#include "util/logging.h" + +#if defined(USE_SNAPPY) +# include "third_party/snappy/src/snappy.h" +#endif + +namespace leveldb { +namespace port { + +Mutex::Mutex() { +} + +Mutex::~Mutex() { +} + +void Mutex::Lock() { + mu_.Acquire(); +} + +void Mutex::Unlock() { + mu_.Release(); +} + +void Mutex::AssertHeld() { + mu_.AssertAcquired(); +} + +CondVar::CondVar(Mutex* mu) + : cv_(&mu->mu_) { +} + +CondVar::~CondVar() { } + +void CondVar::Wait() { + cv_.Wait(); +} + +void CondVar::Signal(){ + cv_.Signal(); +} + +void CondVar::SignalAll() { + cv_.Broadcast(); +} + +bool Snappy_Compress(const char* input, size_t input_length, + std::string* output) { +#if defined(USE_SNAPPY) + output->resize(snappy::MaxCompressedLength(input_length)); + size_t outlen; + snappy::RawCompress(input, input_length, &(*output)[0], &outlen); + output->resize(outlen); + return true; +#else + return false; +#endif +} + +bool Snappy_Uncompress(const char* input_data, size_t input_length, + std::string* output) { +#if defined(USE_SNAPPY) + size_t ulength; + if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) { + return false; + } + output->resize(ulength); + return snappy::RawUncompress(input_data, input_length, &(*output)[0]); +#else + return false; +#endif +} + +} +} diff --git a/leveldb/port/port_chromium.h b/leveldb/port/port_chromium.h new file mode 100644 index 0000000..1851e6e --- /dev/null +++ b/leveldb/port/port_chromium.h @@ -0,0 +1,97 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ +#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ + +#include +#include +#include +#include "base/atomicops.h" +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/synchronization/condition_variable.h" +#include "base/synchronization/lock.h" + +// Linux's ThreadIdentifier() needs this. +#if defined(OS_LINUX) +# include +#endif + +#if defined(OS_WIN) +#define snprintf _snprintf +#define va_copy(a, b) do { (a) = (b); } while (0) +#endif + +namespace leveldb { +namespace port { + +// Chromium only supports little endian. 
+static const bool kLittleEndian = true;
+
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+  void Lock();
+  void Unlock();
+  void AssertHeld();
+
+ private:
+  base::Lock mu_;
+
+  friend class CondVar;
+  DISALLOW_COPY_AND_ASSIGN(Mutex);
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  void Signal();
+  void SignalAll();
+
+ private:
+  base::ConditionVariable cv_;
+
+  DISALLOW_COPY_AND_ASSIGN(CondVar);
+};
+
+class AtomicPointer {
+ private:
+  typedef base::subtle::AtomicWord Rep;
+  Rep rep_;
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* p) : rep_(reinterpret_cast<Rep>(p)) {}
+  inline void* Acquire_Load() const {
+    return reinterpret_cast<void*>(::base::subtle::Acquire_Load(&rep_));
+  }
+  inline void Release_Store(void* v) {
+    ::base::subtle::Release_Store(&rep_, reinterpret_cast<Rep>(v));
+  }
+  inline void* NoBarrier_Load() const {
+    return reinterpret_cast<void*>(::base::subtle::NoBarrier_Load(&rep_));
+  }
+  inline void NoBarrier_Store(void* v) {
+    ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast<Rep>(v));
+  }
+};
+
+bool Snappy_Compress(const char* input, size_t input_length,
+                     std::string* output);
+bool Snappy_Uncompress(const char* input_data, size_t input_length,
+                       std::string* output);
+
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+  return false;
+}
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_
diff --git a/leveldb/port/port_example.h b/leveldb/port/port_example.h
new file mode 100644
index 0000000..8a624f3
--- /dev/null
+++ b/leveldb/port/port_example.h
@@ -0,0 +1,115 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This file contains the specification, but not the implementations,
+// of the types/operations/etc. that should be defined by a platform
+// specific port_<platform>.h file. Use this file as a reference for
+// how to port this package to a new platform.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
+#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
+
+namespace leveldb {
+namespace port {
+
+// TODO(jorlow): Many of these belong more in the environment class rather than
+//               here. We should try moving them and see if it affects perf.
+
+// The following boolean constant must be true on a little-endian machine
+// and false otherwise.
+static const bool kLittleEndian = true /* or some other expression */;
+
+// ------------------ Threading -------------------
+
+// A Mutex represents an exclusive lock.
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+
+  // Lock the mutex. Waits until other lockers have exited.
+  // Will deadlock if the mutex is already locked by this thread.
+  void Lock();
+
+  // Unlock the mutex.
+  // REQUIRES: This mutex was locked by this thread.
+  void Unlock();
+
+  // Optionally crash if this thread does not hold this mutex.
+  // The implementation must be fast, especially if NDEBUG is
+  // defined. The implementation is allowed to skip all checks.
+  void AssertHeld();
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+
+  // Atomically release *mu and block on this condition variable until
+  // either a call to SignalAll(), or a call to Signal() that picks
+  // this thread to wakeup.
+  // REQUIRES: this thread holds *mu
+  void Wait();
+
+  // If there are some threads waiting, wake up at least one of them.
+  void Signal();
+
+  // Wake up all waiting threads.
+  void SignalAll();
+};
+
+// A type that holds a pointer that can be read or written atomically
+// (i.e., without word-tearing.)
+class AtomicPointer {
+ private:
+  intptr_t rep_;
+ public:
+  // Initialize to arbitrary value
+  AtomicPointer();
+
+  // Initialize to hold v
+  explicit AtomicPointer(void* v) : rep_(reinterpret_cast<intptr_t>(v)) { }
+
+  // Read and return the stored pointer with the guarantee that no
+  // later memory access (read or write) by this thread can be
+  // reordered ahead of this read.
+  void* Acquire_Load() const;
+
+  // Set v as the stored pointer with the guarantee that no earlier
+  // memory access (read or write) by this thread can be reordered
+  // after this store.
+  void Release_Store(void* v);
+
+  // Read the stored pointer with no ordering guarantees.
+  void* NoBarrier_Load() const;
+
+  // Set v as the stored pointer with no ordering guarantees.
+  void NoBarrier_Store(void* v);
+};
+
+// ------------------ Compression -------------------
+
+// Store the snappy compression of "input[0,input_length-1]" in *output.
+// Returns false if snappy is not supported by this port.
+extern bool Snappy_Compress(const char* input, size_t input_length,
+                            std::string* output);
+
+// Attempt to snappy uncompress input[0,input_length-1] into *output.
+// Returns true if successful, false if the input is invalid lightweight
+// compressed data.
+extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
+                              std::string* output);
+
+// ------------------ Miscellaneous -------------------
+
+// If heap profiling is not supported, returns false.
+// Else repeatedly calls (*func)(arg, data, n) and then returns true.
+// The concatenation of all "data[0,n-1]" fragments is the heap profile.
+extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
diff --git a/leveldb/port/port_posix.cc b/leveldb/port/port_posix.cc
new file mode 100644
index 0000000..e75da8b
--- /dev/null
+++ b/leveldb/port/port_posix.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/port_posix.h"
+
+#include <cstdlib>
+#include <stdio.h>
+#include <string.h>
+#include "util/logging.h"
+
+namespace leveldb {
+namespace port {
+
+static void PthreadCall(const char* label, int result) {
+  if (result != 0) {
+    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+    abort();
+  }
+}
+
+Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }
+
+Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
+
+void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
+
+void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
+
+CondVar::CondVar(Mutex* mu)
+    : mu_(mu) {
+  PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
+}
+
+CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }
+
+void CondVar::Wait() {
+  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
+}
+
+void CondVar::Signal() {
+  PthreadCall("signal", pthread_cond_signal(&cv_));
+}
+
+void CondVar::SignalAll() {
+  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
+}
+
+}
+}
diff --git a/leveldb/port/port_posix.h b/leveldb/port/port_posix.h
new file mode 100644
index 0000000..c158db1
--- /dev/null
+++ b/leveldb/port/port_posix.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+
+#include <endian.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+#include <cstdatomic>
+
+namespace leveldb {
+namespace port {
+
+static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN);
+
+class CondVar;
+
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+
+  void Lock();
+  void Unlock();
+  void AssertHeld() { }
+
+ private:
+  friend class CondVar;
+  pthread_mutex_t mu_;
+
+  // No copying
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  void Signal();
+  void SignalAll();
+ private:
+  pthread_cond_t cv_;
+  Mutex* mu_;
+};
+
+// Storage for a lock-free pointer
+class AtomicPointer {
+ private:
+  std::atomic<void*> rep_;
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* v) : rep_(v) { }
+  inline void* Acquire_Load() const {
+    return rep_.load(std::memory_order_acquire);
+  }
+  inline void Release_Store(void* v) {
+    rep_.store(v, std::memory_order_release);
+  }
+  inline void* NoBarrier_Load() const {
+    return rep_.load(std::memory_order_relaxed);
+  }
+  inline void NoBarrier_Store(void* v) {
+    rep_.store(v, std::memory_order_relaxed);
+  }
+};
+
+// TODO(gabor): Implement actual compress
+inline bool Snappy_Compress(const char* input, size_t input_length,
+                            std::string* output) {
+  return false;
+}
+
+// TODO(gabor): Implement actual uncompress
+inline bool Snappy_Uncompress(const char* input_data, size_t input_length,
+                              std::string* output) {
+  return false;
+}
+
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+  return false;
+}
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_POSIX_H_
diff --git a/leveldb/port/win/stdint.h b/leveldb/port/win/stdint.h
new file mode 100644
index 0000000..39edd0d
--- /dev/null
+++ b/leveldb/port/win/stdint.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// MSVC didn't ship with this file until the 2010 version.
+
+#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_
+#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_
+
+#if !defined(_MSC_VER)
+#error This file should only be included when compiling with MSVC.
+#endif
+
+// Define C99 equivalent types.
+typedef signed char int8_t;
+typedef signed short int16_t;
+typedef signed int int32_t;
+typedef signed long long int64_t;
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+#endif  // STORAGE_LEVELDB_PORT_WIN_STDINT_H_
diff --git a/leveldb/table/block.cc b/leveldb/table/block.cc
new file mode 100644
index 0000000..92b2877
--- /dev/null
+++ b/leveldb/table/block.cc
@@ -0,0 +1,263 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Decodes the blocks generated by block_builder.cc.
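+// A block's trailer is a restart array followed by its length:
+//   restarts: uint32[num_restarts]
+//   num_restarts: uint32
+// As a worked example (made-up numbers): a 100-byte block holding two
+// restart points has its restart array at offset
+//   restart_offset_ = 100 - (1 + 2) * sizeof(uint32_t) = 88,
+// which is exactly what the Block constructor below computes and
+// sanity-checks.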
+ +#include "table/block.h" + +#include +#include +#include "leveldb/comparator.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +inline uint32_t Block::NumRestarts() const { + assert(size_ >= 2*sizeof(uint32_t)); + return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); +} + +Block::Block(const char* data, size_t size) + : data_(data), + size_(size) { + if (size_ < sizeof(uint32_t)) { + size_ = 0; // Error marker + } else { + restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. + size_ = 0; + } + } +} + +Block::~Block() { + delete[] data_; +} + +// Helper routine: decode the next block entry starting at "p", +// storing the number of shared key bytes, non_shared key bytes, +// and the length of the value in "*shared", "*non_shared", and +// "*value_length", respectively. Will not derefence past "limit". +// +// If any errors are detected, returns NULL. Otherwise, returns a +// pointer to the key delta (just past the three decoded values). +static inline const char* DecodeEntry(const char* p, const char* limit, + uint32_t* shared, + uint32_t* non_shared, + uint32_t* value_length) { + if (limit - p < 3) return NULL; + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + *value_length = reinterpret_cast(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL; + if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL; + } + + if (static_cast(limit - p) < (*non_shared + *value_length)) { + return NULL; + } + return p; +} + +class Block::Iter : public Iterator { + private: + const Comparator* const comparator_; + const char* const data_; // underlying block contents + uint32_t const restarts_; // Offset of restart array (list of fixed32) + uint32_t const num_restarts_; // Number of uint32_t entries in restart array + + // current_ is offset in data_ of current entry. >= restarts_ if !Valid + uint32_t current_; + uint32_t restart_index_; // Index of restart block in which current_ falls + std::string key_; + Slice value_; + Status status_; + + inline int Compare(const Slice& a, const Slice& b) const { + return comparator_->Compare(a, b); + } + + // Return the offset in data_ just past the end of the current entry. 
+ inline uint32_t NextEntryOffset() const { + return (value_.data() + value_.size()) - data_; + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + key_.clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + public: + Iter(const Comparator* comparator, + const char* data, + uint32_t restarts, + uint32_t num_restarts) + : comparator_(comparator), + data_(data), + restarts_(restarts), + num_restarts_(num_restarts), + current_(restarts_), + restart_index_(num_restarts_) { + assert(num_restarts_ > 0); + } + + virtual bool Valid() const { return current_ < restarts_; } + virtual Status status() const { return status_; } + virtual Slice key() const { + assert(Valid()); + return key_; + } + virtual Slice value() const { + assert(Valid()); + return value_; + } + + virtual void Next() { + assert(Valid()); + ParseNextKey(); + } + + virtual void Prev() { + assert(Valid()); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + do { + // Loop until end of current entry hits the start of original entry + } while (ParseNextKey() && NextEntryOffset() < original); + } + + virtual void Seek(const Slice& target) { + // Binary search in restart array to find the first restart point + // with a key >= target + uint32_t left = 0; + uint32_t right = num_restarts_ - 1; + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t region_offset = GetRestartPoint(mid); + uint32_t shared, non_shared, value_length; + const char* key_ptr = DecodeEntry(data_ + region_offset, + data_ + restarts_, + &shared, &non_shared, &value_length); + if (key_ptr == NULL || (shared != 0)) { + CorruptionError(); + return; + } + Slice mid_key(key_ptr, non_shared); + if (Compare(mid_key, target) < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } + } + + // Linear search (within restart block) for first key >= target + SeekToRestartPoint(left); + while (true) { + if (!ParseNextKey()) { + return; + } + if (Compare(key_, target) >= 0) { + return; + } + } + } + + virtual void SeekToFirst() { + SeekToRestartPoint(0); + ParseNextKey(); + } + + virtual void SeekToLast() { + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } + } + + private: + void CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + key_.clear(); + value_.clear(); + } + + bool ParseNextKey() { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. 
+ current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (p == NULL || key_.size() < shared) { + CorruptionError(); + return false; + } else { + key_.resize(shared); + key_.append(p, non_shared); + value_ = Slice(p + non_shared, value_length); + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + return true; + } + } +}; + +Iterator* Block::NewIterator(const Comparator* cmp) { + if (size_ < 2*sizeof(uint32_t)) { + return NewErrorIterator(Status::Corruption("bad block contents")); + } + const uint32_t num_restarts = NumRestarts(); + if (num_restarts == 0) { + return NewEmptyIterator(); + } else { + return new Iter(cmp, data_, restart_offset_, num_restarts); + } +} + +} diff --git a/leveldb/table/block.h b/leveldb/table/block.h new file mode 100644 index 0000000..cdf0598 --- /dev/null +++ b/leveldb/table/block.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_ +#define STORAGE_LEVELDB_TABLE_BLOCK_H_ + +#include +#include +#include "leveldb/iterator.h" + +namespace leveldb { + +class Comparator; + +class Block { + public: + // Initialize the block with the specified contents. + // Takes ownership of data[] and will delete[] it when done. + Block(const char* data, size_t size); + + ~Block(); + + size_t size() const { return size_; } + Iterator* NewIterator(const Comparator* comparator); + + private: + uint32_t NumRestarts() const; + + const char* data_; + size_t size_; + uint32_t restart_offset_; // Offset in data_ of restart array + + // No copying allowed + Block(const Block&); + void operator=(const Block&); + + class Iter; +}; + +} + +#endif // STORAGE_LEVELDB_TABLE_BLOCK_H_ diff --git a/leveldb/table/block_builder.cc b/leveldb/table/block_builder.cc new file mode 100644 index 0000000..dc958c8 --- /dev/null +++ b/leveldb/table/block_builder.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// BlockBuilder generates blocks where keys are prefix-compressed: +// +// When we store a key, we drop the prefix shared with the previous +// string. This helps reduce the space requirement significantly. +// Furthermore, once every K keys, we do not apply the prefix +// compression and store the entire key. We call this a "restart +// point". The tail end of the block stores the offsets of all of the +// restart points, and can be used to do a binary search when looking +// for a particular key. Values are stored as-is (without compression) +// immediately following the corresponding key. +// +// An entry for a particular key-value pair has the form: +// shared_bytes: varint32 +// unshared_bytes: varint32 +// value_length: varint32 +// key_delta: char[unshared_bytes] +// value: char[value_length] +// shared_bytes == 0 for restart points. +// +// The trailer of the block has the form: +// restarts: uint32[num_restarts] +// num_restarts: uint32 +// restarts[i] contains the offset within the block of the ith restart point. 
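+// A worked example of the entry encoding above (illustrative keys, not
+// taken from a real table): adding "apple" with value "v1" and then
+// "applesauce" with value "v2" emits
+//   "apple":      shared=0, unshared=5, value_length=2, key_delta="apple"
+//   "applesauce": shared=5, unshared=5, value_length=2, key_delta="sauce"
+// so the second entry stores only the 5-byte suffix "sauce".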
+ +#include "table/block_builder.h" + +#include +#include +#include "leveldb/comparator.h" +#include "leveldb/table_builder.h" +#include "util/coding.h" + +namespace leveldb { + +BlockBuilder::BlockBuilder(const Options* options) + : options_(options), + restarts_(), + counter_(0), + finished_(false) { + assert(options->block_restart_interval >= 1); + restarts_.push_back(0); // First restart point is at offset 0 +} + +void BlockBuilder::Reset() { + buffer_.clear(); + restarts_.clear(); + restarts_.push_back(0); // First restart point is at offset 0 + counter_ = 0; + finished_ = false; + last_key_.clear(); +} + +size_t BlockBuilder::CurrentSizeEstimate() const { + return (buffer_.size() + // Raw data buffer + restarts_.size() * sizeof(uint32_t) + // Restart array + sizeof(uint32_t)); // Restart array length +} + +Slice BlockBuilder::Finish() { + // Append restart array + for (size_t i = 0; i < restarts_.size(); i++) { + PutFixed32(&buffer_, restarts_[i]); + } + PutFixed32(&buffer_, restarts_.size()); + finished_ = true; + return Slice(buffer_); +} + +void BlockBuilder::Add(const Slice& key, const Slice& value) { + Slice last_key_piece(last_key_); + assert(!finished_); + assert(counter_ <= options_->block_restart_interval); + assert(buffer_.empty() // No values yet? + || options_->comparator->Compare(key, last_key_piece) > 0); + size_t shared = 0; + if (counter_ < options_->block_restart_interval) { + // See how much sharing to do with previous string + const size_t min_length = std::min(last_key_piece.size(), key.size()); + while ((shared < min_length) && (last_key_[shared] == key[shared])) { + shared++; + } + } else { + // Restart compression + restarts_.push_back(buffer_.size()); + counter_ = 0; + } + const size_t non_shared = key.size() - shared; + + // Add "" to buffer_ + PutVarint32(&buffer_, shared); + PutVarint32(&buffer_, non_shared); + PutVarint32(&buffer_, value.size()); + + // Add string delta to buffer_ followed by value + buffer_.append(key.data() + shared, non_shared); + buffer_.append(value.data(), value.size()); + + // Update state + last_key_.resize(shared); + last_key_.append(key.data() + shared, non_shared); + assert(Slice(last_key_) == key); + counter_++; +} + +} diff --git a/leveldb/table/block_builder.h b/leveldb/table/block_builder.h new file mode 100644 index 0000000..bf92a0f --- /dev/null +++ b/leveldb/table/block_builder.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ +#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ + +#include + +#include +#include "leveldb/slice.h" + +namespace leveldb { + +struct Options; + +class BlockBuilder { + public: + explicit BlockBuilder(const Options* options); + + // Reset the contents as if the BlockBuilder was just constructed. + void Reset(); + + // REQUIRES: Finish() has not been callled since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + void Add(const Slice& key, const Slice& value); + + // Finish building the block and return a slice that refers to the + // block contents. The returned slice will remain valid for the + // lifetime of this builder or until Reset() is called. + Slice Finish(); + + // Returns an estimate of the current (uncompressed) size of the block + // we are building. 
+  size_t CurrentSizeEstimate() const;
+
+  // Return true iff no entries have been added since the last Reset()
+  bool empty() const {
+    return buffer_.empty();
+  }
+
+ private:
+  const Options* options_;
+  std::string buffer_;              // Destination buffer
+  std::vector<uint32_t> restarts_;  // Restart points
+  int counter_;                     // Number of entries emitted since restart
+  bool finished_;                   // Has Finish() been called?
+  std::string last_key_;
+
+  // No copying allowed
+  BlockBuilder(const BlockBuilder&);
+  void operator=(const BlockBuilder&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
diff --git a/leveldb/table/format.cc b/leveldb/table/format.cc
new file mode 100644
index 0000000..63971db
--- /dev/null
+++ b/leveldb/table/format.cc
@@ -0,0 +1,131 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/format.h"
+
+#include "leveldb/env.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+
+void BlockHandle::EncodeTo(std::string* dst) const {
+  // Sanity check that all fields have been set
+  assert(offset_ != ~static_cast<uint64_t>(0));
+  assert(size_ != ~static_cast<uint64_t>(0));
+  PutVarint64(dst, offset_);
+  PutVarint64(dst, size_);
+}
+
+Status BlockHandle::DecodeFrom(Slice* input) {
+  if (GetVarint64(input, &offset_) &&
+      GetVarint64(input, &size_)) {
+    return Status::OK();
+  } else {
+    return Status::Corruption("bad block handle");
+  }
+}
+
+void Footer::EncodeTo(std::string* dst) const {
+#ifndef NDEBUG
+  const size_t original_size = dst->size();
+#endif
+  metaindex_handle_.EncodeTo(dst);
+  index_handle_.EncodeTo(dst);
+  dst->resize(2 * BlockHandle::kMaxEncodedLength);  // Padding
+  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber & 0xffffffffu));
+  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
+  assert(dst->size() == original_size + kEncodedLength);
+}
+
+Status Footer::DecodeFrom(Slice* input) {
+  const char* magic_ptr = input->data() + kEncodedLength - 8;
+  const uint32_t magic_lo = DecodeFixed32(magic_ptr);
+  const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
+  const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
+                          (static_cast<uint64_t>(magic_lo)));
+  if (magic != kTableMagicNumber) {
+    return Status::InvalidArgument("not an sstable (bad magic number)");
+  }
+
+  Status result = metaindex_handle_.DecodeFrom(input);
+  if (result.ok()) {
+    result = index_handle_.DecodeFrom(input);
+  }
+  if (result.ok()) {
+    // We skip over any leftover data (just padding for now) in "input"
+    const char* end = magic_ptr + 8;
+    *input = Slice(end, input->data() + input->size() - end);
+  }
+  return result;
+}
+
+Status ReadBlock(RandomAccessFile* file,
+                 const ReadOptions& options,
+                 const BlockHandle& handle,
+                 Block** block) {
+  *block = NULL;
+
+  // Read the block contents as well as the type/crc footer.
+  // See table_builder.cc for the code that built this structure.
+  size_t n = static_cast<size_t>(handle.size());
+  char* buf = new char[n + kBlockTrailerSize];
+  Slice contents;
+  Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
+  if (!s.ok()) {
+    delete[] buf;
+    return s;
+  }
+  if (contents.size() != n + kBlockTrailerSize) {
+    delete[] buf;
+    return Status::Corruption("truncated block read");
+  }
+
+  // Check the crc of the type and the block contents
+  const char* data = contents.data();  // Pointer to where Read put the data
+  if (options.verify_checksums) {
+    const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
+    const uint32_t actual = crc32c::Value(data, n + 1);
+    if (actual != crc) {
+      delete[] buf;
+      s = Status::Corruption("block checksum mismatch");
+      return s;
+    }
+  }
+
+  switch (data[n]) {
+    case kNoCompression:
+      if (data != buf) {
+        // File implementation gave us pointer to some other data.
+        // Copy into buf[].
+        memcpy(buf, data, n + kBlockTrailerSize);
+      }
+
+      // Ok
+      break;
+    case kSnappyCompression: {
+      std::string decompressed;
+      if (!port::Snappy_Uncompress(data, n, &decompressed)) {
+        delete[] buf;
+        s = Status::Corruption("corrupted compressed block contents");
+        return s;
+      }
+      delete[] buf;  // Done with uncompressed data
+      buf = new char[decompressed.size()];
+      memcpy(buf, decompressed.data(), decompressed.size());
+      n = decompressed.size();
+      break;
+    }
+    default:
+      delete[] buf;
+      return Status::Corruption("bad block type");
+  }
+
+  *block = new Block(buf, n);  // Block takes ownership of buf[]
+  return Status::OK();
+}
+
+}
diff --git a/leveldb/table/format.h b/leveldb/table/format.h
new file mode 100644
index 0000000..a6ab964
--- /dev/null
+++ b/leveldb/table/format.h
@@ -0,0 +1,103 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_
+#define STORAGE_LEVELDB_TABLE_FORMAT_H_
+
+#include <string>
+#include <stdint.h>
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+#include "leveldb/table_builder.h"
+
+namespace leveldb {
+
+class Block;
+class RandomAccessFile;
+struct ReadOptions;
+
+// BlockHandle is a pointer to the extent of a file that stores a data
+// block or a meta block.
+class BlockHandle {
+ public:
+  BlockHandle();
+
+  // The offset of the block in the file.
+  uint64_t offset() const { return offset_; }
+  void set_offset(uint64_t offset) { offset_ = offset; }
+
+  // The size of the stored block
+  uint64_t size() const { return size_; }
+  void set_size(uint64_t size) { size_ = size; }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(Slice* input);
+
+  // Maximum encoding length of a BlockHandle
+  enum { kMaxEncodedLength = 10 + 10 };
+
+ private:
+  uint64_t offset_;
+  uint64_t size_;
+};
+
+// Footer encapsulates the fixed information stored at the tail
+// end of every table file.
+class Footer {
+ public:
+  Footer() { }
+
+  // The block handle for the metaindex block of the table
+  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
+  void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }
+
+  // The block handle for the index block of the table
+  const BlockHandle& index_handle() const {
+    return index_handle_;
+  }
+  void set_index_handle(const BlockHandle& h) {
+    index_handle_ = h;
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(Slice* input);
+
+  // Encoded length of a Footer. Note that the serialization of a
+  // Footer will always occupy exactly this many bytes. It consists
+  // of two block handles and a magic number.
+  enum {
+    kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8
+  };
+
+ private:
+  BlockHandle metaindex_handle_;
+  BlockHandle index_handle_;
+};
+
+// kTableMagicNumber was picked by running
+//    echo http://code.google.com/p/leveldb/ | sha1sum
+// and taking the leading 64 bits.
+static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
+
+// 1-byte type + 32-bit crc
+static const size_t kBlockTrailerSize = 5;
+
+// Read the block identified by "handle" from "file". On success,
+// store a pointer to the heap-allocated result in *block and return
+// OK. On failure store NULL in *block and return non-OK.
+extern Status ReadBlock(RandomAccessFile* file,
+                        const ReadOptions& options,
+                        const BlockHandle& handle,
+                        Block** block);
+
+// Implementation details follow. Clients should ignore.
+
+inline BlockHandle::BlockHandle()
+    : offset_(~static_cast<uint64_t>(0)),
+      size_(~static_cast<uint64_t>(0)) {
+}
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_FORMAT_H_
diff --git a/leveldb/table/iterator.cc b/leveldb/table/iterator.cc
new file mode 100644
index 0000000..4ddd55f
--- /dev/null
+++ b/leveldb/table/iterator.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/iterator.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+Iterator::Iterator() {
+  cleanup_.function = NULL;
+  cleanup_.next = NULL;
+}
+
+Iterator::~Iterator() {
+  if (cleanup_.function != NULL) {
+    (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+    for (Cleanup* c = cleanup_.next; c != NULL; ) {
+      (*c->function)(c->arg1, c->arg2);
+      Cleanup* next = c->next;
+      delete c;
+      c = next;
+    }
+  }
+}
+
+void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
+  assert(func != NULL);
+  Cleanup* c;
+  if (cleanup_.function == NULL) {
+    c = &cleanup_;
+  } else {
+    c = new Cleanup;
+    c->next = cleanup_.next;
+    cleanup_.next = c;
+  }
+  c->function = func;
+  c->arg1 = arg1;
+  c->arg2 = arg2;
+}
+
+namespace {
+class EmptyIterator : public Iterator {
+ public:
+  EmptyIterator(const Status& s) : status_(s) { }
+  virtual bool Valid() const { return false; }
+  virtual void Seek(const Slice& target) { }
+  virtual void SeekToFirst() { }
+  virtual void SeekToLast() { }
+  virtual void Next() { assert(false); }
+  virtual void Prev() { assert(false); }
+  Slice key() const { assert(false); return Slice(); }
+  Slice value() const { assert(false); return Slice(); }
+  virtual Status status() const { return status_; }
+ private:
+  Status status_;
+};
+}
+
+Iterator* NewEmptyIterator() {
+  return new EmptyIterator(Status::OK());
+}
+
+Iterator* NewErrorIterator(const Status& status) {
+  return new EmptyIterator(status);
+}
+
+}
diff --git a/leveldb/table/iterator_wrapper.h b/leveldb/table/iterator_wrapper.h
new file mode 100644
index 0000000..158d3a7
--- /dev/null
+++ b/leveldb/table/iterator_wrapper.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
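+// A side note on resource management (a sketch mirroring what table.cc
+// further below actually does): RegisterCleanup(), defined in iterator.cc
+// above, ties a resource's lifetime to an iterator:
+//
+//   static void DeleteBlock(void* arg, void* /*ignored*/) {
+//     delete reinterpret_cast<Block*>(arg);
+//   }
+//   ...
+//   Iterator* it = block->NewIterator(comparator);
+//   it->RegisterCleanup(&DeleteBlock, block, NULL);  // freed with "it"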
+ +#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ +#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ + +namespace leveldb { + +// A internal wrapper class with an interface similar to Iterator that +// caches the valid() and key() results for an underlying iterator. +// This can help avoid virtual function calls and also gives better +// cache locality. +class IteratorWrapper { + private: + Iterator* iter_; + bool valid_; + Slice key_; + public: + IteratorWrapper(): iter_(NULL), valid_(false) { } + explicit IteratorWrapper(Iterator* iter): iter_(NULL) { + Set(iter); + } + ~IteratorWrapper() { delete iter_; } + Iterator* iter() const { return iter_; } + + // Takes ownership of "iter" and will delete it when destroyed, or + // when Set() is invoked again. + void Set(Iterator* iter) { + delete iter_; + iter_ = iter; + if (iter_ == NULL) { + valid_ = false; + } else { + Update(); + } + } + + + // Iterator interface methods + bool Valid() const { return valid_; } + Slice key() const { assert(Valid()); return key_; } + Slice value() const { assert(Valid()); return iter_->value(); } + // Methods below require iter() != NULL + Status status() const { assert(iter_); return iter_->status(); } + void Next() { assert(iter_); iter_->Next(); Update(); } + void Prev() { assert(iter_); iter_->Prev(); Update(); } + void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } + void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + + private: + void Update() { + valid_ = iter_->Valid(); + if (valid_) { + key_ = iter_->key(); + } + } +}; + +} + + +#endif // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ diff --git a/leveldb/table/merger.cc b/leveldb/table/merger.cc new file mode 100644 index 0000000..6ce06bb --- /dev/null +++ b/leveldb/table/merger.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/merger.h" + +#include "leveldb/comparator.h" +#include "leveldb/iterator.h" +#include "table/iterator_wrapper.h" + +namespace leveldb { + +namespace { +class MergingIterator : public Iterator { + public: + MergingIterator(const Comparator* comparator, Iterator** children, int n) + : comparator_(comparator), + children_(new IteratorWrapper[n]), + n_(n), + current_(NULL), + direction_(kForward) { + for (int i = 0; i < n; i++) { + children_[i].Set(children[i]); + } + } + + virtual ~MergingIterator() { + delete[] children_; + } + + virtual bool Valid() const { + return (current_ != NULL); + } + + virtual void SeekToFirst() { + for (int i = 0; i < n_; i++) { + children_[i].SeekToFirst(); + } + FindSmallest(); + direction_ = kForward; + } + + virtual void SeekToLast() { + for (int i = 0; i < n_; i++) { + children_[i].SeekToLast(); + } + FindLargest(); + direction_ = kReverse; + } + + virtual void Seek(const Slice& target) { + for (int i = 0; i < n_; i++) { + children_[i].Seek(target); + } + FindSmallest(); + direction_ = kForward; + } + + virtual void Next() { + assert(Valid()); + + // Ensure that all children are positioned after key(). + // If we are moving in the forward direction, it is already + // true for all of the non-current_ children since current_ is + // the smallest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. 
+ if (direction_ != kForward) { + for (int i = 0; i < n_; i++) { + IteratorWrapper* child = &children_[i]; + if (child != current_) { + child->Seek(key()); + if (child->Valid() && + comparator_->Compare(key(), child->key()) == 0) { + child->Next(); + } + } + } + direction_ = kForward; + } + + current_->Next(); + FindSmallest(); + } + + virtual void Prev() { + assert(Valid()); + + // Ensure that all children are positioned before key(). + // If we are moving in the reverse direction, it is already + // true for all of the non-current_ children since current_ is + // the largest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. + if (direction_ != kReverse) { + for (int i = 0; i < n_; i++) { + IteratorWrapper* child = &children_[i]; + if (child != current_) { + child->Seek(key()); + if (child->Valid()) { + // Child is at first entry >= key(). Step back one to be < key() + child->Prev(); + } else { + // Child has no entries >= key(). Position at last entry. + child->SeekToLast(); + } + } + } + direction_ = kReverse; + } + + current_->Prev(); + FindLargest(); + } + + virtual Slice key() const { + assert(Valid()); + return current_->key(); + } + + virtual Slice value() const { + assert(Valid()); + return current_->value(); + } + + virtual Status status() const { + Status status; + for (int i = 0; i < n_; i++) { + status = children_[i].status(); + if (!status.ok()) { + break; + } + } + return status; + } + + private: + void FindSmallest(); + void FindLargest(); + + // We might want to use a heap in case there are lots of children. + // For now we use a simple array since we expect a very small number + // of children in leveldb. + const Comparator* comparator_; + IteratorWrapper* children_; + int n_; + IteratorWrapper* current_; + + // Which direction is the iterator moving? + enum Direction { + kForward, + kReverse + }; + Direction direction_; +}; + +void MergingIterator::FindSmallest() { + IteratorWrapper* smallest = NULL; + for (int i = 0; i < n_; i++) { + IteratorWrapper* child = &children_[i]; + if (child->Valid()) { + if (smallest == NULL) { + smallest = child; + } else if (comparator_->Compare(child->key(), smallest->key()) < 0) { + smallest = child; + } + } + } + current_ = smallest; +} + +void MergingIterator::FindLargest() { + IteratorWrapper* largest = NULL; + for (int i = n_-1; i >= 0; i--) { + IteratorWrapper* child = &children_[i]; + if (child->Valid()) { + if (largest == NULL) { + largest = child; + } else if (comparator_->Compare(child->key(), largest->key()) > 0) { + largest = child; + } + } + } + current_ = largest; +} +} + +Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { + assert(n >= 0); + if (n == 0) { + return NewEmptyIterator(); + } else if (n == 1) { + return list[0]; + } else { + return new MergingIterator(cmp, list, n); + } +} + +} diff --git a/leveldb/table/merger.h b/leveldb/table/merger.h new file mode 100644 index 0000000..71d9dc5 --- /dev/null +++ b/leveldb/table/merger.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_ +#define STORAGE_LEVELDB_TABLE_MERGER_H_ + +namespace leveldb { + +class Comparator; +class Iterator; + +// Return an iterator that provided the union of the data in +// children[0,n-1]. 
Takes ownership of the child iterators and
+// will delete them when the result iterator is deleted.
+//
+// The result does no duplicate suppression. I.e., if a particular
+// key is present in K child iterators, it will be yielded K times.
+//
+// REQUIRES: n >= 0
+extern Iterator* NewMergingIterator(
+    const Comparator* comparator, Iterator** children, int n);
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_MERGER_H_
diff --git a/leveldb/table/table.cc b/leveldb/table/table.cc
new file mode 100644
index 0000000..9820753
--- /dev/null
+++ b/leveldb/table/table.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/table.h"
+
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
+#include "table/block.h"
+#include "table/format.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+struct Table::Rep {
+  ~Rep() {
+    delete index_block;
+  }
+
+  Options options;
+  Status status;
+  RandomAccessFile* file;
+  uint64_t cache_id;
+
+  BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
+  Block* index_block;
+};
+
+Status Table::Open(const Options& options,
+                   RandomAccessFile* file,
+                   uint64_t size,
+                   Table** table) {
+  *table = NULL;
+  if (size < Footer::kEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  char footer_space[Footer::kEncodedLength];
+  Slice footer_input;
+  Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
+                        &footer_input, footer_space);
+  if (!s.ok()) return s;
+
+  Footer footer;
+  s = footer.DecodeFrom(&footer_input);
+  if (!s.ok()) return s;
+
+  // Read the index block
+  Block* index_block = NULL;
+  if (s.ok()) {
+    s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block);
+  }
+
+  if (s.ok()) {
+    // We've successfully read the footer and the index block: we're
+    // ready to serve requests.
+    Rep* rep = new Table::Rep;
+    rep->options = options;
+    rep->file = file;
+    rep->metaindex_handle = footer.metaindex_handle();
+    rep->index_block = index_block;
+    rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
+    *table = new Table(rep);
+  } else {
+    if (index_block) delete index_block;
+  }
+
+  return s;
+}
+
+Table::~Table() {
+  delete rep_;
+}
+
+static void DeleteBlock(void* arg, void* ignored) {
+  delete reinterpret_cast<Block*>(arg);
+}
+
+static void DeleteCachedBlock(const Slice& key, void* value) {
+  Block* block = reinterpret_cast<Block*>(value);
+  delete block;
+}
+
+static void ReleaseBlock(void* arg, void* h) {
+  Cache* cache = reinterpret_cast<Cache*>(arg);
+  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
+  cache->Release(handle);
+}
+
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+Iterator* Table::BlockReader(void* arg,
+                             const ReadOptions& options,
+                             const Slice& index_value) {
+  Table* table = reinterpret_cast<Table*>(arg);
+  Cache* block_cache = table->rep_->options.block_cache;
+  Block* block = NULL;
+  Cache::Handle* cache_handle = NULL;
+
+  BlockHandle handle;
+  Slice input = index_value;
+  Status s = handle.DecodeFrom(&input);
+  // We intentionally allow extra stuff in index_value so that we
+  // can add more features in the future.
+ + if (s.ok()) { + if (block_cache != NULL) { + char cache_key_buffer[16]; + EncodeFixed64(cache_key_buffer, table->rep_->cache_id); + EncodeFixed64(cache_key_buffer+8, handle.offset()); + Slice key(cache_key_buffer, sizeof(cache_key_buffer)); + cache_handle = block_cache->Lookup(key); + if (cache_handle != NULL) { + block = reinterpret_cast(block_cache->Value(cache_handle)); + } else { + s = ReadBlock(table->rep_->file, options, handle, &block); + if (s.ok() && options.fill_cache) { + cache_handle = block_cache->Insert( + key, block, block->size(), &DeleteCachedBlock); + } + } + } else { + s = ReadBlock(table->rep_->file, options, handle, &block); + } + } + + Iterator* iter; + if (block != NULL) { + iter = block->NewIterator(table->rep_->options.comparator); + if (cache_handle == NULL) { + iter->RegisterCleanup(&DeleteBlock, block, NULL); + } else { + iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle); + } + } else { + iter = NewErrorIterator(s); + } + return iter; +} + +Iterator* Table::NewIterator(const ReadOptions& options) const { + return NewTwoLevelIterator( + rep_->index_block->NewIterator(rep_->options.comparator), + &Table::BlockReader, const_cast(this), options); +} + +uint64_t Table::ApproximateOffsetOf(const Slice& key) const { + Iterator* index_iter = + rep_->index_block->NewIterator(rep_->options.comparator); + index_iter->Seek(key); + uint64_t result; + if (index_iter->Valid()) { + BlockHandle handle; + Slice input = index_iter->value(); + Status s = handle.DecodeFrom(&input); + if (s.ok()) { + result = handle.offset(); + } else { + // Strange: we can't decode the block handle in the index block. + // We'll just return the offset of the metaindex block, which is + // close to the whole file size for this case. + result = rep_->metaindex_handle.offset(); + } + } else { + // key is past the last key in the file. Approximate the offset + // by returning the offset of the metaindex block (which is + // right near the end of the file). + result = rep_->metaindex_handle.offset(); + } + delete index_iter; + return result; +} + +} diff --git a/leveldb/table/table_builder.cc b/leveldb/table/table_builder.cc new file mode 100644 index 0000000..7ec7ad2 --- /dev/null +++ b/leveldb/table/table_builder.cc @@ -0,0 +1,227 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/table_builder.h" + +#include +#include +#include "leveldb/comparator.h" +#include "leveldb/env.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/logging.h" + +namespace leveldb { + +struct TableBuilder::Rep { + Options options; + Options index_block_options; + WritableFile* file; + uint64_t offset; + Status status; + BlockBuilder data_block; + BlockBuilder index_block; + std::string last_key; + int64_t num_entries; + bool closed; // Either Finish() or Abandon() has been called. + + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. 
+ // + // Invariant: r->pending_index_entry is true only if data_block is empty. + bool pending_index_entry; + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + + Rep(const Options& opt, WritableFile* f) + : options(opt), + index_block_options(opt), + file(f), + offset(0), + data_block(&options), + index_block(&index_block_options), + num_entries(0), + closed(false), + pending_index_entry(false) { + index_block_options.block_restart_interval = 1; + } +}; + +TableBuilder::TableBuilder(const Options& options, WritableFile* file) + : rep_(new Rep(options, file)) { +} + +TableBuilder::~TableBuilder() { + assert(rep_->closed); // Catch errors where caller forgot to call Finish() + delete rep_; +} + +Status TableBuilder::ChangeOptions(const Options& options) { + // Note: if more fields are added to Options, update + // this function to catch changes that should not be allowed to + // change in the middle of building a Table. + if (options.comparator != rep_->options.comparator) { + return Status::InvalidArgument("changing comparator while building table"); + } + + // Note that any live BlockBuilders point to rep_->options and therefore + // will automatically pick up the updated options. + rep_->options = options; + rep_->index_block_options = options; + rep_->index_block_options.block_restart_interval = 1; + return Status::OK(); +} + +void TableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->num_entries > 0) { + assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); + } + + if (r->pending_index_entry) { + assert(r->data_block.empty()); + r->options.comparator->FindShortestSeparator(&r->last_key, key); + std::string handle_encoding; + r->pending_handle.EncodeTo(&handle_encoding); + r->index_block.Add(r->last_key, Slice(handle_encoding)); + r->pending_index_entry = false; + } + + r->last_key.assign(key.data(), key.size()); + r->num_entries++; + r->data_block.Add(key, value); + + const size_t estimated_block_size = r->data_block.CurrentSizeEstimate(); + if (estimated_block_size >= r->options.block_size) { + Flush(); + } +} + +void TableBuilder::Flush() { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->data_block.empty()) return; + assert(!r->pending_index_entry); + WriteBlock(&r->data_block, &r->pending_handle); + if (ok()) { + r->pending_index_entry = true; + r->status = r->file->Flush(); + } +} + +void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // type: uint8 + // crc: uint32 + assert(ok()); + Rep* r = rep_; + Slice raw = block->Finish(); + + Slice block_contents; + CompressionType type = r->options.compression; + // TODO(postrelease): Support more compression options: zlib? 
+ switch (type) { + case kNoCompression: + block_contents = raw; + break; + + case kSnappyCompression: { + std::string* compressed = &r->compressed_output; + if (port::Snappy_Compress(raw.data(), raw.size(), compressed) && + compressed->size() < raw.size() - (raw.size() / 8u)) { + block_contents = *compressed; + } else { + // Snappy not supported, or compressed less than 12.5%, so just + // store uncompressed form + block_contents = raw; + type = kNoCompression; + } + break; + } + } + handle->set_offset(r->offset); + handle->set_size(block_contents.size()); + r->status = r->file->Append(block_contents); + if (r->status.ok()) { + char trailer[kBlockTrailerSize]; + trailer[0] = type; + uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size()); + crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type + EncodeFixed32(trailer+1, crc32c::Mask(crc)); + r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); + if (r->status.ok()) { + r->offset += block_contents.size() + kBlockTrailerSize; + } + } + r->compressed_output.clear(); + block->Reset(); +} + +Status TableBuilder::status() const { + return rep_->status; +} + +Status TableBuilder::Finish() { + Rep* r = rep_; + Flush(); + assert(!r->closed); + r->closed = true; + BlockHandle metaindex_block_handle; + BlockHandle index_block_handle; + if (ok()) { + BlockBuilder meta_index_block(&r->options); + // TODO(postrelease): Add stats and other meta blocks + WriteBlock(&meta_index_block, &metaindex_block_handle); + } + if (ok()) { + if (r->pending_index_entry) { + r->options.comparator->FindShortSuccessor(&r->last_key); + std::string handle_encoding; + r->pending_handle.EncodeTo(&handle_encoding); + r->index_block.Add(r->last_key, Slice(handle_encoding)); + r->pending_index_entry = false; + } + WriteBlock(&r->index_block, &index_block_handle); + } + if (ok()) { + Footer footer; + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(index_block_handle); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + r->status = r->file->Append(footer_encoding); + if (r->status.ok()) { + r->offset += footer_encoding.size(); + } + } + return r->status; +} + +void TableBuilder::Abandon() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; +} + +uint64_t TableBuilder::NumEntries() const { + return rep_->num_entries; +} + +uint64_t TableBuilder::FileSize() const { + return rep_->offset; +} + +} diff --git a/leveldb/table/table_test.cc b/leveldb/table/table_test.cc new file mode 100644 index 0000000..4b3e85e --- /dev/null +++ b/leveldb/table/table_test.cc @@ -0,0 +1,841 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/table.h" + +#include +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "leveldb/table_builder.h" +#include "table/block.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +// Return reverse of "key". +// Used to test non-lexicographic comparators. 
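+// For example, Reverse("abc") == "cba", so under ReverseKeyComparator
+// "ba" orders before "ab" (their reversals "ab" and "ba" are compared
+// bytewise).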
+static std::string Reverse(const Slice& key) { + std::string str(key.ToString()); + std::string rev(str.rbegin(), str.rend()); + return rev; +} + +namespace { +class ReverseKeyComparator : public Comparator { + public: + virtual const char* Name() const { + return "leveldb.ReverseBytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + std::string s = Reverse(*start); + std::string l = Reverse(limit); + BytewiseComparator()->FindShortestSeparator(&s, l); + *start = Reverse(s); + } + + virtual void FindShortSuccessor(std::string* key) const { + std::string s = Reverse(*key); + BytewiseComparator()->FindShortSuccessor(&s); + *key = Reverse(s); + } +}; +} +static ReverseKeyComparator reverse_key_comparator; + +static void Increment(const Comparator* cmp, std::string* key) { + if (cmp == BytewiseComparator()) { + key->push_back('\0'); + } else { + assert(cmp == &reverse_key_comparator); + std::string rev = Reverse(*key); + rev.push_back('\0'); + *key = Reverse(rev); + } +} + +// An STL comparator that uses a Comparator +namespace { +struct STLLessThan { + const Comparator* cmp; + + STLLessThan() : cmp(BytewiseComparator()) { } + STLLessThan(const Comparator* c) : cmp(c) { } + bool operator()(const std::string& a, const std::string& b) const { + return cmp->Compare(Slice(a), Slice(b)) < 0; + } +}; +} + +class StringSink: public WritableFile { + public: + ~StringSink() { } + + const std::string& contents() const { return contents_; } + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + + virtual Status Append(const Slice& data) { + contents_.append(data.data(), data.size()); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class StringSource: public RandomAccessFile { + public: + StringSource(const Slice& contents) + : contents_(contents.data(), contents.size()) { + } + + virtual ~StringSource() { } + + uint64_t Size() const { return contents_.size(); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (offset > contents_.size()) { + return Status::InvalidArgument("invalid Read offset"); + } + if (offset + n > contents_.size()) { + n = contents_.size() - offset; + } + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + return Status::OK(); + } + + private: + std::string contents_; +}; + +typedef std::map KVMap; + +// Helper class for tests to unify the interface between +// BlockBuilder/TableBuilder and Block/Table. +class Constructor { + public: + explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { } + virtual ~Constructor() { } + + void Add(const std::string& key, const Slice& value) { + data_[key] = value.ToString(); + } + + // Finish constructing the data structure with all the keys that have + // been added so far. 
Returns the keys in sorted order in "*keys" + // and stores the key/value pairs in "*kvmap" + void Finish(const Options& options, + std::vector* keys, + KVMap* kvmap) { + *kvmap = data_; + keys->clear(); + for (KVMap::const_iterator it = data_.begin(); + it != data_.end(); + ++it) { + keys->push_back(it->first); + } + data_.clear(); + Status s = FinishImpl(options, *kvmap); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + // Construct the data structure from the data in "data" + virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; + + virtual size_t NumBytes() const = 0; + + virtual Iterator* NewIterator() const = 0; + + virtual const KVMap& data() { return data_; } + + virtual DB* db() const { return NULL; } // Overridden in DBConstructor + + private: + KVMap data_; +}; + +class BlockConstructor: public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp), + block_size_(-1), + block_(NULL) { } + ~BlockConstructor() { + delete block_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete block_; + block_ = NULL; + BlockBuilder builder(&options); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + builder.Add(it->first, it->second); + } + // Open the block + Slice block_data = builder.Finish(); + block_size_ = block_data.size(); + char* block_data_copy = new char[block_size_]; + memcpy(block_data_copy, block_data.data(), block_size_); + block_ = new Block(block_data_copy, block_size_); + return Status::OK(); + } + virtual size_t NumBytes() const { return block_size_; } + + virtual Iterator* NewIterator() const { + return block_->NewIterator(comparator_); + } + + private: + const Comparator* comparator_; + int block_size_; + Block* block_; + + BlockConstructor(); +}; + +class TableConstructor: public Constructor { + public: + TableConstructor(const Comparator* cmp) + : Constructor(cmp), + source_(NULL), table_(NULL) { + } + ~TableConstructor() { + Reset(); + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + Reset(); + StringSink sink; + TableBuilder builder(options, &sink); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + builder.Add(it->first, it->second); + ASSERT_TRUE(builder.status().ok()); + } + Status s = builder.Finish(); + ASSERT_TRUE(s.ok()) << s.ToString(); + + ASSERT_EQ(sink.contents().size(), builder.FileSize()); + + // Open the table + source_ = new StringSource(sink.contents()); + Options table_options; + table_options.comparator = options.comparator; + return Table::Open(table_options, source_, sink.contents().size(), &table_); + } + virtual size_t NumBytes() const { return source_->Size(); } + + virtual Iterator* NewIterator() const { + return table_->NewIterator(ReadOptions()); + } + + uint64_t ApproximateOffsetOf(const Slice& key) const { + return table_->ApproximateOffsetOf(key); + } + + private: + void Reset() { + delete table_; + delete source_; + table_ = NULL; + source_ = NULL; + } + + StringSource* source_; + Table* table_; + + TableConstructor(); +}; + +// A helper class that converts internal format keys into user keys +class KeyConvertingIterator: public Iterator { + public: + explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } + virtual ~KeyConvertingIterator() { delete iter_; } + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& target) { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + 
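+    // kMaxSequenceNumber makes this internal key sort immediately before
+    // every real entry for "target" (internal keys order by user key
+    // ascending, then by sequence number descending), so the underlying
+    // Seek lands on the newest entry whose user key is >= target.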
std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + + virtual Slice key() const { + assert(Valid()); + ParsedInternalKey key; + if (!ParseInternalKey(iter_->key(), &key)) { + status_ = Status::Corruption("malformed internal key"); + return Slice("corrupted key"); + } + return key.user_key; + } + + virtual Slice value() const { return iter_->value(); } + virtual Status status() const { + return status_.ok() ? iter_->status() : status_; + } + + private: + mutable Status status_; + Iterator* iter_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +class MemTableConstructor: public Constructor { + public: + explicit MemTableConstructor(const Comparator* cmp) + : Constructor(cmp), + internal_comparator_(cmp) { + memtable_ = new MemTable(internal_comparator_); + } + ~MemTableConstructor() { + delete memtable_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete memtable_; + memtable_ = new MemTable(internal_comparator_); + int seq = 1; + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + memtable_->Add(seq, kTypeValue, it->first, it->second); + seq++; + } + return Status::OK(); + } + virtual size_t NumBytes() const { + return memtable_->ApproximateMemoryUsage(); + } + + virtual Iterator* NewIterator() const { + return new KeyConvertingIterator(memtable_->NewIterator()); + } + + private: + InternalKeyComparator internal_comparator_; + MemTable* memtable_; +}; + +class DBConstructor: public Constructor { + public: + explicit DBConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp) { + db_ = NULL; + NewDB(); + } + ~DBConstructor() { + delete db_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete db_; + db_ = NULL; + NewDB(); + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + WriteBatch batch; + batch.Put(it->first, it->second); + ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); + } + return Status::OK(); + } + virtual size_t NumBytes() const { + Range r("", "\xff\xff"); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + virtual Iterator* NewIterator() const { + return db_->NewIterator(ReadOptions()); + } + + virtual DB* db() const { return db_; } + + private: + void NewDB() { + std::string name = test::TmpDir() + "/table_testdb"; + + Options options; + options.comparator = comparator_; + Status status = DestroyDB(name, options); + ASSERT_TRUE(status.ok()) << status.ToString(); + + options.create_if_missing = true; + options.error_if_exists = true; + options.write_buffer_size = 10000; // Something small to force merging + status = DB::Open(options, name, &db_); + ASSERT_TRUE(status.ok()) << status.ToString(); + } + + const Comparator* comparator_; + DB* db_; +}; + +enum TestType { + TABLE_TEST, + BLOCK_TEST, + MEMTABLE_TEST, + DB_TEST, +}; + +struct TestArgs { + TestType type; + bool reverse_compare; + int restart_interval; +}; + +static const TestArgs kTestArgList[] = { + { TABLE_TEST, false, 16 }, + { TABLE_TEST, false, 1 }, + { TABLE_TEST, false, 1024 }, + { TABLE_TEST, true, 16 }, + { TABLE_TEST, true, 1 }, + { TABLE_TEST, true, 1024 }, + + { BLOCK_TEST, false, 16 }, + { BLOCK_TEST, false, 1 }, + { BLOCK_TEST, 
false, 1024 }, + { BLOCK_TEST, true, 16 }, + { BLOCK_TEST, true, 1 }, + { BLOCK_TEST, true, 1024 }, + + // Restart interval does not matter for memtables + { MEMTABLE_TEST, false, 16 }, + { MEMTABLE_TEST, true, 16 }, + + // Do not bother with restart interval variations for DB + { DB_TEST, false, 16 }, + { DB_TEST, true, 16 }, +}; +static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]); + +class Harness { + public: + Harness() : constructor_(NULL) { } + + void Init(const TestArgs& args) { + delete constructor_; + constructor_ = NULL; + options_ = Options(); + + options_.block_restart_interval = args.restart_interval; + // Use shorter block size for tests to exercise block boundary + // conditions more. + options_.block_size = 256; + if (args.reverse_compare) { + options_.comparator = &reverse_key_comparator; + } + switch (args.type) { + case TABLE_TEST: + constructor_ = new TableConstructor(options_.comparator); + break; + case BLOCK_TEST: + constructor_ = new BlockConstructor(options_.comparator); + break; + case MEMTABLE_TEST: + constructor_ = new MemTableConstructor(options_.comparator); + break; + case DB_TEST: + constructor_ = new DBConstructor(options_.comparator); + break; + } + } + + ~Harness() { + delete constructor_; + } + + void Add(const std::string& key, const std::string& value) { + constructor_->Add(key, value); + } + + void Test(Random* rnd) { + std::vector keys; + KVMap data; + constructor_->Finish(options_, &keys, &data); + + TestForwardScan(keys, data); + TestBackwardScan(keys, data); + TestRandomAccess(rnd, keys, data); + } + + void TestForwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToFirst(); + for (KVMap::const_iterator model_iter = data.begin(); + model_iter != data.end(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Next(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestBackwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToLast(); + for (KVMap::const_reverse_iterator model_iter = data.rbegin(); + model_iter != data.rend(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Prev(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestRandomAccess(Random* rnd, + const std::vector& keys, + const KVMap& data) { + static const bool kVerbose = false; + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + KVMap::const_iterator model_iter = data.begin(); + if (kVerbose) fprintf(stderr, "---\n"); + for (int i = 0; i < 200; i++) { + const int toss = rnd->Uniform(5); + switch (toss) { + case 0: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Next\n"); + iter->Next(); + ++model_iter; + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 1: { + if (kVerbose) fprintf(stderr, "SeekToFirst\n"); + iter->SeekToFirst(); + model_iter = data.begin(); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 2: { + std::string key = PickRandomKey(rnd, keys); + model_iter = data.lower_bound(key); + if (kVerbose) fprintf(stderr, "Seek '%s'\n", + EscapeString(key).c_str()); + iter->Seek(Slice(key)); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 3: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Prev\n"); + iter->Prev(); + if (model_iter 
== data.begin()) { + model_iter = data.end(); // Wrap around to invalid value + } else { + --model_iter; + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 4: { + if (kVerbose) fprintf(stderr, "SeekToLast\n"); + iter->SeekToLast(); + if (keys.empty()) { + model_iter = data.end(); + } else { + std::string last = data.rbegin()->first; + model_iter = data.lower_bound(last); + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + } + } + delete iter; + } + + std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { + if (it == data.end()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const KVMap& data, + const KVMap::const_reverse_iterator& it) { + if (it == data.rend()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const Iterator* it) { + if (!it->Valid()) { + return "END"; + } else { + return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; + } + } + + std::string PickRandomKey(Random* rnd, const std::vector& keys) { + if (keys.empty()) { + return "foo"; + } else { + const int index = rnd->Uniform(keys.size()); + std::string result = keys[index]; + switch (rnd->Uniform(3)) { + case 0: + // Return an existing key + break; + case 1: { + // Attempt to return something smaller than an existing key + if (result.size() > 0 && result[result.size()-1] > '\0') { + result[result.size()-1]--; + } + break; + } + case 2: { + // Return something larger than an existing key + Increment(options_.comparator, &result); + break; + } + } + return result; + } + } + + // Returns NULL if not running against a DB + DB* db() const { return constructor_->db(); } + + private: + Options options_; + Constructor* constructor_; +}; + +// Test the empty key +TEST(Harness, SimpleEmptyKey) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSingle) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleMulti) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSpecialKey) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); + } +} + +TEST(Harness, Randomized) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ? 
1 : 200)) { + if ((num_entries % 10) == 0) { + fprintf(stderr, "case %d of %d: num_entries = %d\n", + (i + 1), int(kNumTestArgs), num_entries); + } + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + } + } +} + +TEST(Harness, RandomizedLongDB) { + Random rnd(test::RandomSeed()); + TestArgs args = { DB_TEST, false, 16 }; + Init(args); + int num_entries = 100000; + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + + // We must have created enough data to force merging + std::string l0_files, l1_files; + ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level0", &l0_files)); + ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level1", &l1_files)); + ASSERT_GT(atoi(l0_files.c_str()) + atoi(l1_files.c_str()), 0); + +} + +class MemTableTest { }; + +TEST(MemTableTest, Simple) { + InternalKeyComparator cmp(BytewiseComparator()); + MemTable memtable(cmp); + WriteBatch batch; + WriteBatchInternal::SetSequence(&batch, 100); + batch.Put(std::string("k1"), std::string("v1")); + batch.Put(std::string("k2"), std::string("v2")); + batch.Put(std::string("k3"), std::string("v3")); + batch.Put(std::string("largekey"), std::string("vlarge")); + ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok()); + + Iterator* iter = memtable.NewIterator(); + iter->SeekToFirst(); + while (iter->Valid()) { + fprintf(stderr, "key: '%s' -> '%s'\n", + iter->key().ToString().c_str(), + iter->value().ToString().c_str()); + iter->Next(); + } + + delete iter; +} + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +class TableTest { }; + +TEST(TableTest, ApproximateOffsetOfPlain) { + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + Options options; + options.block_size = 1024; + options.compression = kNoCompression; + c.Finish(options, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000)); + +} + +static bool SnappyCompressionSupported() { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(in.data(), in.size(), &out); +} + +TEST(TableTest, ApproximateOffsetOfCompressed) { + if (!SnappyCompressionSupported()) { + 
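+    // port::Snappy_Compress() returns false when the port layer was
+    // built without Snappy, in which case the compressed-size
+    // expectations below would not hold.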
fprintf(stderr, "skipping compression tests\n"); + return; + } + + Random rnd(301); + TableConstructor c(BytewiseComparator()); + std::string tmp; + c.Add("k01", "hello"); + c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + c.Add("k03", "hello3"); + c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + std::vector keys; + KVMap kvmap; + Options options; + options.block_size = 1024; + options.compression = kSnappyCompression; + c.Finish(options, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/table/two_level_iterator.cc b/leveldb/table/two_level_iterator.cc new file mode 100644 index 0000000..24a1241 --- /dev/null +++ b/leveldb/table/two_level_iterator.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/two_level_iterator.h" + +#include "leveldb/table.h" +#include "table/block.h" +#include "table/format.h" +#include "table/iterator_wrapper.h" + +namespace leveldb { + +namespace { + +typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&); + +class TwoLevelIterator: public Iterator { + public: + TwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options); + + virtual ~TwoLevelIterator(); + + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + virtual void Next(); + virtual void Prev(); + + virtual bool Valid() const { + return data_iter_.Valid(); + } + virtual Slice key() const { + assert(Valid()); + return data_iter_.key(); + } + virtual Slice value() const { + assert(Valid()); + return data_iter_.value(); + } + virtual Status status() const { + // It'd be nice if status() returned a const Status& instead of a Status + if (!index_iter_.status().ok()) { + return index_iter_.status(); + } else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) { + return data_iter_.status(); + } else { + return status_; + } + } + + private: + void SaveError(const Status& s) { + if (status_.ok() && !s.ok()) status_ = s; + } + void SkipEmptyDataBlocksForward(); + void SkipEmptyDataBlocksBackward(); + void SetDataIterator(Iterator* data_iter); + void InitDataBlock(); + + BlockFunction block_function_; + void* arg_; + const ReadOptions options_; + Status status_; + IteratorWrapper index_iter_; + IteratorWrapper data_iter_; // May be NULL + // If data_iter_ is non-NULL, then "data_block_handle_" holds the + // "index_value" passed to block_function_ to create the data_iter_. 
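+  // (InitDataBlock() compares the raw handle bytes against this saved
+  // copy so that repositioning within one block does not re-read it.)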
+ std::string data_block_handle_; +}; + +TwoLevelIterator::TwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options) + : block_function_(block_function), + arg_(arg), + options_(options), + index_iter_(index_iter), + data_iter_(NULL) { +} + +TwoLevelIterator::~TwoLevelIterator() { +} + +void TwoLevelIterator::Seek(const Slice& target) { + index_iter_.Seek(target); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.Seek(target); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToFirst() { + index_iter_.SeekToFirst(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToLast() { + index_iter_.SeekToLast(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); + SkipEmptyDataBlocksBackward(); +} + +void TwoLevelIterator::Next() { + assert(Valid()); + data_iter_.Next(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::Prev() { + assert(Valid()); + data_iter_.Prev(); + SkipEmptyDataBlocksBackward(); +} + + +void TwoLevelIterator::SkipEmptyDataBlocksForward() { + while (data_iter_.iter() == NULL || !data_iter_.Valid()) { + // Move to next block + if (!index_iter_.Valid()) { + SetDataIterator(NULL); + return; + } + index_iter_.Next(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); + } +} + +void TwoLevelIterator::SkipEmptyDataBlocksBackward() { + while (data_iter_.iter() == NULL || !data_iter_.Valid()) { + // Move to next block + if (!index_iter_.Valid()) { + SetDataIterator(NULL); + return; + } + index_iter_.Prev(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); + } +} + +void TwoLevelIterator::SetDataIterator(Iterator* data_iter) { + if (data_iter_.iter() != NULL) SaveError(data_iter_.status()); + data_iter_.Set(data_iter); +} + +void TwoLevelIterator::InitDataBlock() { + if (!index_iter_.Valid()) { + SetDataIterator(NULL); + } else { + Slice handle = index_iter_.value(); + if (data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0) { + // data_iter_ is already constructed with this iterator, so + // no need to change anything + } else { + Iterator* iter = (*block_function_)(arg_, options_, handle); + data_block_handle_.assign(handle.data(), handle.size()); + SetDataIterator(iter); + } + } +} + +} + +Iterator* NewTwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options) { + return new TwoLevelIterator(index_iter, block_function, arg, options); +} + +} diff --git a/leveldb/table/two_level_iterator.h b/leveldb/table/two_level_iterator.h new file mode 100644 index 0000000..5909e2b --- /dev/null +++ b/leveldb/table/two_level_iterator.h @@ -0,0 +1,34 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ +#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ + +#include "leveldb/iterator.h" + +namespace leveldb { + +struct ReadOptions; + +// Return a new two level iterator. A two-level iterator contains an +// index iterator whose values point to a sequence of blocks where +// each block is itself a sequence of key,value pairs. The returned +// two-level iterator yields the concatenation of all key/value pairs +// in the sequence of blocks. 
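+// (For a Table, the index iterator walks the index block, and the
+// supplied block function, Table::BlockReader, turns each index value,
+// an encoded BlockHandle, into an iterator over that data block.)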
Takes ownership of "index_iter" and +// will delete it when no longer needed. +// +// Uses a supplied function to convert an index_iter value into +// an iterator over the contents of the corresponding block. +extern Iterator* NewTwoLevelIterator( + Iterator* index_iter, + Iterator* (*block_function)( + void* arg, + const ReadOptions& options, + const Slice& index_value), + void* arg, + const ReadOptions& options); + +} + +#endif // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ diff --git a/leveldb/util/arena.cc b/leveldb/util/arena.cc new file mode 100644 index 0000000..40ab99d --- /dev/null +++ b/leveldb/util/arena.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" +#include + +namespace leveldb { + +static const int kBlockSize = 4096; + +Arena::Arena() { + blocks_memory_ = 0; + alloc_ptr_ = NULL; // First allocation will allocate a block + alloc_bytes_remaining_ = 0; +} + +Arena::~Arena() { + for (size_t i = 0; i < blocks_.size(); i++) { + delete[] blocks_[i]; + } +} + +char* Arena::AllocateFallback(size_t bytes) { + if (bytes > kBlockSize / 4) { + // Object is more than a quarter of our block size. Allocate it separately + // to avoid wasting too much space in leftover bytes. + char* result = AllocateNewBlock(bytes); + return result; + } + + // We waste the remaining space in the current block. + alloc_ptr_ = AllocateNewBlock(kBlockSize); + alloc_bytes_remaining_ = kBlockSize; + + char* result = alloc_ptr_; + alloc_ptr_ += bytes; + alloc_bytes_remaining_ -= bytes; + return result; +} + +char* Arena::AllocateAligned(size_t bytes) { + const int align = sizeof(void*); // We'll align to pointer size + assert((align & (align-1)) == 0); // Pointer size should be a power of 2 + size_t current_mod = reinterpret_cast(alloc_ptr_) & (align-1); + size_t slop = (current_mod == 0 ? 0 : align - current_mod); + size_t needed = bytes + slop; + char* result; + if (needed <= alloc_bytes_remaining_) { + result = alloc_ptr_ + slop; + alloc_ptr_ += needed; + alloc_bytes_remaining_ -= needed; + } else { + // AllocateFallback always returned aligned memory + result = AllocateFallback(bytes); + } + assert((reinterpret_cast(result) & (align-1)) == 0); + return result; +} + +char* Arena::AllocateNewBlock(size_t block_bytes) { + char* result = new char[block_bytes]; + blocks_memory_ += block_bytes; + blocks_.push_back(result); + return result; +} + +} diff --git a/leveldb/util/arena.h b/leveldb/util/arena.h new file mode 100644 index 0000000..fcb5d5b --- /dev/null +++ b/leveldb/util/arena.h @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ +#define STORAGE_LEVELDB_UTIL_ARENA_H_ + +#include +#include +#include +#include + +namespace leveldb { + +class Arena { + public: + Arena(); + ~Arena(); + + // Return a pointer to a newly allocated memory block of "bytes" bytes. + char* Allocate(size_t bytes); + + // Allocate memory with the normal alignment guarantees provided by malloc + char* AllocateAligned(size_t bytes); + + // Returns an estimate of the total memory usage of data allocated + // by the arena (including space allocated but not yet used for user + // allocations). 
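+  // (An estimate only: it counts whole allocated blocks, including the
+  // unused tail of the current block, plus the block-pointer vector.)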
+ size_t MemoryUsage() const { + return blocks_memory_ + blocks_.capacity() * sizeof(char*); + } + + private: + char* AllocateFallback(size_t bytes); + char* AllocateNewBlock(size_t block_bytes); + + // Allocation state + char* alloc_ptr_; + size_t alloc_bytes_remaining_; + + // Array of new[] allocated memory blocks + std::vector blocks_; + + // Bytes of memory in blocks allocated so far + size_t blocks_memory_; + + // No copying allowed + Arena(const Arena&); + void operator=(const Arena&); +}; + +inline char* Arena::Allocate(size_t bytes) { + // The semantics of what to return are a bit messy if we allow + // 0-byte allocations, so we disallow them here (we don't need + // them for our internal use). + assert(bytes > 0); + if (bytes <= alloc_bytes_remaining_) { + char* result = alloc_ptr_; + alloc_ptr_ += bytes; + alloc_bytes_remaining_ -= bytes; + return result; + } + return AllocateFallback(bytes); +} + +} + +#endif // STORAGE_LEVELDB_UTIL_ARENA_H_ diff --git a/leveldb/util/arena_test.cc b/leveldb/util/arena_test.cc new file mode 100644 index 0000000..c33b552 --- /dev/null +++ b/leveldb/util/arena_test.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" + +#include "util/random.h" +#include "util/testharness.h" + +namespace leveldb { + +class ArenaTest { }; + +TEST(ArenaTest, Empty) { + Arena arena; +} + +TEST(ArenaTest, Simple) { + std::vector > allocated; + Arena arena; + const int N = 100000; + size_t bytes = 0; + Random rnd(301); + for (int i = 0; i < N; i++) { + size_t s; + if (i % (N / 10) == 0) { + s = i; + } else { + s = rnd.OneIn(4000) ? rnd.Uniform(6000) : + (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + } + if (s == 0) { + // Our arena disallows size 0 allocations. + s = 1; + } + char* r; + if (rnd.OneIn(10)) { + r = arena.AllocateAligned(s); + } else { + r = arena.Allocate(s); + } + + for (int b = 0; b < s; b++) { + // Fill the "i"th allocation with a known bit pattern + r[b] = i % 256; + } + bytes += s; + allocated.push_back(std::make_pair(s, r)); + ASSERT_GE(arena.MemoryUsage(), bytes); + if (i > N/10) { + ASSERT_LE(arena.MemoryUsage(), bytes * 1.10); + } + } + for (int i = 0; i < allocated.size(); i++) { + size_t num_bytes = allocated[i].first; + const char* p = allocated[i].second; + for (int b = 0; b < num_bytes; b++) { + // Check the "i"th allocation for the known bit pattern + ASSERT_EQ(int(p[b]) & 0xff, i % 256); + } + } +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/util/cache.cc b/leveldb/util/cache.cc new file mode 100644 index 0000000..d8a4426 --- /dev/null +++ b/leveldb/util/cache.cc @@ -0,0 +1,253 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) +#include +#elif defined(LEVELDB_PLATFORM_CHROMIUM) +#include "base/hash_tables.h" +#else +#include // TODO(sanjay): Switch to unordered_set when possible. 
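+// (POSIX and Android builds use the C++0x std::unordered_set; this last
+// branch falls back to the pre-standard __gnu_cxx::hash_set named in
+// the HandleTable typedefs below.)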
+#endif
+
+#include
+
+#include "leveldb/cache.h"
+#include "port/port.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+Cache::~Cache() {
+}
+
+namespace {
+
+// LRU cache implementation
+
+// An entry is a variable length heap-allocated structure.  Entries
+// are kept in a circular doubly linked list ordered by access time.
+struct LRUHandle {
+  void* value;
+  void (*deleter)(const Slice&, void* value);
+  LRUHandle* next;
+  LRUHandle* prev;
+  size_t charge;      // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  size_t refs;        // TODO(opt): Pack with "key_length"?
+  char key_data[1];   // Beginning of key
+
+  Slice key() const {
+    // For cheaper lookups, we allow a temporary Handle object
+    // to store a pointer to a key in "value".
+    if (next == this) {
+      return *(reinterpret_cast<Slice*>(value));
+    } else {
+      return Slice(key_data, key_length);
+    }
+  }
+};
+
+// Pick a platform specific hash_set instantiation
+#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN)
+  // Microsoft's hash_set deviates from the standard.  See
+  // http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx
+  // for details.  Basically the 2 param () operator is a less than and
+  // the 1 param () operator is a hash function.
+  struct HandleHashCompare : public stdext::hash_compare<LRUHandle*> {
+    size_t operator() (LRUHandle* h) const {
+      Slice k = h->key();
+      return Hash(k.data(), k.size(), 0);
+    }
+    bool operator() (LRUHandle* a, LRUHandle* b) const {
+      return a->key().compare(b->key()) < 0;
+    }
+  };
+  typedef base::hash_set<LRUHandle*, HandleHashCompare> HandleTable;
+#else
+  struct HandleHash {
+    inline size_t operator()(LRUHandle* h) const {
+      Slice k = h->key();
+      return Hash(k.data(), k.size(), 0);
+    }
+  };
+
+  struct HandleEq {
+    inline bool operator()(LRUHandle* a, LRUHandle* b) const {
+      return a->key() == b->key();
+    }
+  };
+# if defined(LEVELDB_PLATFORM_CHROMIUM)
+  typedef base::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID)
+  typedef std::unordered_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# else
+  typedef __gnu_cxx::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# endif
+#endif
+
+class LRUCache : public Cache {
+ public:
+  explicit LRUCache(size_t capacity);
+  virtual ~LRUCache();
+
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value));
+  virtual Handle* Lookup(const Slice& key);
+  virtual void Release(Handle* handle);
+  virtual void* Value(Handle* handle);
+  virtual void Erase(const Slice& key);
+  virtual uint64_t NewId();
+
+ private:
+  void LRU_Remove(LRUHandle* e);
+  void LRU_Append(LRUHandle* e);
+  void Unref(LRUHandle* e);
+
+  // Constructor parameters
+  const size_t capacity_;
+
+  // mutex_ protects the following state.
+  port::Mutex mutex_;
+  size_t usage_;
+  uint64_t last_id_;
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
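+  // (A dummy head keeps LRU_Remove()/LRU_Append() branch-free: every
+  // live node always has non-NULL prev/next neighbors.)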
+  LRUHandle lru_;
+
+  HandleTable table_;
+};
+
+LRUCache::LRUCache(size_t capacity)
+    : capacity_(capacity),
+      usage_(0),
+      last_id_(0) {
+  // Make empty circular linked list
+  lru_.next = &lru_;
+  lru_.prev = &lru_;
+}
+
+LRUCache::~LRUCache() {
+  table_.clear();
+  for (LRUHandle* e = lru_.next; e != &lru_; ) {
+    LRUHandle* next = e->next;
+    assert(e->refs == 1);  // Error if caller has an unreleased handle
+    Unref(e);
+    e = next;
+  }
+}
+
+void LRUCache::Unref(LRUHandle* e) {
+  assert(e->refs > 0);
+  e->refs--;
+  if (e->refs <= 0) {
+    usage_ -= e->charge;
+    (*e->deleter)(e->key(), e->value);
+    free(e);
+  }
+}
+
+void LRUCache::LRU_Remove(LRUHandle* e) {
+  e->next->prev = e->prev;
+  e->prev->next = e->next;
+}
+
+void LRUCache::LRU_Append(LRUHandle* e) {
+  // Make "e" newest entry by inserting just before lru_
+  e->next = &lru_;
+  e->prev = lru_.prev;
+  e->prev->next = e;
+  e->next->prev = e;
+}
+
+Cache::Handle* LRUCache::Lookup(const Slice& key) {
+  MutexLock l(&mutex_);
+
+  LRUHandle dummy;
+  dummy.next = &dummy;
+  dummy.value = const_cast<Slice*>(&key);
+  HandleTable::iterator iter = table_.find(&dummy);
+  if (iter == table_.end()) {
+    return NULL;
+  } else {
+    LRUHandle* e = const_cast<LRUHandle*>(*iter);
+    e->refs++;
+    LRU_Remove(e);
+    LRU_Append(e);
+    return reinterpret_cast<Cache::Handle*>(e);
+  }
+}
+
+void* LRUCache::Value(Handle* handle) {
+  return reinterpret_cast<LRUHandle*>(handle)->value;
+}
+
+void LRUCache::Release(Handle* handle) {
+  MutexLock l(&mutex_);
+  Unref(reinterpret_cast<LRUHandle*>(handle));
+}
+
+Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
+                                void (*deleter)(const Slice& key, void* value)) {
+  MutexLock l(&mutex_);
+
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(
+      malloc(sizeof(LRUHandle)-1 + key.size()));
+  e->value = value;
+  e->deleter = deleter;
+  e->charge = charge;
+  e->key_length = key.size();
+  e->refs = 2;  // One from LRUCache, one for the returned handle
+  memcpy(e->key_data, key.data(), key.size());
+  LRU_Append(e);
+  usage_ += charge;
+
+  std::pair<HandleTable::iterator,bool> p = table_.insert(e);
+  if (!p.second) {
+    // Kill existing entry
+    LRUHandle* old = const_cast<LRUHandle*>(*(p.first));
+    LRU_Remove(old);
+    table_.erase(p.first);
+    table_.insert(e);
+    Unref(old);
+  }
+
+  while (usage_ > capacity_ && lru_.next != &lru_) {
+    LRUHandle* old = lru_.next;
+    LRU_Remove(old);
+    table_.erase(old);
+    Unref(old);
+  }
+
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+void LRUCache::Erase(const Slice& key) {
+  MutexLock l(&mutex_);
+
+  LRUHandle dummy;
+  dummy.next = &dummy;
+  dummy.value = const_cast<Slice*>(&key);
+  HandleTable::iterator iter = table_.find(&dummy);
+  if (iter != table_.end()) {
+    LRUHandle* e = const_cast<LRUHandle*>(*iter);
+    LRU_Remove(e);
+    table_.erase(iter);
+    Unref(e);
+  }
+}
+
+uint64_t LRUCache::NewId() {
+  MutexLock l(&mutex_);
+  return ++(last_id_);
+}
+
+}  // end anonymous namespace
+
+Cache* NewLRUCache(size_t capacity) {
+  return new LRUCache(capacity);
+}
+
+}
diff --git a/leveldb/util/cache_test.cc b/leveldb/util/cache_test.cc
new file mode 100644
index 0000000..dbab988
--- /dev/null
+++ b/leveldb/util/cache_test.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/cache.h"
+
+#include <vector>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Conversions between numeric keys/values and the types expected by Cache.
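+// For example, EncodeKey(1) is the 4-byte little-endian string
+// "\x01\x00\x00\x00", and values travel through the cache's void* slot
+// as small integers rather than heap objects.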
+static std::string EncodeKey(int k) { + std::string result; + PutFixed32(&result, k); + return result; +} +static int DecodeKey(const Slice& k) { + assert(k.size() == 4); + return DecodeFixed32(k.data()); +} +static void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } +static int DecodeValue(void* v) { return reinterpret_cast(v); } + +class CacheTest { + public: + static CacheTest* current_; + + static void Deleter(const Slice& key, void* v) { + current_->deleted_keys_.push_back(DecodeKey(key)); + current_->deleted_values_.push_back(DecodeValue(v)); + } + + static const int kCacheSize = 100; + std::vector deleted_keys_; + std::vector deleted_values_; + Cache* cache_; + + CacheTest() : cache_(NewLRUCache(kCacheSize)) { + current_ = this; + } + + ~CacheTest() { + delete cache_; + } + + int Lookup(int key) { + Cache::Handle* handle = cache_->Lookup(EncodeKey(key)); + const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle)); + if (handle != NULL) { + cache_->Release(handle); + } + return r; + } + + void Insert(int key, int value, int charge = 1) { + cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter)); + } + + void Erase(int key) { + cache_->Erase(EncodeKey(key)); + } +}; +CacheTest* CacheTest::current_; + +TEST(CacheTest, HitAndMiss) { + ASSERT_EQ(-1, Lookup(100)); + + Insert(100, 101); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(200, 201); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(100, 102); + ASSERT_EQ(102, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + +TEST(CacheTest, Erase) { + Erase(200); + ASSERT_EQ(0, deleted_keys_.size()); + + Insert(100, 101); + Insert(200, 201); + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1, deleted_keys_.size()); +} + +TEST(CacheTest, EntriesArePinned) { + Insert(100, 101); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + + Insert(100, 102); + Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); + ASSERT_EQ(0, deleted_keys_.size()); + + cache_->Release(h1); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(1, deleted_keys_.size()); + + cache_->Release(h2); + ASSERT_EQ(2, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[1]); + ASSERT_EQ(102, deleted_values_[1]); +} + +TEST(CacheTest, EvictionPolicy) { + Insert(100, 101); + Insert(200, 201); + + // Frequently used entry must be kept around + for (int i = 0; i < kCacheSize; i++) { + Insert(1000+i, 2000+i); + ASSERT_EQ(2000+i, Lookup(1000+i)); + ASSERT_EQ(101, Lookup(100)); + } + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(2, deleted_keys_.size()); + ASSERT_EQ(200, deleted_keys_[0]); + ASSERT_EQ(201, deleted_values_[0]); +} + +TEST(CacheTest, HeavyEntry) { + Insert(100, 101); + Insert(200, 201, kCacheSize); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + 
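+// (HeavyEntry passes because Insert() appends the new entry before
+// evicting from lru_.next, the oldest end, until usage_ <= capacity_;
+// the over-sized newcomer is the newest entry and therefore survives.)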
+TEST(CacheTest, NewId) { + uint64_t a = cache_->NewId(); + uint64_t b = cache_->NewId(); + ASSERT_NE(a, b); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/util/coding.cc b/leveldb/util/coding.cc new file mode 100644 index 0000000..14f21f7 --- /dev/null +++ b/leveldb/util/coding.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/coding.h" + +namespace leveldb { + +void EncodeFixed32(char* buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +void EncodeFixed64(char* buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + +void PutFixed32(std::string* dst, uint32_t value) { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +void PutFixed64(std::string* dst, uint64_t value) { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1<<7)) { + *(ptr++) = v; + } else if (v < (1<<14)) { + *(ptr++) = v | B; + *(ptr++) = v>>7; + } else if (v < (1<<21)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = v>>14; + } else if (v < (1<<28)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = v>>21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = (v>>21) | B; + *(ptr++) = v>>28; + } + return reinterpret_cast(ptr); +} + +void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +char* EncodeVarint64(char* dst, uint64_t v) { + static const int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B-1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +void PutVarint64(std::string* dst, uint64_t v) { + char buf[10]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, value.size()); + dst->append(value.data(), value.size()); +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return NULL; +} + +bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + 
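+  // Parse from [p, limit); on success, advance *input past the varint,
+  // and on failure leave *input untouched.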
const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == NULL) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return NULL; +} + +bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == NULL) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const char* GetLengthPrefixedSlice(const char* p, const char* limit, + Slice* result) { + uint32_t len; + p = GetVarint32Ptr(p, limit, &len); + if (p == NULL) return NULL; + if (p + len > limit) return NULL; + *result = Slice(p, len); + return p + len; +} + +bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len; + if (GetVarint32(input, &len) && + input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +} diff --git a/leveldb/util/coding.h b/leveldb/util/coding.h new file mode 100644 index 0000000..8755968 --- /dev/null +++ b/leveldb/util/coding.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#ifndef STORAGE_LEVELDB_UTIL_CODING_H_ +#define STORAGE_LEVELDB_UTIL_CODING_H_ + +#include +#include +#include +#include "leveldb/slice.h" +#include "port/port.h" + +namespace leveldb { + +// Standard Put... routines append to a string +extern void PutFixed32(std::string* dst, uint32_t value); +extern void PutFixed64(std::string* dst, uint64_t value); +extern void PutVarint32(std::string* dst, uint32_t value); +extern void PutVarint64(std::string* dst, uint64_t value); +extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. +extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// NULL on error. These routines only look at bytes in the range +// [p..limit-1] +extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); +extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... 
that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +extern void EncodeFixed32(char* dst, uint32_t value); +extern void EncodeFixed64(char* dst, uint64_t value); + +// Lower-level versions of Put... that write directly into a character buffer +// and return a pointer just past the last byte written. +// REQUIRES: dst has enough space for the value being written +extern char* EncodeVarint32(char* dst, uint32_t value); +extern char* EncodeVarint64(char* dst, uint64_t value); + +// Lower-level versions of Get... that read directly from a character buffer +// without any bounds checking. + +inline uint32_t DecodeFixed32(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint32_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast(ptr[0])) + | (static_cast(ptr[1]) << 8) + | (static_cast(ptr[2]) << 16) + | (static_cast(ptr[3]) << 24)); + } +} + +inline uint64_t DecodeFixed64(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint64_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + uint64_t lo = DecodeFixed32(ptr); + uint64_t hi = DecodeFixed32(ptr + 4); + return (hi << 32) | lo; + } +} + +// Internal routine for use by fallback path of GetVarint32Ptr +extern const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value); +inline const char* GetVarint32Ptr(const char* p, + const char* limit, + uint32_t* value) { + if (p < limit) { + uint32_t result = *(reinterpret_cast(p)); + if ((result & 128) == 0) { + *value = result; + return p + 1; + } + } + return GetVarint32PtrFallback(p, limit, value); +} + +} + +#endif // STORAGE_LEVELDB_UTIL_CODING_H_ diff --git a/leveldb/util/coding_test.cc b/leveldb/util/coding_test.cc new file mode 100644 index 0000000..a8dba04 --- /dev/null +++ b/leveldb/util/coding_test.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
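+//
+// A worked example of the varint format exercised by the tests below:
+// 300 = 0b100101100 is emitted low-order 7-bit group first with the
+// high bit as a continuation flag, giving bytes 0xAC 0x02
+// (VarintLength(300) == 2).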
+ +#include "util/coding.h" + +#include "util/testharness.h" + +namespace leveldb { + +class Coding { }; + +TEST(Coding, Fixed32) { + std::string s; + for (uint32_t v = 0; v < 100000; v++) { + PutFixed32(&s, v); + } + + const char* p = s.data(); + for (uint32_t v = 0; v < 100000; v++) { + uint32_t actual = DecodeFixed32(p); + ASSERT_EQ(v, actual); + p += sizeof(uint32_t); + } +} + +TEST(Coding, Fixed64) { + std::string s; + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + PutFixed64(&s, v - 1); + PutFixed64(&s, v + 0); + PutFixed64(&s, v + 1); + } + + const char* p = s.data(); + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + uint64_t actual; + actual = DecodeFixed64(p); + ASSERT_EQ(v-1, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+0, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+1, actual); + p += sizeof(uint64_t); + } +} + +TEST(Coding, Varint32) { + std::string s; + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t v = (i / 32) << (i % 32); + PutVarint32(&s, v); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t expected = (i / 32) << (i % 32); + uint32_t actual; + const char* start = p; + p = GetVarint32Ptr(p, limit, &actual); + ASSERT_TRUE(p != NULL); + ASSERT_EQ(expected, actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, s.data() + s.size()); +} + +TEST(Coding, Varint64) { + // Construct the list of values to check + std::vector values; + // Some special values + values.push_back(0); + values.push_back(100); + values.push_back(~static_cast(0)); + values.push_back(~static_cast(0) - 1); + for (uint32_t k = 0; k < 64; k++) { + // Test values near powers of two + const uint64_t power = 1ull << k; + values.push_back(power); + values.push_back(power-1); + values.push_back(power+1); + }; + + std::string s; + for (int i = 0; i < values.size(); i++) { + PutVarint64(&s, values[i]); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (int i = 0; i < values.size(); i++) { + ASSERT_TRUE(p < limit); + uint64_t actual; + const char* start = p; + p = GetVarint64Ptr(p, limit, &actual); + ASSERT_TRUE(p != NULL); + ASSERT_EQ(values[i], actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, limit); + +} + +TEST(Coding, Varint32Overflow) { + uint32_t result; + std::string input("\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) + == NULL); +} + +TEST(Coding, Varint32Truncation) { + uint32_t large_value = (1u << 31) + 100; + std::string s; + PutVarint32(&s, large_value); + uint32_t result; + for (int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL); + } + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Varint64Overflow) { + uint64_t result; + std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result) + == NULL); +} + +TEST(Coding, Varint64Truncation) { + uint64_t large_value = (1ull << 63) + 100ull; + std::string s; + PutVarint64(&s, large_value); + uint64_t result; + for (int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL); + } + ASSERT_TRUE(GetVarint64Ptr(s.data(), 
s.data() + s.size(), &result) != NULL); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Strings) { + std::string s; + PutLengthPrefixedSlice(&s, Slice("")); + PutLengthPrefixedSlice(&s, Slice("foo")); + PutLengthPrefixedSlice(&s, Slice("bar")); + PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); + + Slice input(s); + Slice v; + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("foo", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("bar", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ(std::string(200, 'x'), v.ToString()); + ASSERT_EQ("", input.ToString()); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/util/comparator.cc b/leveldb/util/comparator.cc new file mode 100644 index 0000000..cc2b263 --- /dev/null +++ b/leveldb/util/comparator.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "leveldb/comparator.h" +#include "leveldb/slice.h" +#include "util/logging.h" + +namespace leveldb { + +Comparator::~Comparator() { } + +namespace { +class BytewiseComparatorImpl : public Comparator { + public: + BytewiseComparatorImpl() { } + + virtual const char* Name() const { + return "leveldb.BytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return a.compare(b); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Do not shorten if one string is a prefix of the other + } else { + uint8_t diff_byte = static_cast((*start)[diff_index]); + if (diff_byte < static_cast(0xff) && + diff_byte + 1 < static_cast(limit[diff_index])) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + assert(Compare(*start, limit) < 0); + } + } + } + + virtual void FindShortSuccessor(std::string* key) const { + // Find first character that can be incremented + size_t n = key->size(); + for (size_t i = 0; i < n; i++) { + const uint8_t byte = (*key)[i]; + if (byte != static_cast(0xff)) { + (*key)[i] = byte + 1; + key->resize(i+1); + return; + } + } + // *key is a run of 0xffs. Leave it alone. + } +}; +} +static const BytewiseComparatorImpl bytewise; + +const Comparator* BytewiseComparator() { + return &bytewise; +} + +} diff --git a/leveldb/util/crc32c.cc b/leveldb/util/crc32c.cc new file mode 100644 index 0000000..28c2401 --- /dev/null +++ b/leveldb/util/crc32c.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. 
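// A sketch (not from the patch) of where the four tables below come from:
// table0_ is the standard byte-at-a-time table for the reflected CRC32C
// (Castagnoli) polynomial 0x82f63b78, and table1_..table3_ hold the same
// update pushed one, two, and three bytes further so the STEP4 macro later
// in this file can fold an aligned 32-bit word per iteration
// ("slicing-by-4"). table0_ itself could be generated like this:

    #include <stdint.h>

    void BuildTable0(uint32_t table0[256]) {
      for (uint32_t i = 0; i < 256; i++) {
        uint32_t crc = i;
        for (int k = 0; k < 8; k++) {
          // One bit of the reflected CRC32C polynomial division
          crc = (crc & 1) ? (crc >> 1) ^ 0x82f63b78u : (crc >> 1);
        }
        table0[i] = crc;
      }
    }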
+ +#include "util/crc32c.h" + +#include +#include "util/coding.h" + +namespace leveldb { +namespace crc32c { + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 
0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 
0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 
0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + return DecodeFixed32(reinterpret_cast(p)); +} + +uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + const uint8_t *p = reinterpret_cast(buf); + const uint8_t *e = p + size; + uint32_t l = crc ^ 0xffffffffu; + +#define STEP1 do { \ + int c = (l & 0xff) ^ *p++; \ + l = table0_[c] ^ (l >> 8); \ +} while (0) +#define STEP4 do { \ + uint32_t c = l ^ LE_LOAD32(p); \ + p += 4; \ + l = table3_[c & 0xff] ^ \ + table2_[(c >> 8) & 0xff] ^ \ + table1_[(c >> 16) & 0xff] ^ \ + table0_[c >> 24]; \ +} while (0) + + // Point x at first 4-byte aligned byte in string. This might be + // just past the end of the string. 
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t* x = reinterpret_cast<const uint8_t*>(((pval + 3) >> 2) << 2);
+  if (x <= e) {
+    // Process bytes until finished or p is 4-byte aligned
+    while (p != x) {
+      STEP1;
+    }
+  }
+  // Process bytes 16 at a time
+  while ((e-p) >= 16) {
+    STEP4; STEP4; STEP4; STEP4;
+  }
+  // Process bytes 4 at a time
+  while ((e-p) >= 4) {
+    STEP4;
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP4
+#undef STEP1
+  return l ^ 0xffffffffu;
+}
+
+}
+}
diff --git a/leveldb/util/crc32c.h b/leveldb/util/crc32c.h
new file mode 100644
index 0000000..938d8ff
--- /dev/null
+++ b/leveldb/util/crc32c.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_
+#define STORAGE_LEVELDB_UTIL_CRC32C_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace leveldb {
+namespace crc32c {
+
+// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
+// crc32c of some string A.  Extend() is often used to maintain the
+// crc32c of a stream of data.
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
+
+// Return the crc32c of data[0,n-1]
+inline uint32_t Value(const char* data, size_t n) {
+  return Extend(0, data, n);
+}
+
+static const uint32_t kMaskDelta = 0xa282ead8ul;
+
+// Return a masked representation of crc.
+//
+// Motivation: it is problematic to compute the CRC of a string that
+// contains embedded CRCs.  Therefore we recommend that CRCs stored
+// somewhere (e.g., in files) should be masked before being stored.
+inline uint32_t Mask(uint32_t crc) {
+  // Rotate right by 15 bits and add a constant.
+  return ((crc >> 15) | (crc << 17)) + kMaskDelta;
+}
+
+// Return the crc whose masked representation is masked_crc.
+inline uint32_t Unmask(uint32_t masked_crc) {
+  uint32_t rot = masked_crc - kMaskDelta;
+  return ((rot >> 17) | (rot << 15));
+}
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_CRC32C_H_
diff --git a/leveldb/util/crc32c_test.cc b/leveldb/util/crc32c_test.cc
new file mode 100644
index 0000000..ba9e804
--- /dev/null
+++ b/leveldb/util/crc32c_test.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/crc32c.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace crc32c {
+
+class CRC { };
+
+TEST(CRC, StandardResults) {
+  // From rfc3720 section B.4.
+  char buf[32];
+
+  memset(buf, 0, sizeof(buf));
+  ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf)));
+
+  memset(buf, 0xff, sizeof(buf));
+  ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = i;
+  }
+  ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = 31 - i;
+  }
+  ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf)));
+
+  unsigned char data[48] = {
+    0x01, 0xc0, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x04, 0x00,
+    0x00, 0x00, 0x00, 0x14,
+    0x00, 0x00, 0x00, 0x18,
+    0x28, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+  };
+  ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
+}
+
+TEST(CRC, Values) {
+  ASSERT_NE(Value("a", 1), Value("foo", 3));
+}
+
+TEST(CRC, Extend) {
+  ASSERT_EQ(Value("hello world", 11),
+            Extend(Value("hello ", 6), "world", 5));
+}
+
+TEST(CRC, Mask) {
+  uint32_t crc = Value("foo", 3);
+  ASSERT_NE(crc, Mask(crc));
+  ASSERT_NE(crc, Mask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
+}
+
+}
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/leveldb/util/env.cc b/leveldb/util/env.cc
new file mode 100644
index 0000000..e5297e7
--- /dev/null
+++ b/leveldb/util/env.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/env.h"
+
+namespace leveldb {
+
+Env::~Env() {
+}
+
+SequentialFile::~SequentialFile() {
+}
+
+RandomAccessFile::~RandomAccessFile() {
+}
+
+WritableFile::~WritableFile() {
+}
+
+FileLock::~FileLock() {
+}
+
+void Log(Env* env, WritableFile* info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  env->Logv(info_log, format, ap);
+  va_end(ap);
+}
+
+Status WriteStringToFile(Env* env, const Slice& data,
+                         const std::string& fname) {
+  WritableFile* file;
+  Status s = env->NewWritableFile(fname, &file);
+  if (!s.ok()) {
+    return s;
+  }
+  s = file->Append(data);
+  if (s.ok()) {
+    s = file->Close();
+  }
+  delete file;  // Will auto-close if we did not close above
+  if (!s.ok()) {
+    env->DeleteFile(fname);
+  }
+  return s;
+}
+
+Status ReadFileToString(Env* env, const std::string& fname, std::string* data) {
+  data->clear();
+  SequentialFile* file;
+  Status s = env->NewSequentialFile(fname, &file);
+  if (!s.ok()) {
+    return s;
+  }
+  static const int kBufferSize = 8192;
+  char* space = new char[kBufferSize];
+  while (true) {
+    Slice fragment;
+    s = file->Read(kBufferSize, &fragment, space);
+    if (!s.ok()) {
+      break;
+    }
+    data->append(fragment.data(), fragment.size());
+    if (fragment.empty()) {
+      break;
+    }
+  }
+  delete[] space;
+  delete file;
+  return s;
+}
+
+EnvWrapper::~EnvWrapper() {
+}
+
+}
diff --git a/leveldb/util/env_chromium.cc b/leveldb/util/env_chromium.cc
new file mode 100644
index 0000000..7edc7a9
--- /dev/null
+++ b/leveldb/util/env_chromium.cc
@@ -0,0 +1,603 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
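// A usage sketch (not part of the patch) for the two env.cc helpers above;
// the file name is illustrative. WriteStringToFile deletes a partially
// written file on failure, so the pair round-trips cleanly or not at all.

    #include <string>
    #include "leveldb/env.h"

    void RoundTripDemo() {
      leveldb::Env* env = leveldb::Env::Default();
      leveldb::Status s = leveldb::WriteStringToFile(
          env, leveldb::Slice("hello"), "/tmp/leveldb_demo_file");
      std::string contents;
      if (s.ok()) {
        s = leveldb::ReadFileToString(env, "/tmp/leveldb_demo_file", &contents);
      }
      // On success, contents == "hello".
    }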
+
+#include <deque>
+#include <errno.h>
+#include <stdio.h>
+#include "base/at_exit.h"
+#include "base/file_path.h"
+#include "base/file_util.h"
+#include "base/lazy_instance.h"
+#include "base/memory/ref_counted.h"
+#include "base/message_loop.h"
+#include "base/platform_file.h"
+#include "base/process_util.h"
+#include "base/synchronization/lock.h"
+#include "base/sys_info.h"
+#include "base/task.h"
+#include "base/threading/platform_thread.h"
+#include "base/threading/thread.h"
+#include "base/utf_string_conversions.h"
+#include "leveldb/env.h"
+#include "leveldb/slice.h"
+#include "port/port.h"
+#include "util/logging.h"
+
+#if defined(OS_WIN)
+#include <io.h>
+#include "base/win/win_util.h"
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_WIN)
+// The following are glibc-specific
+extern "C" {
+size_t fread_unlocked(void *ptr, size_t size, size_t n, FILE *file) {
+  return fread(ptr, size, n, file);
+}
+
+size_t fwrite_unlocked(const void *ptr, size_t size, size_t n, FILE *file) {
+  return fwrite(ptr, size, n, file);
+}
+
+int fflush_unlocked(FILE *file) {
+  return fflush(file);
+}
+
+int fdatasync(int fildes) {
+#if defined(OS_WIN)
+  return _commit(fildes);
+#else
+  return fsync(fildes);
+#endif
+}
+}
+#endif
+
+namespace leveldb {
+
+namespace {
+
+class Thread;
+
+static const ::FilePath::CharType kLevelDBTestDirectoryPrefix[]
+    = FILE_PATH_LITERAL("leveldb-test-");
+
+::FilePath CreateFilePath(const std::string& file_path) {
+#if defined(OS_WIN)
+  return FilePath(UTF8ToUTF16(file_path));
+#else
+  return FilePath(file_path);
+#endif
+}
+
+std::string FilePathToString(const ::FilePath& file_path) {
+#if defined(OS_WIN)
+  return UTF16ToUTF8(file_path.value());
+#else
+  return file_path.value();
+#endif
+}
+
+// TODO(jorlow): This should be moved into Chromium's base.
+const char* PlatformFileErrorString(const ::base::PlatformFileError& error) {
+  switch (error) {
+    case ::base::PLATFORM_FILE_ERROR_FAILED:
+      return "Opening file failed.";
+    case ::base::PLATFORM_FILE_ERROR_IN_USE:
+      return "File currently in use.";
+    case ::base::PLATFORM_FILE_ERROR_EXISTS:
+      return "File already exists.";
+    case ::base::PLATFORM_FILE_ERROR_NOT_FOUND:
+      return "File not found.";
+    case ::base::PLATFORM_FILE_ERROR_ACCESS_DENIED:
+      return "Access denied.";
+    case ::base::PLATFORM_FILE_ERROR_TOO_MANY_OPENED:
+      return "Too many files open.";
+    case ::base::PLATFORM_FILE_ERROR_NO_MEMORY:
+      return "Out of memory.";
+    case ::base::PLATFORM_FILE_ERROR_NO_SPACE:
+      return "No space left on drive.";
+    case ::base::PLATFORM_FILE_ERROR_NOT_A_DIRECTORY:
+      return "Not a directory.";
+    case ::base::PLATFORM_FILE_ERROR_INVALID_OPERATION:
+      return "Invalid operation.";
+    case ::base::PLATFORM_FILE_ERROR_SECURITY:
+      return "Security error.";
+    case ::base::PLATFORM_FILE_ERROR_ABORT:
+      return "File operation aborted.";
+    case ::base::PLATFORM_FILE_ERROR_NOT_A_FILE:
+      return "The supplied path was not a file.";
+    case ::base::PLATFORM_FILE_ERROR_NOT_EMPTY:
+      return "The file was not empty.";
+  }
+  NOTIMPLEMENTED();
+  return "Unknown error.";
+}
+
+class ChromiumSequentialFile: public SequentialFile {
+ private:
+  std::string filename_;
+  FILE* file_;
+
+ public:
+  ChromiumSequentialFile(const std::string& fname, FILE* f)
+      : filename_(fname), file_(f) { }
+  virtual ~ChromiumSequentialFile() { fclose(file_); }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s;
+    size_t r = fread_unlocked(scratch, 1, n, file_);
+    *result = Slice(scratch, r);
+    if (r < n) {
+      if (feof(file_)) {
+        // We leave status as ok if we hit the end of the file
+      } else {
+        // A partial read with an error: return a non-ok status
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+    return s;
+  }
+};
+
+class ChromiumRandomAccessFile: public RandomAccessFile {
+ private:
+  std::string filename_;
+  ::base::PlatformFile file_;
+
+ public:
+  ChromiumRandomAccessFile(const std::string& fname, ::base::PlatformFile file)
+      : filename_(fname), file_(file) { }
+  virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    int r = ::base::ReadPlatformFile(file_, offset, scratch, n);
+    *result = Slice(scratch, (r < 0) ? 0 : r);
+    if (r < 0) {
+      // An error: return a non-ok status
+      s = Status::IOError(filename_, "Could not perform read");
+    }
+    return s;
+  }
+};
+
+class ChromiumWritableFile : public WritableFile {
+ private:
+  std::string filename_;
+  FILE* file_;
+
+ public:
+  ChromiumWritableFile(const std::string& fname, FILE* f)
+      : filename_(fname), file_(f) { }
+
+  ~ChromiumWritableFile() {
+    if (file_ != NULL) {
+      // Ignoring any potential errors
+      fclose(file_);
+    }
+  }
+
+  virtual Status Append(const Slice& data) {
+    size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_);
+    Status result;
+    if (r != data.size()) {
+      result = Status::IOError(filename_, strerror(errno));
+    }
+    return result;
+  }
+
+  virtual Status Close() {
+    Status result;
+    if (fclose(file_) != 0) {
+      result = Status::IOError(filename_, strerror(errno));
+    }
+    file_ = NULL;
+    return result;
+  }
+
+  virtual Status Flush() {
+    Status result;
+    if (fflush_unlocked(file_) != 0) {
+      result = Status::IOError(filename_, strerror(errno));
+    }
+    return result;
+  }
+
+  virtual Status Sync() {
+    Status result;
+    if ((fflush_unlocked(file_) != 0) ||
+        (fdatasync(fileno(file_)) != 0)) {
+      result = Status::IOError(filename_, strerror(errno));
+    }
+    return result;
+  }
+};
+
+class ChromiumFileLock : public FileLock {
+ public:
+  ::base::PlatformFile file_;
+};
+
+class ChromiumEnv : public Env {
+ public:
+  ChromiumEnv();
+  virtual ~ChromiumEnv() {
+    fprintf(stderr, "Destroying Env::Default()\n");
+    exit(1);
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   SequentialFile** result) {
+    FILE* f = fopen(fname.c_str(), "rb");
+    if (f == NULL) {
+      *result = NULL;
+      return Status::IOError(fname, strerror(errno));
+    } else {
+      *result = new ChromiumSequentialFile(fname, f);
+      return Status::OK();
+    }
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     RandomAccessFile** result) {
+    int flags = ::base::PLATFORM_FILE_READ | ::base::PLATFORM_FILE_OPEN;
+    bool created;
+    ::base::PlatformFileError error_code;
+    ::base::PlatformFile file = ::base::CreatePlatformFile(
+        CreateFilePath(fname), flags, &created, &error_code);
+    if (error_code != ::base::PLATFORM_FILE_OK) {
+      *result = NULL;
+      return Status::IOError(fname, PlatformFileErrorString(error_code));
+    }
+    *result = new ChromiumRandomAccessFile(fname, file);
+    return Status::OK();
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result) {
+    *result = NULL;
+    FILE* f = fopen(fname.c_str(), "wb");
+    if (f == NULL) {
+      return Status::IOError(fname, strerror(errno));
+    } else {
+      *result = new ChromiumWritableFile(fname, f);
+      return Status::OK();
+    }
+  }
+
+  virtual bool FileExists(const std::string& fname) {
+    return ::file_util::PathExists(CreateFilePath(fname));
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    result->clear();
+    ::file_util::FileEnumerator iter(
+        CreateFilePath(dir), false, ::file_util::FileEnumerator::FILES);
+    ::FilePath current = iter.Next();
+    while (!current.empty()) {
+      result->push_back(FilePathToString(current.BaseName()));
+      current = iter.Next();
+    }
+    // TODO(jorlow): Unfortunately, the FileEnumerator swallows errors, so
+    //               we'll always return OK. Maybe manually check for error
+    //               conditions like the file not existing?
+    return Status::OK();
+  }
+
+  virtual Status DeleteFile(const std::string& fname) {
+    Status result;
+    // TODO(jorlow): Should we assert this is a file?
+    if (!::file_util::Delete(CreateFilePath(fname), false)) {
+      result = Status::IOError(fname, "Could not delete file.");
+    }
+    return result;
+  };
+
+  virtual Status CreateDir(const std::string& name) {
+    Status result;
+    if (!::file_util::CreateDirectory(CreateFilePath(name))) {
+      result = Status::IOError(name, "Could not create directory.");
+    }
+    return result;
+  };
+
+  virtual Status DeleteDir(const std::string& name) {
+    Status result;
+    // TODO(jorlow): Should we assert this is a directory?
+    if (!::file_util::Delete(CreateFilePath(name), false)) {
+      result = Status::IOError(name, "Could not delete directory.");
+    }
+    return result;
+  };
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+    Status s;
+    int64_t signed_size;
+    if (!::file_util::GetFileSize(CreateFilePath(fname), &signed_size)) {
+      *size = 0;
+      s = Status::IOError(fname, "Could not determine file size.");
+    } else {
+      *size = static_cast<uint64_t>(signed_size);
+    }
+    return s;
+  }
+
+  virtual Status RenameFile(const std::string& src, const std::string& dst) {
+    Status result;
+    if (!::file_util::ReplaceFile(CreateFilePath(src), CreateFilePath(dst))) {
+      result = Status::IOError(src, "Could not rename file.");
+    }
+    return result;
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = NULL;
+    Status result;
+    int flags = ::base::PLATFORM_FILE_OPEN_ALWAYS |
+                ::base::PLATFORM_FILE_READ |
+                ::base::PLATFORM_FILE_WRITE |
+                ::base::PLATFORM_FILE_EXCLUSIVE_READ |
+                ::base::PLATFORM_FILE_EXCLUSIVE_WRITE;
+    bool created;
+    ::base::PlatformFileError error_code;
+    ::base::PlatformFile file = ::base::CreatePlatformFile(
+        CreateFilePath(fname), flags, &created, &error_code);
+    if (error_code != ::base::PLATFORM_FILE_OK) {
+      result = Status::IOError(fname, PlatformFileErrorString(error_code));
+    } else {
+      ChromiumFileLock* my_lock = new ChromiumFileLock;
+      my_lock->file_ = file;
+      *lock = my_lock;
+    }
+    return result;
+  }
+
+  virtual Status UnlockFile(FileLock* lock) {
+    ChromiumFileLock* my_lock = reinterpret_cast<ChromiumFileLock*>(lock);
+    Status result;
+    if (!::base::ClosePlatformFile(my_lock->file_)) {
+      result = Status::IOError("Could not close lock file.");
+    }
+    delete my_lock;
+    return result;
+  }
+
+  virtual void Schedule(void (*function)(void*), void* arg);
+
+  virtual void StartThread(void (*function)(void* arg), void* arg);
+
+  virtual std::string UserIdentifier() {
+#if defined(OS_WIN)
+    std::wstring user_sid;
+    bool ret = ::base::win::GetUserSidString(&user_sid);
+    DCHECK(ret);
+    return UTF16ToUTF8(user_sid);
+#else
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%d", int(geteuid()));
+    return buf;
+#endif
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {
+    mu_.Acquire();
+    if (test_directory_.empty()) {
+      if (!::file_util::CreateNewTempDirectory(kLevelDBTestDirectoryPrefix,
+                                               &test_directory_)) {
+        mu_.Release();
+        return Status::IOError("Could not create temp directory.");
+      }
+    }
+    *path = FilePathToString(test_directory_);
+    mu_.Release();
+    return Status::OK();
+  }
+
+  virtual void Logv(WritableFile* info_log, const char* format, va_list ap) {
+    // TODO(jorlow): We may want to just use Chromium's built in logging.
+
+    uint64_t thread_id = 0;
+    // Copied from base/logging.cc.
+#if defined(OS_WIN)
+    thread_id = GetCurrentThreadId();
+#elif defined(OS_MACOSX)
+    thread_id = mach_thread_self();
+#elif defined(OS_LINUX)
+    thread_id = syscall(__NR_gettid);
+#elif defined(OS_FREEBSD) || defined(OS_NACL)
+    // TODO(BSD): find a better thread ID
+    pthread_t tid = pthread_self();
+    memcpy(&thread_id, &tid, min(sizeof(thread_id), sizeof(tid)));
+#endif
+
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      ::base::Time::Exploded t;
+      ::base::Time::Now().LocalExplode(&t);
+      p += snprintf(p, limit - p,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                    t.year,
+                    t.month,
+                    t.day_of_month,
+                    t.hour,
+                    t.minute,
+                    t.second,
+                    static_cast<int>(t.millisecond) * 1000,
+                    static_cast<long long unsigned int>(thread_id));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;       // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      info_log->Append(Slice(base, p - base));
+      info_log->Flush();
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+
+  virtual int AppendLocalTimeToBuffer(char* buffer, size_t size) {
+    ::base::Time::Exploded t;
+    ::base::Time::Now().LocalExplode(&t);
+    return snprintf(buffer, size,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d",
+                    t.year,
+                    t.month,
+                    t.day_of_month,
+                    t.hour,
+                    t.minute,
+                    t.second,
+                    static_cast<int>(t.millisecond) * 1000);
+  }
+
+  virtual uint64_t NowMicros() {
+    return ::base::TimeTicks::HighResNow().ToInternalValue();
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    // Round up to the next millisecond.
+    ::base::PlatformThread::Sleep((micros + 999) / 1000);
+  }
+
+ private:
+  // BGThread() is the body of the background thread
+  void BGThread();
+  static void BGThreadWrapper(void* arg) {
+    reinterpret_cast<ChromiumEnv*>(arg)->BGThread();
+  }
+
+  FilePath test_directory_;
+
+  size_t page_size_;
+  ::base::Lock mu_;
+  ::base::ConditionVariable bgsignal_;
+  bool started_bgthread_;
+
+  // Entry per Schedule() call
+  struct BGItem { void* arg; void (*function)(void*); };
+  typedef std::deque<BGItem> BGQueue;
+  BGQueue queue_;
+};
+
+ChromiumEnv::ChromiumEnv()
+    : page_size_(::base::SysInfo::VMAllocationGranularity()),
+      bgsignal_(&mu_),
+      started_bgthread_(false) {
+#if defined(OS_MACOSX)
+  ::base::EnableTerminationOnHeapCorruption();
+  ::base::EnableTerminationOnOutOfMemory();
+#endif  // OS_MACOSX
+}
+
+class Thread : public ::base::PlatformThread::Delegate {
+ public:
+  Thread(void (*function)(void* arg), void* arg)
+      : function_(function), arg_(arg) {
+    ::base::PlatformThreadHandle handle;
+    bool success = ::base::PlatformThread::Create(0, this, &handle);
+    DCHECK(success);
+  }
+  virtual ~Thread() {}
+  virtual void ThreadMain() {
+    (*function_)(arg_);
+    delete this;
+  }
+
+ private:
+  void (*function_)(void* arg);
+  void* arg_;
+};
+
+void ChromiumEnv::Schedule(void (*function)(void*), void* arg) {
+  mu_.Acquire();
+
+  // Start background thread if necessary
+  if (!started_bgthread_) {
+    started_bgthread_ = true;
+    StartThread(&ChromiumEnv::BGThreadWrapper, this);
+  }
+
+  // If the queue is currently empty, the background thread may currently be
+  // waiting.
+  if (queue_.empty()) {
+    bgsignal_.Signal();
+  }
+
+  // Add to priority queue
+  queue_.push_back(BGItem());
+  queue_.back().function = function;
+  queue_.back().arg = arg;
+
+  mu_.Release();
+}
+
+void ChromiumEnv::BGThread() {
+  while (true) {
+    // Wait until there is an item that is ready to run
+    mu_.Acquire();
+    while (queue_.empty()) {
+      bgsignal_.Wait();
+    }
+
+    void (*function)(void*) = queue_.front().function;
+    void* arg = queue_.front().arg;
+    queue_.pop_front();
+
+    mu_.Release();
+    (*function)(arg);
+  }
+}
+
+void ChromiumEnv::StartThread(void (*function)(void* arg), void* arg) {
+  new Thread(function, arg);  // Will self-delete.
+}
+
+::base::LazyInstance<ChromiumEnv, ::base::LeakyLazyInstanceTraits<ChromiumEnv> >
+    default_env(::base::LINKER_INITIALIZED);
+
+}
+
+Env* Env::Default() {
+  return default_env.Pointer();
+}
+
+}
diff --git a/leveldb/util/env_posix.cc b/leveldb/util/env_posix.cc
new file mode 100644
index 0000000..5cddb0c
--- /dev/null
+++ b/leveldb/util/env_posix.cc
@@ -0,0 +1,599 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
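// Both Env implementations share the same Schedule() machinery: a
// mutex-guarded deque of function/arg pairs drained by one lazily started
// background thread. A condensed sketch of that pattern (names are
// illustrative, not from the patch):

    #include <deque>
    #include <pthread.h>

    struct Work { void (*fn)(void*); void* arg; };

    static pthread_mutex_t work_mu = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t work_cv = PTHREAD_COND_INITIALIZER;
    static std::deque<Work> work_queue;

    void* Worker(void*) {
      while (true) {
        pthread_mutex_lock(&work_mu);
        while (work_queue.empty()) {
          pthread_cond_wait(&work_cv, &work_mu);   // sleep while idle
        }
        Work w = work_queue.front();
        work_queue.pop_front();
        pthread_mutex_unlock(&work_mu);
        (*w.fn)(w.arg);                            // run outside the lock
      }
      return NULL;
    }

    void Schedule(void (*fn)(void*), void* arg) {
      pthread_mutex_lock(&work_mu);
      if (work_queue.empty()) {
        pthread_cond_signal(&work_cv);             // worker may be waiting
      }
      Work w = { fn, arg };
      work_queue.push_back(w);
      pthread_mutex_unlock(&work_mu);
    }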
+
+#include <deque>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#if defined(LEVELDB_PLATFORM_ANDROID)
+#include <sys/stat.h>
+#endif
+#include "leveldb/env.h"
+#include "leveldb/slice.h"
+#include "port/port.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+namespace {
+
+class PosixSequentialFile: public SequentialFile {
+ private:
+  std::string filename_;
+  FILE* file_;
+
+ public:
+  PosixSequentialFile(const std::string& fname, FILE* f)
+      : filename_(fname), file_(f) { }
+  virtual ~PosixSequentialFile() { fclose(file_); }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s;
+    size_t r = fread_unlocked(scratch, 1, n, file_);
+    *result = Slice(scratch, r);
+    if (r < n) {
+      if (feof(file_)) {
+        // We leave status as ok if we hit the end of the file
+      } else {
+        // A partial read with an error: return a non-ok status
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+    return s;
+  }
+};
+
+class PosixRandomAccessFile: public RandomAccessFile {
+ private:
+  std::string filename_;
+  int fd_;
+
+ public:
+  PosixRandomAccessFile(const std::string& fname, int fd)
+      : filename_(fname), fd_(fd) { }
+  virtual ~PosixRandomAccessFile() { close(fd_); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
+    *result = Slice(scratch, (r < 0) ? 0 : r);
+    if (r < 0) {
+      // An error: return a non-ok status
+      s = Status::IOError(filename_, strerror(errno));
+    }
+    return s;
+  }
+};
+
+// We preallocate up to an extra megabyte and use memcpy to append new
+// data to the file.  This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
+class PosixMmapFile : public WritableFile {
+ private:
+  std::string filename_;
+  int fd_;
+  size_t page_size_;
+  size_t map_size_;       // How much extra memory to map at a time
+  char* base_;            // The mapped region
+  char* limit_;           // Limit of the mapped region
+  char* dst_;             // Where to write next (in range [base_,limit_])
+  char* last_sync_;       // Where have we synced up to
+  uint64_t file_offset_;  // Offset of base_ in file
+
+  // Have we done an munmap of unsynced data?
+  bool pending_sync_;
+
+  // Roundup x to a multiple of y
+  static size_t Roundup(size_t x, size_t y) {
+    return ((x + y - 1) / y) * y;
+  }
+
+  size_t TruncateToPageBoundary(size_t s) {
+    s -= (s & (page_size_ - 1));
+    assert((s % page_size_) == 0);
+    return s;
+  }
+
+  void UnmapCurrentRegion() {
+    if (base_ != NULL) {
+      if (last_sync_ < limit_) {
+        // Defer syncing this data until next Sync() call, if any
+        pending_sync_ = true;
+      }
+      munmap(base_, limit_ - base_);
+      file_offset_ += limit_ - base_;
+      base_ = NULL;
+      limit_ = NULL;
+      last_sync_ = NULL;
+      dst_ = NULL;
+
+      // Increase the amount we map the next time, but capped at 1MB
+      if (map_size_ < (1<<20)) {
+        map_size_ *= 2;
+      }
+    }
+  }
+
+  bool MapNewRegion() {
+    assert(base_ == NULL);
+    if (ftruncate(fd_, file_offset_ + map_size_) < 0) {
+      return false;
+    }
+    void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
+                     fd_, file_offset_);
+    if (ptr == MAP_FAILED) {
+      return false;
+    }
+    base_ = reinterpret_cast<char*>(ptr);
+    limit_ = base_ + map_size_;
+    dst_ = base_;
+    last_sync_ = base_;
+    return true;
+  }
+
+ public:
+  PosixMmapFile(const std::string& fname, int fd, size_t page_size)
+      : filename_(fname),
+        fd_(fd),
+        page_size_(page_size),
+        map_size_(Roundup(65536, page_size)),
+        base_(NULL),
+        limit_(NULL),
+        dst_(NULL),
+        last_sync_(NULL),
+        file_offset_(0),
+        pending_sync_(false) {
+    assert((page_size & (page_size - 1)) == 0);
+  }
+
+
+  ~PosixMmapFile() {
+    if (fd_ >= 0) {
+      PosixMmapFile::Close();
+    }
+  }
+
+  virtual Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t left = data.size();
+    while (left > 0) {
+      assert(base_ <= dst_);
+      assert(dst_ <= limit_);
+      size_t avail = limit_ - dst_;
+      if (avail == 0) {
+        UnmapCurrentRegion();
+        MapNewRegion();
+      }
+
+      size_t n = (left <= avail) ? left : avail;
+      memcpy(dst_, src, n);
+      dst_ += n;
+      src += n;
+      left -= n;
+    }
+    return Status::OK();
+  }
+
+  virtual Status Close() {
+    Status s;
+    size_t unused = limit_ - dst_;
+    UnmapCurrentRegion();
+    if (unused > 0) {
+      // Trim the extra space at the end of the file
+      if (ftruncate(fd_, file_offset_ - unused) < 0) {
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+
+    if (close(fd_) < 0) {
+      if (s.ok()) {
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+
+    fd_ = -1;
+    base_ = NULL;
+    limit_ = NULL;
+    return s;
+  }
+
+  virtual Status Flush() {
+    return Status::OK();
+  }
+
+  virtual Status Sync() {
+    Status s;
+
+    if (pending_sync_) {
+      // Some unmapped data was not synced
+      pending_sync_ = false;
+      if (fdatasync(fd_) < 0) {
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+
+    if (dst_ > last_sync_) {
+      // Find the beginnings of the pages that contain the first and last
+      // bytes to be synced.
+      size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
+      size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
+      last_sync_ = dst_;
+      if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+
+    return s;
+  }
+};
+
+static int LockOrUnlock(int fd, bool lock) {
+  errno = 0;
+  struct flock f;
+  memset(&f, 0, sizeof(f));
+  f.l_type = (lock ? F_WRLCK : F_UNLCK);
+  f.l_whence = SEEK_SET;
+  f.l_start = 0;
+  f.l_len = 0;        // Lock/unlock entire file
+  return fcntl(fd, F_SETLK, &f);
+}
+
+class PosixFileLock : public FileLock {
+ public:
+  int fd_;
+};
+
+class PosixEnv : public Env {
+ public:
+  PosixEnv();
+  virtual ~PosixEnv() {
+    fprintf(stderr, "Destroying Env::Default()\n");
+    exit(1);
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   SequentialFile** result) {
+    FILE* f = fopen(fname.c_str(), "r");
+    if (f == NULL) {
+      *result = NULL;
+      return Status::IOError(fname, strerror(errno));
+    } else {
+      *result = new PosixSequentialFile(fname, f);
+      return Status::OK();
+    }
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     RandomAccessFile** result) {
+    int fd = open(fname.c_str(), O_RDONLY);
+    if (fd < 0) {
+      *result = NULL;
+      return Status::IOError(fname, strerror(errno));
+    }
+    *result = new PosixRandomAccessFile(fname, fd);
+    return Status::OK();
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result) {
+    Status s;
+    const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+    if (fd < 0) {
+      *result = NULL;
+      s = Status::IOError(fname, strerror(errno));
+    } else {
+      *result = new PosixMmapFile(fname, fd, page_size_);
+    }
+    return s;
+  }
+
+  virtual bool FileExists(const std::string& fname) {
+    return access(fname.c_str(), F_OK) == 0;
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    result->clear();
+    DIR* d = opendir(dir.c_str());
+    if (d == NULL) {
+      return Status::IOError(dir, strerror(errno));
+    }
+    struct dirent* entry;
+    while ((entry = readdir(d)) != NULL) {
+      result->push_back(entry->d_name);
+    }
+    closedir(d);
+    return Status::OK();
+  }
+
+  virtual Status DeleteFile(const std::string& fname) {
+    Status result;
+    if (unlink(fname.c_str()) != 0) {
+      result = Status::IOError(fname, strerror(errno));
+    }
+    return result;
+  };
+
+  virtual Status CreateDir(const std::string& name) {
+    Status result;
+    if (mkdir(name.c_str(), 0755) != 0) {
+      result = Status::IOError(name, strerror(errno));
+    }
+    return result;
+  };
+
+  virtual Status DeleteDir(const std::string& name) {
+    Status result;
+    if (rmdir(name.c_str()) != 0) {
+      result = Status::IOError(name, strerror(errno));
+    }
+    return result;
+  };
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+    Status s;
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      *size = 0;
+      s = Status::IOError(fname, strerror(errno));
+    } else {
+      *size = sbuf.st_size;
+    }
+    return s;
+  }
+
+  virtual Status RenameFile(const std::string& src, const std::string& target) {
+    Status result;
+    if (rename(src.c_str(), target.c_str()) != 0) {
+      result = Status::IOError(src, strerror(errno));
+    }
+    return result;
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = NULL;
+    Status result;
+    int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+    if (fd < 0) {
+      result = Status::IOError(fname, strerror(errno));
+    } else if (LockOrUnlock(fd, true) == -1) {
+      result = Status::IOError("lock " + fname, strerror(errno));
+      close(fd);
+    } else {
+      PosixFileLock* my_lock = new PosixFileLock;
+      my_lock->fd_ = fd;
+      *lock = my_lock;
+    }
+    return result;
+  }
+
+  virtual Status UnlockFile(FileLock* lock) {
+    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
+    Status result;
+    if (LockOrUnlock(my_lock->fd_, false) == -1) {
+      result = Status::IOError(strerror(errno));
+    }
+    close(my_lock->fd_);
+    delete my_lock;
+    return result;
+  }
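  // Usage sketch (commentary, not part of the patch): LockFile/UnlockFile
  // above provide whole-file advisory locks via fcntl(F_SETLK), which lets
  // a database guard its directory against concurrent opens, e.g.:
  //
  //   FileLock* lock = NULL;
  //   Status s = env->LockFile(dbname + "/LOCK", &lock);
  //   if (s.ok()) {
  //     ... exclusive access ...
  //     env->UnlockFile(lock);
  //   }
  //
  // Because F_SETLK fails immediately (rather than blocking) when another
  // process holds the lock, a second opener errors out instead of hanging.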
+
+  virtual void Schedule(void (*function)(void*), void* arg);
+
+  virtual void StartThread(void (*function)(void* arg), void* arg);
+
+  virtual Status GetTestDirectory(std::string* result) {
+    const char* env = getenv("TEST_TMPDIR");
+    if (env && env[0] != '\0') {
+      *result = env;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid()));
+      *result = buf;
+    }
+    // Directory may already exist
+    CreateDir(*result);
+    return Status::OK();
+  }
+
+  virtual void Logv(WritableFile* info_log, const char* format, va_list ap) {
+    pthread_t tid = pthread_self();
+    uint64_t thread_id = 0;
+    memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
+
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      struct timeval now_tv;
+      gettimeofday(&now_tv, NULL);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      localtime_r(&seconds, &t);
+      p += snprintf(p, limit - p,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                    t.tm_year + 1900,
+                    t.tm_mon + 1,
+                    t.tm_mday,
+                    t.tm_hour,
+                    t.tm_min,
+                    t.tm_sec,
+                    static_cast<int>(now_tv.tv_usec),
+                    static_cast<long long unsigned int>(thread_id));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;       // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      info_log->Append(Slice(base, p - base));
+      info_log->Flush();
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+
+  virtual uint64_t NowMicros() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    usleep(micros);
+  }
+
+ private:
+  void PthreadCall(const char* label, int result) {
+    if (result != 0) {
+      fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+      exit(1);
+    }
+  }
+
+  // BGThread() is the body of the background thread
+  void BGThread();
+  static void* BGThreadWrapper(void* arg) {
+    reinterpret_cast<PosixEnv*>(arg)->BGThread();
+    return NULL;
+  }
+
+  size_t page_size_;
+  pthread_mutex_t mu_;
+  pthread_cond_t bgsignal_;
+  pthread_t bgthread_;
+  bool started_bgthread_;
+
+  // Entry per Schedule() call
+  struct BGItem { void* arg; void (*function)(void*); };
+  typedef std::deque<BGItem> BGQueue;
+  BGQueue queue_;
+};
+
+PosixEnv::PosixEnv() : page_size_(getpagesize()),
+                       started_bgthread_(false) {
+  PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL));
+  PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL));
+}
+
+void PosixEnv::Schedule(void (*function)(void*), void* arg) {
+  PthreadCall("lock", pthread_mutex_lock(&mu_));
+
+  // Start background thread if necessary
+  if (!started_bgthread_) {
+    started_bgthread_ = true;
+    PthreadCall(
+        "create thread",
+        pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this));
+  }
+
+  // If the queue is currently empty, the background thread may currently be
+  // waiting.
+  if (queue_.empty()) {
+    PthreadCall("signal", pthread_cond_signal(&bgsignal_));
+  }
+
+  // Add to priority queue
+  queue_.push_back(BGItem());
+  queue_.back().function = function;
+  queue_.back().arg = arg;
+
+  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+void PosixEnv::BGThread() {
+  while (true) {
+    // Wait until there is an item that is ready to run
+    PthreadCall("lock", pthread_mutex_lock(&mu_));
+    while (queue_.empty()) {
+      PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
+    }
+
+    void (*function)(void*) = queue_.front().function;
+    void* arg = queue_.front().arg;
+    queue_.pop_front();
+
+    PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    (*function)(arg);
+  }
+}
+
+namespace {
+struct StartThreadState {
+  void (*user_function)(void*);
+  void* arg;
+};
+}
+static void* StartThreadWrapper(void* arg) {
+  StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
+  state->user_function(state->arg);
+  delete state;
+  return NULL;
+}
+
+void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
+  pthread_t t;
+  StartThreadState* state = new StartThreadState;
+  state->user_function = function;
+  state->arg = arg;
+  PthreadCall("start thread",
+              pthread_create(&t, NULL, &StartThreadWrapper, state));
+}
+
+}
+
+static pthread_once_t once = PTHREAD_ONCE_INIT;
+static Env* default_env;
+static void InitDefaultEnv() { default_env = new PosixEnv; }
+
+Env* Env::Default() {
+  pthread_once(&once, InitDefaultEnv);
+  return default_env;
+}
+
+}
diff --git a/leveldb/util/env_test.cc b/leveldb/util/env_test.cc
new file mode 100644
index 0000000..3c253be
--- /dev/null
+++ b/leveldb/util/env_test.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
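// The pthread_once idiom just above is the portable pre-C++11 way to build
// a lazily constructed, never-destroyed process-wide singleton; note that
// both Env destructors deliberately abort, since Env::Default() must
// outlive every caller. A minimal sketch of the same idiom (names are
// illustrative):

    #include <pthread.h>

    class Registry { };

    static pthread_once_t registry_once = PTHREAD_ONCE_INIT;
    static Registry* registry;
    static void InitRegistry() { registry = new Registry; }  // runs once

    Registry* GetRegistry() {
      pthread_once(&registry_once, InitRegistry);
      return registry;  // intentionally leaked; safe during shutdown
    }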
+ +#include "leveldb/env.h" + +#include "port/port.h" +#include "util/testharness.h" + +namespace leveldb { + +static const int kDelayMicros = 100000; + +class EnvPosixTest { + private: + port::Mutex mu_; + std::string events_; + + public: + Env* env_; + EnvPosixTest() : env_(Env::Default()) { } +}; + +static void SetBool(void* ptr) { + *(reinterpret_cast(ptr)) = true; +} + +TEST(EnvPosixTest, RunImmediately) { + bool called = false; + env_->Schedule(&SetBool, &called); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_TRUE(called); +} + +TEST(EnvPosixTest, RunMany) { + int last_id = 0; + + struct CB { + int* last_id_ptr; // Pointer to shared slot + int id; // Order# for the execution of this callback + + CB(int* p, int i) : last_id_ptr(p), id(i) { } + + static void Run(void* v) { + CB* cb = reinterpret_cast(v); + ASSERT_EQ(cb->id-1, *cb->last_id_ptr); + *cb->last_id_ptr = cb->id; + } + }; + + // Schedule in different order than start time + CB cb1(&last_id, 1); + CB cb2(&last_id, 2); + CB cb3(&last_id, 3); + CB cb4(&last_id, 4); + env_->Schedule(&CB::Run, &cb1); + env_->Schedule(&CB::Run, &cb2); + env_->Schedule(&CB::Run, &cb3); + env_->Schedule(&CB::Run, &cb4); + + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ(4, last_id); +} + +struct State { + port::Mutex mu; + int val; + int num_running; +}; + +static void ThreadBody(void* arg) { + State* s = reinterpret_cast(arg); + s->mu.Lock(); + s->val += 1; + s->num_running -= 1; + s->mu.Unlock(); +} + +TEST(EnvPosixTest, StartThread) { + State state; + state.val = 0; + state.num_running = 3; + for (int i = 0; i < 3; i++) { + env_->StartThread(&ThreadBody, &state); + } + while (true) { + state.mu.Lock(); + int num = state.num_running; + state.mu.Unlock(); + if (num == 0) { + break; + } + Env::Default()->SleepForMicroseconds(kDelayMicros); + } + ASSERT_EQ(state.val, 3); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/leveldb/util/hash.cc b/leveldb/util/hash.cc new file mode 100644 index 0000000..d19afd1 --- /dev/null +++ b/leveldb/util/hash.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "util/coding.h" +#include "util/hash.h" + +namespace leveldb { + +uint32_t Hash(const char* data, size_t n, uint32_t seed) { + // Similar to murmur hash + const uint32_t m = 0xc6a4a793; + const uint32_t r = 24; + const char* limit = data + n; + uint32_t h = seed ^ (n * m); + + // Pick up four bytes at a time + while (data + 4 <= limit) { + uint32_t w = DecodeFixed32(data); + data += 4; + h += w; + h *= m; + h ^= (h >> 16); + } + + // Pick up remaining bytes + switch (limit - data) { + case 3: + h += data[2] << 16; + // fall through + case 2: + h += data[1] << 8; + // fall through + case 1: + h += data[0]; + h *= m; + h ^= (h >> r); + break; + } + return h; +} + + +} diff --git a/leveldb/util/hash.h b/leveldb/util/hash.h new file mode 100644 index 0000000..8889d56 --- /dev/null +++ b/leveldb/util/hash.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+//
+// Simple hash function used for internal data structures
+
+#ifndef STORAGE_LEVELDB_UTIL_HASH_H_
+#define STORAGE_LEVELDB_UTIL_HASH_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace leveldb {
+
+extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
+
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_HASH_H_
diff --git a/leveldb/util/histogram.cc b/leveldb/util/histogram.cc
new file mode 100644
index 0000000..c5178ef
--- /dev/null
+++ b/leveldb/util/histogram.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <math.h>
+#include <stdio.h>
+#include "port/port.h"
+#include "util/histogram.h"
+
+namespace leveldb {
+
+const double Histogram::kBucketLimit[kNumBuckets] = {
+  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45,
+  50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450,
+  500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000,
+  3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000,
+  16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000,
+  70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000,
+  250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000,
+  900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000,
+  3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000,
+  9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000,
+  25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000,
+  70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000,
+  180000000, 200000000, 250000000, 300000000, 350000000, 400000000,
+  450000000, 500000000, 600000000, 700000000, 800000000, 900000000,
+  1000000000, 1200000000, 1400000000, 1600000000, 1800000000, 2000000000,
+  2500000000.0, 3000000000.0, 3500000000.0, 4000000000.0, 4500000000.0,
+  5000000000.0, 6000000000.0, 7000000000.0, 8000000000.0, 9000000000.0,
+  1e200,
+};
+
+void Histogram::Clear() {
+  min_ = kBucketLimit[kNumBuckets-1];
+  max_ = 0;
+  num_ = 0;
+  sum_ = 0;
+  sum_squares_ = 0;
+  for (int i = 0; i < kNumBuckets; i++) {
+    buckets_[i] = 0;
+  }
+}
+
+void Histogram::Add(double value) {
+  // Linear search is fast enough for our usage in db_bench
+  int b = 0;
+  while (b < kNumBuckets - 1 && kBucketLimit[b] <= value) {
+    b++;
+  }
+  buckets_[b] += 1.0;
+  if (min_ > value) min_ = value;
+  if (max_ < value) max_ = value;
+  num_++;
+  sum_ += value;
+  sum_squares_ += (value * value);
+}
+
+double Histogram::Median() const {
+  return Percentile(50.0);
+}
+
+double Histogram::Percentile(double p) const {
+  double threshold = num_ * (p / 100.0);
+  double sum = 0;
+  for (int b = 0; b < kNumBuckets; b++) {
+    sum += buckets_[b];
+    if (sum >= threshold) {
+      // Scale linearly within this bucket
+      double left_point = (b == 0) ? 0 : kBucketLimit[b-1];
+      double right_point = kBucketLimit[b];
+      double left_sum = sum - buckets_[b];
+      double right_sum = sum;
+      double pos = (threshold - left_sum) / (right_sum - left_sum);
+      double r = left_point + (right_point - left_point) * pos;
+      if (r < min_) r = min_;
+      if (r > max_) r = max_;
+      return r;
+    }
+  }
+  return max_;
+}
+
+double Histogram::Average() const {
+  if (num_ == 0.0) return 0;
+  return sum_ / num_;
+}
+
+double Histogram::StandardDeviation() const {
+  if (num_ == 0.0) return 0;
+  double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_);
+  return sqrt(variance);
+}
+
+std::string Histogram::ToString() const {
+  std::string r;
+  char buf[200];
+  snprintf(buf, sizeof(buf),
+           "Count: %.0f  Average: %.4f  StdDev: %.2f\n",
+           num_, Average(), StandardDeviation());
+  r.append(buf);
+  snprintf(buf, sizeof(buf),
+           "Min: %.4f  Median: %.4f  Max: %.4f\n",
+           (num_ == 0.0 ? 0.0 : min_), Median(), max_);
+  r.append(buf);
+  r.append("------------------------------------------------------\n");
+  const double mult = 100.0 / num_;
+  double sum = 0;
+  for (int b = 0; b < kNumBuckets; b++) {
+    if (buckets_[b] <= 0.0) continue;
+    sum += buckets_[b];
+    snprintf(buf, sizeof(buf),
+             "[ %7.0f, %7.0f ) %7.0f %7.3f%% %7.3f%% ",
+             ((b == 0) ? 0.0 : kBucketLimit[b-1]),  // left
+             kBucketLimit[b],                       // right
+             buckets_[b],                           // count
+             mult * buckets_[b],                    // percentage
+             mult * sum);                           // cumulative percentage
+    r.append(buf);
+
+    // Add hash marks based on percentage; 20 marks for 100%.
+    int marks = static_cast<int>(20*(buckets_[b] / num_) + 0.5);
+    r.append(marks, '#');
+    r.push_back('\n');
+  }
+  return r;
+}
+
+}
diff --git a/leveldb/util/histogram.h b/leveldb/util/histogram.h
new file mode 100644
index 0000000..f72f122
--- /dev/null
+++ b/leveldb/util/histogram.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
+#define STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
+
+#include <string>
+
+namespace leveldb {
+
+class Histogram {
+ public:
+  Histogram() { }
+  ~Histogram() { }
+
+  void Clear();
+  void Add(double value);
+
+  std::string ToString() const;
+
+ private:
+  double min_;
+  double max_;
+  double num_;
+  double sum_;
+  double sum_squares_;
+
+  enum { kNumBuckets = 154 };
+  static const double kBucketLimit[kNumBuckets];
+  double buckets_[kNumBuckets];
+
+  double Median() const;
+  double Percentile(double p) const;
+  double Average() const;
+  double StandardDeviation() const;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
diff --git a/leveldb/util/logging.cc b/leveldb/util/logging.cc
new file mode 100644
index 0000000..760d335
--- /dev/null
+++ b/leveldb/util/logging.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
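[Editor's note] Histogram::Percentile() in the hunk above interpolates linearly inside the first bucket whose cumulative count reaches the threshold, and StandardDeviation() uses the identity variance = E[x^2] - E[x]^2 = (sum_squares_*num_ - sum_*sum_) / num_^2. A worked instance with assumed numbers (not taken from the patch):

    // num_ = 100 samples, p = 45  =>  threshold = 100 * (45/100.0) = 45.
    // Assume the cumulative count first reaches 50 in the bucket [100, 120),
    // so at that point left_sum = 40 and right_sum = 50.
    double Percentile45Example() {
      double pos = (45.0 - 40.0) / (50.0 - 40.0);   // 0.5 of the way into the bucket
      return 100.0 + (120.0 - 100.0) * pos;         // 110.0: the estimated p45
    }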
+ +#include "util/logging.h" + +#include +#include +#include +#include +#include "leveldb/env.h" +#include "leveldb/slice.h" + +namespace leveldb { + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (size_t i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToString(uint64_t num) { + std::string r; + AppendNumberTo(&r, num); + return r; +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeChar(Slice* in, char c) { + if (!in->empty() && (*in)[0] == c) { + in->remove_prefix(1); + return true; + } else { + return false; + } +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast(0); + if (v > kMaxUint64/10 || + (v == kMaxUint64/10 && delta > kMaxUint64%10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +} diff --git a/leveldb/util/logging.h b/leveldb/util/logging.h new file mode 100644 index 0000000..1cd0a4b --- /dev/null +++ b/leveldb/util/logging.h @@ -0,0 +1,47 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Must not be included from any .h files to avoid polluting the namespace +// with macros. + +#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_ +#define STORAGE_LEVELDB_UTIL_LOGGING_H_ + +#include +#include +#include +#include "port/port.h" + +namespace leveldb { + +class Slice; +class WritableFile; + +// Append a human-readable printout of "num" to *str +extern void AppendNumberTo(std::string* str, uint64_t num); + +// Append a human-readable printout of "value" to *str. +// Escapes any non-printable characters found in "value". +extern void AppendEscapedStringTo(std::string* str, const Slice& value); + +// Return a human-readable printout of "num" +extern std::string NumberToString(uint64_t num); + +// Return a human-readable version of "value". +// Escapes any non-printable characters found in "value". +extern std::string EscapeString(const Slice& value); + +// If *in starts with "c", advances *in past the first character and +// returns true. Otherwise, returns false. +extern bool ConsumeChar(Slice* in, char c); + +// Parse a human-readable number from "*in" into *value. On success, +// advances "*in" past the consumed number and sets "*val" to the +// numeric value. Otherwise, returns false and leaves *in in an +// unspecified state. +extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); + +} + +#endif // STORAGE_LEVELDB_UTIL_LOGGING_H_ diff --git a/leveldb/util/mutexlock.h b/leveldb/util/mutexlock.h new file mode 100644 index 0000000..05fe279 --- /dev/null +++ b/leveldb/util/mutexlock.h @@ -0,0 +1,39 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ +#define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ + +#include "port/port.h" + +namespace leveldb { + +// Helper class that locks a mutex on construction and unlocks the mutex when +// the destructor of the MutexLock object is invoked. +// +// Typical usage: +// +// void MyClass::MyMethod() { +// MutexLock l(&mu_); // mu_ is an instance variable +// ... some complex code, possibly with multiple return paths ... +// } + +class MutexLock { + public: + explicit MutexLock(port::Mutex *mu) : mu_(mu) { + this->mu_->Lock(); + } + ~MutexLock() { this->mu_->Unlock(); } + + private: + port::Mutex *const mu_; + // No copying allowed + MutexLock(const MutexLock&); + void operator=(const MutexLock&); +}; + +} + + +#endif // STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ diff --git a/leveldb/util/options.cc b/leveldb/util/options.cc new file mode 100644 index 0000000..0ea5c98 --- /dev/null +++ b/leveldb/util/options.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/options.h" + +#include "leveldb/comparator.h" +#include "leveldb/env.h" + +namespace leveldb { + +Options::Options() + : comparator(BytewiseComparator()), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(false), + env(Env::Default()), + info_log(NULL), + write_buffer_size(4<<20), + max_open_files(1000), + block_cache(NULL), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression) { +} + + +} diff --git a/leveldb/util/random.h b/leveldb/util/random.h new file mode 100644 index 0000000..d886b4e --- /dev/null +++ b/leveldb/util/random.h @@ -0,0 +1,59 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_ +#define STORAGE_LEVELDB_UTIL_RANDOM_H_ + +#include + +namespace leveldb { + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. +class Random { + private: + uint32_t seed_; + public: + explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } + uint32_t Next() { + static const uint32_t M = 2147483647L; // 2^31-1 + static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = static_cast((product >> 31) + (product & M)); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > M) { + seed_ -= M; + } + return seed_; + } + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(int n) { return Next() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. 
+  // REQUIRES: n > 0
+  bool OneIn(int n) { return (Next() % n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) {
+    return Uniform(1 << Uniform(max_log + 1));
+  }
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_RANDOM_H_
diff --git a/leveldb/util/status.cc b/leveldb/util/status.cc
new file mode 100644
index 0000000..d9b7195
--- /dev/null
+++ b/leveldb/util/status.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "port/port.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+Status::Status(Code code, const Slice& msg, const Slice& msg2) {
+  assert(code != kOk);
+  state_ = new State(make_pair(code, std::string(msg.data(), msg.size())));
+  if (!msg2.empty()) {
+    state_->second.append(": ");
+    state_->second.append(msg2.data(), msg2.size());
+  }
+}
+
+std::string Status::ToString() const {
+  if (state_ == NULL) {
+    return "OK";
+  } else {
+    char tmp[30];
+    const char* type;
+    switch (state_->first) {
+      case kOk:
+        type = "OK";
+        break;
+      case kNotFound:
+        type = "NotFound";
+        break;
+      case kCorruption:
+        type = "Corruption: ";
+        break;
+      case kNotSupported:
+        type = "Not implemented: ";
+        break;
+      case kInvalidArgument:
+        type = "Invalid argument: ";
+        break;
+      case kIOError:
+        type = "IO error: ";
+        break;
+      default:
+        snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
+                 static_cast<int>(state_->first));
+        type = tmp;
+        break;
+    }
+    std::string result(type);
+    if (!state_->second.empty()) {
+      result.append(state_->second);
+    }
+    return result;
+  }
+}
+
+}
diff --git a/leveldb/util/testharness.cc b/leveldb/util/testharness.cc
new file mode 100644
index 0000000..b686ac3
--- /dev/null
+++ b/leveldb/util/testharness.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testharness.h"
+
+#include <string>
+#include <stdlib.h>
+
+namespace leveldb {
+namespace test {
+
+namespace {
+struct Test {
+  const char* base;
+  const char* name;
+  void (*func)();
+};
+std::vector<Test>* tests;
+}
+
+bool RegisterTest(const char* base, const char* name, void (*func)()) {
+  if (tests == NULL) {
+    tests = new std::vector<Test>;
+  }
+  Test t;
+  t.base = base;
+  t.name = name;
+  t.func = func;
+  tests->push_back(t);
+  return true;
+}
+
+int RunAllTests() {
+  int num = 0;
+  if (tests != NULL) {
+    for (int i = 0; i < tests->size(); i++) {
+      const Test& t = (*tests)[i];
+      fprintf(stderr, "==== Test %s.%s\n", t.base, t.name);
+      (*t.func)();
+      ++num;
+    }
+  }
+  fprintf(stderr, "==== PASSED %d tests\n", num);
+  return 0;
+}
+
+std::string TmpDir() {
+  std::string dir;
+  Status s = Env::Default()->GetTestDirectory(&dir);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  return dir;
+}
+
+int RandomSeed() {
+  const char* env = getenv("TEST_RANDOM_SEED");
+  int result = (env != NULL ? atoi(env) : 301);
+  if (result <= 0) {
+    result = 301;
+  }
+  return result;
+}
+
+}
+}
diff --git a/leveldb/util/testharness.h b/leveldb/util/testharness.h
new file mode 100644
index 0000000..13ab914
--- /dev/null
+++ b/leveldb/util/testharness.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
+#define STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sstream>
+#include "leveldb/env.h"
+#include "leveldb/slice.h"
+#include "util/random.h"
+
+namespace leveldb {
+namespace test {
+
+// Run all tests registered by the TEST() macro.
+// Returns 0 if all tests pass.
+// Dies or returns a non-zero value if some test fails.
+extern int RunAllTests();
+
+// Return the directory to use for temporary storage.
+extern std::string TmpDir();
+
+// Return a randomization seed for this run.  Typically returns the
+// same number on repeated invocations of this binary, but automated
+// runs may be able to vary the seed.
+extern int RandomSeed();
+
+// An instance of Tester is allocated to hold temporary state during
+// the execution of an assertion.
+class Tester {
+ private:
+  bool ok_;
+  const char* fname_;
+  int line_;
+  std::stringstream ss_;
+
+ public:
+  Tester(const char* f, int l)
+      : ok_(true), fname_(f), line_(l) {
+  }
+
+  ~Tester() {
+    if (!ok_) {
+      fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str());
+      exit(1);
+    }
+  }
+
+  Tester& Is(bool b, const char* msg) {
+    if (!b) {
+      ss_ << " Assertion failure " << msg;
+      ok_ = false;
+    }
+    return *this;
+  }
+
+  Tester& IsOk(const Status& s) {
+    if (!s.ok()) {
+      ss_ << " " << s.ToString();
+      ok_ = false;
+    }
+    return *this;
+  }
+
+#define BINARY_OP(name,op)                           \
+  template <class X, class Y>                        \
+  Tester& name(const X& x, const Y& y) {             \
+    if (! (x op y)) {                                \
+      ss_ << " failed: " << x << (" " #op " ") << y; \
+      ok_ = false;                                   \
+    }                                                \
+    return *this;                                    \
+  }
+
+  BINARY_OP(IsEq, ==)
+  BINARY_OP(IsNe, !=)
+  BINARY_OP(IsGe, >=)
+  BINARY_OP(IsGt, >)
+  BINARY_OP(IsLe, <=)
+  BINARY_OP(IsLt, <)
+#undef BINARY_OP
+
+  // Attach the specified value to the error message if an error has occurred
+  template <class V>
+  Tester& operator<<(const V& value) {
+    if (!ok_) {
+      ss_ << " " << value;
+    }
+    return *this;
+  }
+};
+
+#define ASSERT_TRUE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is((c), #c)
+#define ASSERT_OK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsOk((s))
+#define ASSERT_EQ(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsEq((a),(b))
+#define ASSERT_NE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsNe((a),(b))
+#define ASSERT_GE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGe((a),(b))
+#define ASSERT_GT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGt((a),(b))
+#define ASSERT_LE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLe((a),(b))
+#define ASSERT_LT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLt((a),(b))
+
+#define TCONCAT(a,b) TCONCAT1(a,b)
+#define TCONCAT1(a,b) a##b
+
+#define TEST(base,name)                                                 \
+class TCONCAT(_Test_,name) : public base {                              \
+ public:                                                                \
+  void _Run();                                                          \
+  static void _RunIt() {                                                \
+    TCONCAT(_Test_,name) t;                                             \
+    t._Run();                                                           \
+  }                                                                     \
+};                                                                      \
+bool TCONCAT(_Test_ignored_,name) =                                     \
+  ::leveldb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \
+void TCONCAT(_Test_,name)::_Run()
+
+// Register the specified test.  Typically not used directly, but
+// invoked via the macro expansion of TEST.
+extern bool RegisterTest(const char* base, const char* name, void (*func)());
+
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
diff --git a/leveldb/util/testutil.cc b/leveldb/util/testutil.cc
new file mode 100644
index 0000000..8d6cf3c
--- /dev/null
+++ b/leveldb/util/testutil.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testutil.h"
+
+#include "util/random.h"
+
+namespace leveldb {
+namespace test {
+
+Slice RandomString(Random* rnd, int len, std::string* dst) {
+  dst->resize(len);
+  for (int i = 0; i < len; i++) {
+    (*dst)[i] = static_cast<char>(' ' + rnd->Uniform(95));   // ' ' .. '~'
+  }
+  return Slice(*dst);
+}
+
+std::string RandomKey(Random* rnd, int len) {
+  // Make sure to generate a wide variety of characters so we
+  // test the boundary conditions for short-key optimizations.
+  static const char kTestChars[] = {
+    '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff'
+  };
+  std::string result;
+  for (int i = 0; i < len; i++) {
+    result += kTestChars[rnd->Uniform(sizeof(kTestChars))];
+  }
+  return result;
+}
+
+
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst) {
+  int raw = static_cast<int>(len * compressed_fraction);
+  if (raw < 1) raw = 1;
+  std::string raw_data;
+  RandomString(rnd, raw, &raw_data);
+
+  // Duplicate the random data until we have filled "len" bytes
+  dst->clear();
+  while (dst->size() < len) {
+    dst->append(raw_data);
+  }
+  dst->resize(len);
+  return Slice(*dst);
+}
+
+}
+}
diff --git a/leveldb/util/testutil.h b/leveldb/util/testutil.h
new file mode 100644
index 0000000..a150c1a
--- /dev/null
+++ b/leveldb/util/testutil.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_TESTUTIL_H_
+#define STORAGE_LEVELDB_UTIL_TESTUTIL_H_
+
+#include "leveldb/env.h"
+#include "leveldb/slice.h"
+#include "util/random.h"
+
+namespace leveldb {
+namespace test {
+
+// Store in *dst a random string of length "len" and return a Slice that
+// references the generated data.
+extern Slice RandomString(Random* rnd, int len, std::string* dst);
+
+// Return a random key with the specified length that may contain interesting
+// characters (e.g. \x00, \xff, etc.).
+extern std::string RandomKey(Random* rnd, int len);
+
+// Store in *dst a string of length "len" that will compress to
+// "N*compressed_fraction" bytes and return a Slice that references
+// the generated data.
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst);
+
+// A wrapper that allows injection of errors.
+class ErrorEnv : public EnvWrapper {
+ public:
+  bool writable_file_error_;
+  int num_writable_file_errors_;
+
+  ErrorEnv() : EnvWrapper(Env::Default()),
+               writable_file_error_(false),
+               num_writable_file_errors_(0) { }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result) {
+    if (writable_file_error_) {
+      ++num_writable_file_errors_;
+      *result = NULL;
+      return Status::IOError(fname, "fake error");
+    }
+    return target()->NewWritableFile(fname, result);
+  }
+};
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_TESTUTIL_H_
diff --git a/port/README b/port/README
deleted file mode 100644
index 422563e..0000000
--- a/port/README
+++ /dev/null
@@ -1,10 +0,0 @@
-This directory contains interfaces and implementations that isolate the
-rest of the package from platform details.
-
-Code in the rest of the package includes "port.h" from this directory.
-"port.h" in turn includes a platform specific "port_<platform>.h" file
-that provides the platform specific implementation.
-
-See port_posix.h for an example of what must be provided in a platform
-specific header file.
-
diff --git a/port/port.h b/port/port.h
deleted file mode 100644
index 816826b..0000000
--- a/port/port.h
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_PORT_PORT_H_
-#define STORAGE_LEVELDB_PORT_PORT_H_
-
-#include <string.h>
-
-// Include the appropriate platform specific file below.  If you are
-// porting to a new platform, see "port_example.h" for documentation
-// of what the new port_<platform>.h file must provide.
-#if defined(LEVELDB_PLATFORM_POSIX)
-#  include "port/port_posix.h"
-#elif defined(LEVELDB_PLATFORM_CHROMIUM)
-#  include "port/port_chromium.h"
-#elif defined(LEVELDB_PLATFORM_ANDROID)
-#  include "port/port_android.h"
-#endif
-
-#endif  // STORAGE_LEVELDB_PORT_PORT_H_
diff --git a/port/port_android.cc b/port/port_android.cc
deleted file mode 100644
index 240e9ca..0000000
--- a/port/port_android.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
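[Editor's note] ErrorEnv in the testutil.h hunk above lets a test flip writable_file_error_ so that every NewWritableFile() call fails and is counted. A hedged usage sketch follows; Options and DB::Open come from the public headers added earlier in this patch, while the function name and database path here are made up.

    #include <assert.h>
    #include "leveldb/db.h"
    #include "util/testutil.h"

    void FailAllWrites() {
      leveldb::test::ErrorEnv env;
      leveldb::Options options;
      options.env = &env;                  // route all file I/O through ErrorEnv
      options.create_if_missing = true;

      env.writable_file_error_ = true;     // every NewWritableFile now fails
      leveldb::DB* db = NULL;
      leveldb::Status s = leveldb::DB::Open(options, "/tmp/errenv-demo", &db);
      // Expect an IOError and at least one counted injected failure.
      assert(!s.ok());
      assert(env.num_writable_file_errors_ >= 1);
    }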
- -#include "port/port_android.h" - -#include - -extern "C" { -size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { - return fread(a, b, c, d); -} - -size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { - return fwrite(a, b, c, d); -} - -int fflush_unlocked(FILE *f) { - return fflush(f); -} - -int fdatasync(int fd) { - return fsync(fd); -} -} - -namespace leveldb { -namespace port { - -static void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - abort(); - } -} - -Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } -Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } -void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } -void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } - -CondVar::CondVar(Mutex* mu) - : mu_(mu) { - PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); -} - -CondVar::~CondVar() { - PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); -} - -void CondVar::Wait() { - PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); -} - -void CondVar::Signal(){ - PthreadCall("signal", pthread_cond_signal(&cv_)); -} - -void CondVar::SignalAll() { - PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); -} - -} -} diff --git a/port/port_android.h b/port/port_android.h deleted file mode 100644 index 8680951..0000000 --- a/port/port_android.h +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ -#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ - -#include -#include -#include -#include -#include -#include -#include - -// Collapse the plethora of ARM flavors available to an easier to manage set -// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto -#if defined(__ARM_ARCH_6__) || \ - defined(__ARM_ARCH_6J__) || \ - defined(__ARM_ARCH_6K__) || \ - defined(__ARM_ARCH_6Z__) || \ - defined(__ARM_ARCH_6T2__) || \ - defined(__ARM_ARCH_6ZK__) || \ - defined(__ARM_ARCH_7__) || \ - defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7A__) -#define ARMV6_OR_7 1 -#endif - -extern "C" { - size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); - size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); - int fflush_unlocked(FILE *f); - int fdatasync (int fd); -} - -namespace leveldb { -namespace port { - -static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN; - -class CondVar; - -class Mutex { - public: - Mutex(); - ~Mutex(); - - void Lock(); - void Unlock(); - void AssertHeld() { - //TODO(gabor): How can I implement this? 
- } - - private: - friend class CondVar; - pthread_mutex_t mu_; - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - private: - Mutex* mu_; - pthread_cond_t cv_; -}; - -#ifndef ARMV6_OR_7 -// On ARM chipsets = V6 -#ifdef ARMV6_OR_7 - __asm__ __volatile__("dmb" : : : "memory"); -#else - pLinuxKernelMemoryBarrier(); -#endif - } - - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - void* r = rep_; - MemoryBarrier(); - return r; - } - inline void Release_Store(void* v) { - MemoryBarrier(); - rep_ = v; - } - inline void* NoBarrier_Load() const { - void* r = rep_; - return r; - } - inline void NoBarrier_Store(void* v) { - rep_ = v; - } -}; - -// TODO(gabor): Implement compress -inline bool Snappy_Compress( - const char* input, - size_t input_length, - std::string* output) { - return false; -} - -// TODO(gabor): Implement uncompress -inline bool Snappy_Uncompress( - const char* input_data, - size_t input_length, - std::string* output) { - return false; -} - -inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { - SHA1_CTX sha1_ctx; - SHA1Init(&sha1_ctx); - SHA1Update(&sha1_ctx, (const u_char*)data, len); - SHA1Final((u_char*)hash_array, &sha1_ctx); -} - -inline uint64_t ThreadIdentifier() { - pthread_t tid = pthread_self(); - uint64_t r = 0; - memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? sizeof(r) : sizeof(tid)); - return r; -} - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ diff --git a/port/port_chromium.cc b/port/port_chromium.cc deleted file mode 100644 index 2ab49b9..0000000 --- a/port/port_chromium.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
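[Editor's note] The AtomicPointer deleted above (and its POSIX/Chromium siblings below) exists to support the classic publish pattern: a writer fully initializes an object and then stores the pointer with Release_Store; a reader that observes the pointer via Acquire_Load is guaranteed to also observe the initialized fields. A minimal sketch, assuming the port headers are on the include path; Node, slot and the function names are invented.

    #include "port/port.h"

    struct Node { int payload; };
    leveldb::port::AtomicPointer slot(NULL);

    void Producer() {
      Node* n = new Node;
      n->payload = 42;          // happens-before the release store below
      slot.Release_Store(n);    // publish the fully built object
    }

    void Consumer() {
      Node* n = reinterpret_cast<Node*>(slot.Acquire_Load());
      if (n != NULL) {
        int v = n->payload;     // guaranteed to see 42, never garbage
        (void)v;
      }
    }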
- -#include "port/port_chromium.h" - -#include "util/logging.h" - -#if defined(USE_SNAPPY) -# include "third_party/snappy/src/snappy.h" -#endif - -namespace leveldb { -namespace port { - -Mutex::Mutex() { -} - -Mutex::~Mutex() { -} - -void Mutex::Lock() { - mu_.Acquire(); -} - -void Mutex::Unlock() { - mu_.Release(); -} - -void Mutex::AssertHeld() { - mu_.AssertAcquired(); -} - -CondVar::CondVar(Mutex* mu) - : cv_(&mu->mu_) { -} - -CondVar::~CondVar() { } - -void CondVar::Wait() { - cv_.Wait(); -} - -void CondVar::Signal(){ - cv_.Signal(); -} - -void CondVar::SignalAll() { - cv_.Broadcast(); -} - -bool Snappy_Compress(const char* input, size_t input_length, - std::string* output) { -#if defined(USE_SNAPPY) - output->resize(snappy::MaxCompressedLength(input_length)); - size_t outlen; - snappy::RawCompress(input, input_length, &(*output)[0], &outlen); - output->resize(outlen); - return true; -#else - return false; -#endif -} - -bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output) { -#if defined(USE_SNAPPY) - size_t ulength; - if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) { - return false; - } - output->resize(ulength); - return snappy::RawUncompress(input_data, input_length, &(*output)[0]); -#else - return false; -#endif -} - -} -} diff --git a/port/port_chromium.h b/port/port_chromium.h deleted file mode 100644 index e349f8f..0000000 --- a/port/port_chromium.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ -#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ - -#include -#include -#include -#include "base/atomicops.h" -#include "base/basictypes.h" -#include "base/logging.h" -#include "base/sha1.h" -#include "base/synchronization/condition_variable.h" -#include "base/synchronization/lock.h" - -// Linux's ThreadIdentifier() needs this. -#if defined(OS_LINUX) -# include -#endif - -#if defined(OS_WIN) -#define snprintf _snprintf -#define va_copy(a, b) do { (a) = (b); } while (0) -#endif - -namespace leveldb { -namespace port { - -// Chromium only supports little endian. 
-static const bool kLittleEndian = true; - -class Mutex { - public: - Mutex(); - ~Mutex(); - void Lock(); - void Unlock(); - void AssertHeld(); - - private: - base::Lock mu_; - - friend class CondVar; - DISALLOW_COPY_AND_ASSIGN(Mutex); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - - private: - base::ConditionVariable cv_; - - DISALLOW_COPY_AND_ASSIGN(CondVar); -}; - -class AtomicPointer { - private: - typedef base::subtle::AtomicWord Rep; - Rep rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* p) : rep_(reinterpret_cast(p)) {} - inline void* Acquire_Load() const { - return reinterpret_cast(::base::subtle::Acquire_Load(&rep_)); - } - inline void Release_Store(void* v) { - ::base::subtle::Release_Store(&rep_, reinterpret_cast(v)); - } - inline void* NoBarrier_Load() const { - return reinterpret_cast(::base::subtle::NoBarrier_Load(&rep_)); - } - inline void NoBarrier_Store(void* v) { - ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast(v)); - } -}; - -inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { - return ::base::SHA1HashBytes(reinterpret_cast(data), - len, - reinterpret_cast(hash_array)); -} - -bool Snappy_Compress(const char* input, size_t input_length, - std::string* output); -bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output); - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ diff --git a/port/port_example.h b/port/port_example.h deleted file mode 100644 index cf72617..0000000 --- a/port/port_example.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// This file contains the specification, but not the implementations, -// of the types/operations/etc. that should be defined by a platform -// specific port_.h file. Use this file as a reference for -// how to port this package to a new platform. - -#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ -#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ - -namespace leveldb { -namespace port { - -// TODO(jorlow): Many of these belong more in the environment class rather than -// here. We should try moving them and see if it affects perf. - -// The following boolean constant must be true on a little-endian machine -// and false otherwise. -static const bool kLittleEndian = true /* or some other expression */; - -// ------------------ Threading ------------------- - -// A Mutex represents an exclusive lock. -class Mutex { - public: - Mutex(); - ~Mutex(); - - // Lock the mutex. Waits until other lockers have exited. - // Will deadlock if the mutex is already locked by this thread. - void Lock(); - - // Unlock the mutex. - // REQUIRES: This mutex was locked by this thread. - void Unlock(); - - // Optionally crash if this thread does not hold this mutex. - // The implementation must be fast, especially if NDEBUG is - // defined. The implementation is allowed to skip all checks. - void AssertHeld(); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - - // Atomically release *mu and block on this condition variable until - // either a call to SignalAll(), or a call to Signal() that picks - // this thread to wakeup. 
- // REQUIRES: this thread holds *mu - void Wait(); - - // If there are some threads waiting, wake up at least one of them. - void Signal(); - - // Wake up all waiting threads. - void SignallAll(); -}; - -// A type that holds a pointer that can be read or written atomically -// (i.e., without word-tearing.) -class AtomicPointer { - private: - intptr_t rep_; - public: - // Initialize to arbitrary value - AtomicPointer(); - - // Initialize to hold v - explicit AtomicPointer(void* v) : rep_(v) { } - - // Read and return the stored pointer with the guarantee that no - // later memory access (read or write) by this thread can be - // reordered ahead of this read. - void* Acquire_Load() const; - - // Set v as the stored pointer with the guarantee that no earlier - // memory access (read or write) by this thread can be reordered - // after this store. - void Release_Store(void* v); - - // Read the stored pointer with no ordering guarantees. - void* NoBarrier_Load() const; - - // Set va as the stored pointer with no ordering guarantees. - void NoBarrier_Store(void* v); -}; - -// ------------------ Checksumming ------------------- - -// Store a 160-bit hash of "data[0..len-1]" in "hash_array[0]..hash_array[19]" -extern void SHA1_Hash(const char* data, size_t len, char* hash_array); - -// ------------------ Compression ------------------- - -// Store the snappy compression of "input[0,input_length-1]" in *output. -// Returns false if snappy is not supported by this port. -extern bool Snappy_Compress(const char* input, size_t input_length, - std::string* output); - -// Attempt to snappy uncompress input[0,input_length-1] into *output. -// Returns true if successful, false if the input is invalid lightweight -// compressed data. -extern bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output); - -// ------------------ Miscellaneous ------------------- - -// If heap profiling is not supported, returns false. -// Else repeatedly calls (*func)(arg, data, n) and then returns true. -// The concatenation of all "data[0,n-1]" fragments is the heap profile. -extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff --git a/port/port_posix.cc b/port/port_posix.cc deleted file mode 100644 index e75da8b..0000000 --- a/port/port_posix.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
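[Editor's note] port_example.h above specifies the compression hooks as fallible: Snappy_Compress returns false on ports without snappy, so callers must handle both outcomes. A hedged round-trip sketch under that contract (the function name and payload are invented):

    #include <assert.h>
    #include <string>
    #include "port/port.h"

    void SnappyRoundTrip() {
      std::string raw(1000, 'x');   // toy, highly compressible payload
      std::string compressed;
      if (!leveldb::port::Snappy_Compress(raw.data(), raw.size(), &compressed)) {
        // This port has no snappy; callers store the block uncompressed.
        return;
      }
      std::string out;
      bool ok = leveldb::port::Snappy_Uncompress(compressed.data(),
                                                 compressed.size(), &out);
      assert(ok && out == raw);     // decompression must invert compression
    }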
- -#include "port/port_posix.h" - -#include -#include -#include -#include "util/logging.h" - -namespace leveldb { -namespace port { - -static void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - abort(); - } -} - -Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } - -Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } - -void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } - -void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } - -CondVar::CondVar(Mutex* mu) - : mu_(mu) { - PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); -} - -CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } - -void CondVar::Wait() { - PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); -} - -void CondVar::Signal() { - PthreadCall("signal", pthread_cond_signal(&cv_)); -} - -void CondVar::SignalAll() { - PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); -} - -} -} diff --git a/port/port_posix.h b/port/port_posix.h deleted file mode 100644 index 7adbc01..0000000 --- a/port/port_posix.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ -#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ - -#include -#include -#include -#include -#include -#include -#include "port/sha1_portable.h" - -namespace leveldb { -namespace port { - -static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN); - -class CondVar; - -class Mutex { - public: - Mutex(); - ~Mutex(); - - void Lock(); - void Unlock(); - void AssertHeld() { } - - private: - friend class CondVar; - pthread_mutex_t mu_; - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - private: - pthread_cond_t cv_; - Mutex* mu_; -}; - -// Storage for a lock-free pointer -class AtomicPointer { - private: - std::atomic rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - return rep_.load(std::memory_order_acquire); - } - inline void Release_Store(void* v) { - rep_.store(v, std::memory_order_release); - } - inline void* NoBarrier_Load() const { - return rep_.load(std::memory_order_relaxed); - } - inline void NoBarrier_Store(void* v) { - rep_.store(v, std::memory_order_relaxed); - } -}; - -inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { - SHA1_Hash_Portable(data, len, hash_array); -} - -// TODO(gabor): Implement actual compress -inline bool Snappy_Compress(const char* input, size_t input_length, - std::string* output) { - return false; -} - -// TODO(gabor): Implement actual uncompress -inline bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output) { - return false; -} - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ diff --git a/port/sha1_portable.cc b/port/sha1_portable.cc deleted file mode 100644 index 8fa7277..0000000 --- a/port/sha1_portable.cc +++ /dev/null @@ -1,298 
+0,0 @@ -// Portions copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// This module provides a slow but portable implementation of -// the SHA1 hash function. -// -// It is adapted from free code written by Paul E. Jones -// . See http://www.packetizer.com/security/sha1/ -// -// The license for the original code is: -/* - Copyright (C) 1998, 2009 - Paul E. Jones - - Freeware Public License (FPL) - - This software is licensed as "freeware." Permission to distribute - this software in source and binary forms, including incorporation - into other products, is hereby granted without a fee. THIS SOFTWARE - IS PROVIDED 'AS IS' AND WITHOUT ANY EXPRESSED OR IMPLIED WARRANTIES, - INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY - AND FITNESS FOR A PARTICULAR PURPOSE. THE AUTHOR SHALL NOT BE HELD - LIABLE FOR ANY DAMAGES RESULTING FROM THE USE OF THIS SOFTWARE, EITHER - DIRECTLY OR INDIRECTLY, INCLUDING, BUT NOT LIMITED TO, LOSS OF DATA - OR DATA BEING RENDERED INACCURATE. -*/ - -#include "port/sha1_portable.h" -#include -#include -#include - -namespace leveldb { -namespace port { - -/* - * Description: - * This class implements the Secure Hashing Standard as defined - * in FIPS PUB 180-1 published April 17, 1995. - */ - -/* - * This structure will hold context information for the hashing - * operation - */ -typedef struct SHA1Context { - unsigned Message_Digest[5]; /* Message Digest (output) */ - - unsigned Length_Low; /* Message length in bits */ - unsigned Length_High; /* Message length in bits */ - - unsigned char Message_Block[64]; /* 512-bit message blocks */ - int Message_Block_Index; /* Index into message block array */ - - bool Computed; /* Is the digest computed? */ - bool Corrupted; /* Is the message digest corruped? */ -} SHA1Context; - -/* - * Portability Issues: - * SHA-1 is defined in terms of 32-bit "words". This code was - * written with the expectation that the processor has at least - * a 32-bit machine word size. If the machine word size is larger, - * the code should still function properly. One caveat to that - * is that the input functions taking characters and character - * arrays assume that only 8 bits of information are stored in each - * character. - */ - -/* - * Define the circular shift macro - */ -#define SHA1CircularShift(bits,word) \ - ((((word) << (bits)) & 0xFFFFFFFF) | \ - ((word) >> (32-(bits)))) - -/* Function prototypes */ -static void SHA1ProcessMessageBlock(SHA1Context *); -static void SHA1PadMessage(SHA1Context *); - -// Initialize the SHA1Context in preparation for computing a new -// message digest. 
-static void SHA1Reset(SHA1Context* context) { - context->Length_Low = 0; - context->Length_High = 0; - context->Message_Block_Index = 0; - - context->Message_Digest[0] = 0x67452301; - context->Message_Digest[1] = 0xEFCDAB89; - context->Message_Digest[2] = 0x98BADCFE; - context->Message_Digest[3] = 0x10325476; - context->Message_Digest[4] = 0xC3D2E1F0; - - context->Computed = false; - context->Corrupted = false; -} - -// This function will return the 160-bit message digest into the -// Message_Digest array within the SHA1Context provided -static bool SHA1Result(SHA1Context *context) { - if (context->Corrupted) { - return false; - } - - if (!context->Computed) { - SHA1PadMessage(context); - context->Computed = true; - } - return true; -} - -// This function accepts an array of bytes as the next portion of -// the message. -static void SHA1Input(SHA1Context *context, - const unsigned char *message_array, - unsigned length) { - if (!length) return; - - if (context->Computed || context->Corrupted) { - context->Corrupted = true; - return; - } - - while(length-- && !context->Corrupted) { - context->Message_Block[context->Message_Block_Index++] = - (*message_array & 0xFF); - - context->Length_Low += 8; - /* Force it to 32 bits */ - context->Length_Low &= 0xFFFFFFFF; - if (context->Length_Low == 0) { - context->Length_High++; - /* Force it to 32 bits */ - context->Length_High &= 0xFFFFFFFF; - if (context->Length_High == 0) - { - /* Message is too long */ - context->Corrupted = true; - } - } - - if (context->Message_Block_Index == 64) - { - SHA1ProcessMessageBlock(context); - } - - message_array++; - } -} - -// This function will process the next 512 bits of the message stored -// in the Message_Block array. -static void SHA1ProcessMessageBlock(SHA1Context *context) { - const unsigned K[] = // Constants defined in SHA-1 - { - 0x5A827999, - 0x6ED9EBA1, - 0x8F1BBCDC, - 0xCA62C1D6 - }; - int t; // Loop counter - unsigned temp; // Temporary word value - unsigned W[80]; // Word sequence - unsigned A, B, C, D, E; // Word buffers - - // Initialize the first 16 words in the array W - for(t = 0; t < 16; t++) { - W[t] = ((unsigned) context->Message_Block[t * 4]) << 24; - W[t] |= ((unsigned) context->Message_Block[t * 4 + 1]) << 16; - W[t] |= ((unsigned) context->Message_Block[t * 4 + 2]) << 8; - W[t] |= ((unsigned) context->Message_Block[t * 4 + 3]); - } - - for(t = 16; t < 80; t++) { - W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]); - } - - A = context->Message_Digest[0]; - B = context->Message_Digest[1]; - C = context->Message_Digest[2]; - D = context->Message_Digest[3]; - E = context->Message_Digest[4]; - - for(t = 0; t < 20; t++) { - temp = SHA1CircularShift(5,A) + - ((B & C) | ((~B) & D)) + E + W[t] + K[0]; - temp &= 0xFFFFFFFF; - E = D; - D = C; - C = SHA1CircularShift(30,B); - B = A; - A = temp; - } - - for(t = 20; t < 40; t++) { - temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1]; - temp &= 0xFFFFFFFF; - E = D; - D = C; - C = SHA1CircularShift(30,B); - B = A; - A = temp; - } - - for(t = 40; t < 60; t++) { - temp = SHA1CircularShift(5,A) + - ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2]; - temp &= 0xFFFFFFFF; - E = D; - D = C; - C = SHA1CircularShift(30,B); - B = A; - A = temp; - } - - for(t = 60; t < 80; t++) { - temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3]; - temp &= 0xFFFFFFFF; - E = D; - D = C; - C = SHA1CircularShift(30,B); - B = A; - A = temp; - } - - context->Message_Digest[0] = (context->Message_Digest[0] + A) & 0xFFFFFFFF; - 
context->Message_Digest[1] = (context->Message_Digest[1] + B) & 0xFFFFFFFF; - context->Message_Digest[2] = (context->Message_Digest[2] + C) & 0xFFFFFFFF; - context->Message_Digest[3] = (context->Message_Digest[3] + D) & 0xFFFFFFFF; - context->Message_Digest[4] = (context->Message_Digest[4] + E) & 0xFFFFFFFF; - - context->Message_Block_Index = 0; -} - -// According to the standard, the message must be padded to an even -// 512 bits. The first padding bit must be a '1'. The last 64 bits -// represent the length of the original message. All bits in between -// should be 0. This function will pad the message according to those -// rules by filling the Message_Block array accordingly. It will also -// call SHA1ProcessMessageBlock() appropriately. When it returns, it -// can be assumed that the message digest has been computed. -static void SHA1PadMessage(SHA1Context *context) { - // Check to see if the current message block is too small to hold - // the initial padding bits and length. If so, we will pad the - // block, process it, and then continue padding into a second block. - if (context->Message_Block_Index > 55) { - context->Message_Block[context->Message_Block_Index++] = 0x80; - while(context->Message_Block_Index < 64) { - context->Message_Block[context->Message_Block_Index++] = 0; - } - - SHA1ProcessMessageBlock(context); - - while(context->Message_Block_Index < 56) { - context->Message_Block[context->Message_Block_Index++] = 0; - } - } else { - context->Message_Block[context->Message_Block_Index++] = 0x80; - while(context->Message_Block_Index < 56) { - context->Message_Block[context->Message_Block_Index++] = 0; - } - } - - // Store the message length as the last 8 octets - context->Message_Block[56] = (context->Length_High >> 24) & 0xFF; - context->Message_Block[57] = (context->Length_High >> 16) & 0xFF; - context->Message_Block[58] = (context->Length_High >> 8) & 0xFF; - context->Message_Block[59] = (context->Length_High) & 0xFF; - context->Message_Block[60] = (context->Length_Low >> 24) & 0xFF; - context->Message_Block[61] = (context->Length_Low >> 16) & 0xFF; - context->Message_Block[62] = (context->Length_Low >> 8) & 0xFF; - context->Message_Block[63] = (context->Length_Low) & 0xFF; - - SHA1ProcessMessageBlock(context); -} - - -void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array) { - SHA1Context context; - SHA1Reset(&context); - SHA1Input(&context, reinterpret_cast(data), len); - bool ok = SHA1Result(&context); - if (!ok) { - fprintf(stderr, "Unexpected error in SHA1_Hash_Portable code\n"); - exit(1); - } - for (int i = 0; i < 5; i++) { - uint32_t value = context.Message_Digest[i]; - hash_array[i*4 + 0] = (value >> 24) & 0xff; - hash_array[i*4 + 1] = (value >> 16) & 0xff; - hash_array[i*4 + 2] = (value >> 8) & 0xff; - hash_array[i*4 + 3] = value & 0xff; - } -} - -} -} diff --git a/port/sha1_portable.h b/port/sha1_portable.h deleted file mode 100644 index 31db305..0000000 --- a/port/sha1_portable.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_ -#define STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_ - -#include - -namespace leveldb { -namespace port { - -// Compute the SHA1 hash value of "data[0..len-1]" and store it in -// "hash_array[0..19]". hash_array must have 20 bytes of space available. 
-//
-// This function is portable but may not be as fast as a version
-// optimized for your platform.  It is provided as a default method
-// that can be used when porting leveldb to a new platform if no
-// better SHA1 hash implementation is available.
-void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array);
-
-}
-}
-
-#endif  // STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
diff --git a/port/sha1_test.cc b/port/sha1_test.cc
deleted file mode 100644
index b182e67..0000000
--- a/port/sha1_test.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "port/port.h"
-#include "util/testharness.h"
-
-namespace leveldb {
-namespace port {
-
-class SHA1 { };
-
-static std::string TestSHA1(const char* data, size_t len) {
-  char hash_val[20];
-  SHA1_Hash(data, len, hash_val);
-  char buf[41];
-  for (int i = 0; i < 20; i++) {
-    snprintf(buf + i * 2, 41 - i * 2,
-             "%02x",
-             static_cast<unsigned int>(static_cast<unsigned char>(
-                 hash_val[i])));
-  }
-  return std::string(buf, 40);
-}
-
-TEST(SHA1, Simple) {
-  ASSERT_EQ("da39a3ee5e6b4b0d3255bfef95601890afd80709", TestSHA1("", 0));
-  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", TestSHA1("hello", 5));
-  std::string x(10000, 'x');
-  ASSERT_EQ("f8c5cde791c5056cf515881e701c8a9ecb439a75",
-            TestSHA1(x.data(), x.size()));
-}
-
-}
-}
-
-int main(int argc, char** argv) {
-  return leveldb::test::RunAllTests();
-}
diff --git a/port/win/stdint.h b/port/win/stdint.h
deleted file mode 100644
index 39edd0d..0000000
--- a/port/win/stdint.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-// MSVC didn't ship with this file until the 2010 version.
-
-#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_
-#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_
-
-#if !defined(_MSC_VER)
-#error This file should only be included when compiling with MSVC.
-#endif
-
-// Define C99 equivalent types.
-typedef signed char           int8_t;
-typedef signed short          int16_t;
-typedef signed int            int32_t;
-typedef signed long long      int64_t;
-typedef unsigned char         uint8_t;
-typedef unsigned short        uint16_t;
-typedef unsigned int          uint32_t;
-typedef unsigned long long    uint64_t;
-
-#endif  // STORAGE_LEVELDB_PORT_WIN_STDINT_H_
diff --git a/table/block.cc b/table/block.cc
deleted file mode 100644
index 0525d2d..0000000
--- a/table/block.cc
+++ /dev/null
@@ -1,261 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// Decodes the blocks generated by block_builder.cc.
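[Editor's note] The Block constructor in the hunk below locates the restart array from the block trailer: the last 4 bytes hold num_restarts, and the restart array of 4-byte offsets sits immediately before it. A worked instance with assumed numbers:

    #include <stddef.h>
    #include <stdint.h>

    uint32_t RestartOffsetExample() {
      // A 100-byte block whose last 4 bytes say num_restarts = 3 (assumed).
      size_t size = 100;
      uint32_t num_restarts = 3;
      // Same computation as Block's constructor: the trailer is the restart
      // array plus its count, 4 bytes per entry.
      return size - (1 + num_restarts) * sizeof(uint32_t);   // 100 - 16 = 84
      // Entries occupy bytes [0, 84); the trailer occupies bytes [84, 100).
    }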
- -#include "table/block.h" - -#include -#include -#include "leveldb/comparator.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -inline uint32_t Block::NumRestarts() const { - assert(size_ >= 2*sizeof(uint32_t)); - return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); -} - -Block::Block(const char* data, size_t size) - : data_(data), - size_(size) { - if (size_ < sizeof(uint32_t)) { - size_ = 0; // Error marker - } else { - restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); - if (restart_offset_ > size_ - sizeof(uint32_t)) { - // The size is too small for NumRestarts() and therefore - // restart_offset_ wrapped around. - size_ = 0; - } - } -} - -Block::~Block() { - delete[] data_; -} - -// Helper routine: decode the next block entry starting at "p", -// storing the number of shared key bytes, non_shared key bytes, -// and the length of the value in "*shared", "*non_shared", and -// "*value_length", respectively. Will not derefence past "limit". -// -// If any errors are detected, returns NULL. Otherwise, returns a -// pointer to the key delta (just past the three decoded values). -static inline const char* DecodeEntry(const char* p, const char* limit, - uint32_t* shared, - uint32_t* non_shared, - uint32_t* value_length) { - if (limit - p < 3) return NULL; - *shared = reinterpret_cast(p)[0]; - *non_shared = reinterpret_cast(p)[1]; - *value_length = reinterpret_cast(p)[2]; - if ((*shared | *non_shared | *value_length) < 128) { - // Fast path: all three values are encoded in one byte each - p += 3; - } else { - if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL; - if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL; - if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL; - } - - if (limit - p < (*non_shared + *value_length)) return NULL; - return p; -} - -class Block::Iter : public Iterator { - private: - const Comparator* const comparator_; - const char* const data_; // underlying block contents - uint32_t const restarts_; // Offset of restart array (list of fixed32) - uint32_t const num_restarts_; // Number of uint32_t entries in restart array - - // current_ is offset in data_ of current entry. >= restarts_ if !Valid - uint32_t current_; - uint32_t restart_index_; // Index of restart block in which current_ falls - std::string key_; - Slice value_; - Status status_; - - inline int Compare(const Slice& a, const Slice& b) const { - return comparator_->Compare(a, b); - } - - // Return the offset in data_ just past the end of the current entry. 
- inline uint32_t NextEntryOffset() const { - return (value_.data() + value_.size()) - data_; - } - - uint32_t GetRestartPoint(uint32_t index) { - assert(index < num_restarts_); - return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); - } - - void SeekToRestartPoint(uint32_t index) { - key_.clear(); - restart_index_ = index; - // current_ will be fixed by ParseNextKey(); - - // ParseNextKey() starts at the end of value_, so set value_ accordingly - uint32_t offset = GetRestartPoint(index); - value_ = Slice(data_ + offset, 0); - } - - public: - Iter(const Comparator* comparator, - const char* data, - uint32_t restarts, - uint32_t num_restarts) - : comparator_(comparator), - data_(data), - restarts_(restarts), - num_restarts_(num_restarts), - current_(restarts_), - restart_index_(num_restarts_) { - assert(num_restarts_ > 0); - } - - virtual bool Valid() const { return current_ < restarts_; } - virtual Status status() const { return status_; } - virtual Slice key() const { - assert(Valid()); - return key_; - } - virtual Slice value() const { - assert(Valid()); - return value_; - } - - virtual void Next() { - assert(Valid()); - ParseNextKey(); - } - - virtual void Prev() { - assert(Valid()); - - // Scan backwards to a restart point before current_ - const uint32_t original = current_; - while (GetRestartPoint(restart_index_) >= original) { - if (restart_index_ == 0) { - // No more entries - current_ = restarts_; - restart_index_ = num_restarts_; - return; - } - restart_index_--; - } - - SeekToRestartPoint(restart_index_); - do { - // Loop until end of current entry hits the start of original entry - } while (ParseNextKey() && NextEntryOffset() < original); - } - - virtual void Seek(const Slice& target) { - // Binary search in restart array to find the first restart point - // with a key >= target - uint32_t left = 0; - uint32_t right = num_restarts_ - 1; - while (left < right) { - uint32_t mid = (left + right + 1) / 2; - uint32_t region_offset = GetRestartPoint(mid); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, - data_ + restarts_, - &shared, &non_shared, &value_length); - if (key_ptr == NULL || (shared != 0)) { - CorruptionError(); - return; - } - Slice mid_key(key_ptr, non_shared); - if (Compare(mid_key, target) < 0) { - // Key at "mid" is smaller than "target". Therefore all - // blocks before "mid" are uninteresting. - left = mid; - } else { - // Key at "mid" is >= "target". Therefore all blocks at or - // after "mid" are uninteresting. - right = mid - 1; - } - } - - // Linear search (within restart block) for first key >= target - SeekToRestartPoint(left); - while (true) { - if (!ParseNextKey()) { - return; - } - if (Compare(key_, target) >= 0) { - return; - } - } - } - - virtual void SeekToFirst() { - SeekToRestartPoint(0); - ParseNextKey(); - } - - virtual void SeekToLast() { - SeekToRestartPoint(num_restarts_ - 1); - while (ParseNextKey() && NextEntryOffset() < restarts_) { - // Keep skipping - } - } - - private: - void CorruptionError() { - current_ = restarts_; - restart_index_ = num_restarts_; - status_ = Status::Corruption("bad entry in block"); - key_.clear(); - value_.clear(); - } - - bool ParseNextKey() { - current_ = NextEntryOffset(); - const char* p = data_ + current_; - const char* limit = data_ + restarts_; // Restarts come right after data - if (p >= limit) { - // No more entries to return. Mark as invalid. 
- current_ = restarts_; - restart_index_ = num_restarts_; - return false; - } - - // Decode next entry - uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); - if (p == NULL || key_.size() < shared) { - CorruptionError(); - return false; - } else { - key_.resize(shared); - key_.append(p, non_shared); - value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; - } - return true; - } - } -}; - -Iterator* Block::NewIterator(const Comparator* cmp) { - if (size_ < 2*sizeof(uint32_t)) { - return NewErrorIterator(Status::Corruption("bad block contents")); - } - const uint32_t num_restarts = NumRestarts(); - if (num_restarts == 0) { - return NewEmptyIterator(); - } else { - return new Iter(cmp, data_, restart_offset_, num_restarts); - } -} - -} diff --git a/table/block.h b/table/block.h deleted file mode 100644 index cdf0598..0000000 --- a/table/block.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_ -#define STORAGE_LEVELDB_TABLE_BLOCK_H_ - -#include -#include -#include "leveldb/iterator.h" - -namespace leveldb { - -class Comparator; - -class Block { - public: - // Initialize the block with the specified contents. - // Takes ownership of data[] and will delete[] it when done. - Block(const char* data, size_t size); - - ~Block(); - - size_t size() const { return size_; } - Iterator* NewIterator(const Comparator* comparator); - - private: - uint32_t NumRestarts() const; - - const char* data_; - size_t size_; - uint32_t restart_offset_; // Offset in data_ of restart array - - // No copying allowed - Block(const Block&); - void operator=(const Block&); - - class Iter; -}; - -} - -#endif // STORAGE_LEVELDB_TABLE_BLOCK_H_ diff --git a/table/block_builder.cc b/table/block_builder.cc deleted file mode 100644 index ae18b36..0000000 --- a/table/block_builder.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// BlockBuilder generates blocks where keys are prefix-compressed: -// -// When we store a key, we drop the prefix shared with the previous -// string. This helps reduce the space requirement significantly. -// Furthermore, once every K keys, we do not apply the prefix -// compression and store the entire key. We call this a "restart -// point". The tail end of the block stores the offsets of all of the -// restart points, and can be used to do a binary search when looking -// for a particular key. Values are stored as-is (without compression) -// immediately following the corresponding key. -// -// An entry for a particular key-value pair has the form: -// shared_bytes: varint32 -// unshared_bytes: varint32 -// value_length: varint32 -// key_delta: char[unshared_bytes] -// value: char[value_length] -// shared_bytes == 0 for restart points. -// -// The trailer of the block has the form: -// restarts: uint32[num_restarts] -// num_restarts: uint32 -// restarts[i] contains the offset within the block of the ith restart point. 
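(A worked example of the entry encoding just described; the helper is illustrative only and assumes all three lengths fit in a single varint byte, i.e. are < 128.)

    #include <stdint.h>
    #include <string>

    // Append one prefix-compressed entry in the format above.
    static void AppendEntry(std::string* buf, uint32_t shared,
                            const std::string& key_delta,
                            const std::string& value) {
      buf->push_back(static_cast<char>(shared));            // shared_bytes
      buf->push_back(static_cast<char>(key_delta.size()));  // unshared_bytes
      buf->push_back(static_cast<char>(value.size()));      // value_length
      buf->append(key_delta);                               // key_delta
      buf->append(value);                                   // value
    }

    // Adding "block" then "blood": the second key shares the prefix "blo",
    // so only the 2-byte delta "od" is stored.
    //   AppendEntry(&buf, 0, "block", "v1");  // restart point: full key
    //   AppendEntry(&buf, 3, "od",    "v2");  // 3 bytes shared with previous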
-
-#include "table/block_builder.h"
-
-#include <algorithm>
-#include <assert.h>
-#include "leveldb/comparator.h"
-#include "leveldb/table_builder.h"
-#include "util/coding.h"
-
-namespace leveldb {
-
-BlockBuilder::BlockBuilder(const Options* options)
    : options_(options),
      restarts_(),
      counter_(0),
      finished_(false) {
-  assert(options->block_restart_interval >= 1);
-  restarts_.push_back(0);       // First restart point is at offset 0
-}
-
-void BlockBuilder::Reset() {
-  buffer_.clear();
-  restarts_.clear();
-  restarts_.push_back(0);       // First restart point is at offset 0
-  counter_ = 0;
-  finished_ = false;
-  last_key_.clear();
-}
-
-size_t BlockBuilder::CurrentSizeEstimate() const {
-  return (buffer_.size() +                       // Raw data buffer
-          restarts_.size() * sizeof(uint32_t) +  // Restart array
-          sizeof(uint32_t));                     // Restart array length
-}
-
-Slice BlockBuilder::Finish() {
-  // Append restart array
-  for (size_t i = 0; i < restarts_.size(); i++) {
-    PutFixed32(&buffer_, restarts_[i]);
-  }
-  PutFixed32(&buffer_, restarts_.size());
-  finished_ = true;
-  return Slice(buffer_);
-}
-
-void BlockBuilder::Add(const Slice& key, const Slice& value) {
-  Slice last_key_piece(last_key_);
-  assert(!finished_);
-  assert(counter_ <= options_->block_restart_interval);
-  assert(buffer_.empty() // No values yet?
-         || options_->comparator->Compare(key, last_key_piece) > 0);
-  size_t shared = 0;
-  if (counter_ < options_->block_restart_interval) {
-    // See how much sharing to do with previous string
-    const size_t min_length = std::min(last_key_piece.size(), key.size());
-    while ((shared < min_length) && (last_key_[shared] == key[shared])) {
-      shared++;
-    }
-  } else {
-    // Restart compression
-    restarts_.push_back(buffer_.size());
-    counter_ = 0;
-  }
-  const size_t non_shared = key.size() - shared;
-
-  // Add "<shared><non_shared><value_size>" to buffer_
-  PutVarint32(&buffer_, shared);
-  PutVarint32(&buffer_, non_shared);
-  PutVarint32(&buffer_, value.size());
-
-  // Add string delta to buffer_ followed by value
-  buffer_.append(key.data() + shared, non_shared);
-  buffer_.append(value.data(), value.size());
-
-  // Update state
-  last_key_.resize(shared);
-  last_key_.append(key.data() + shared, non_shared);
-  assert(Slice(last_key_) == key);
-  counter_++;
-}
-
-}
diff --git a/table/block_builder.h b/table/block_builder.h
deleted file mode 100644
index bf92a0f..0000000
--- a/table/block_builder.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
-#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
-
-#include <vector>
-
-#include <stdint.h>
-#include "leveldb/slice.h"
-
-namespace leveldb {
-
-struct Options;
-
-class BlockBuilder {
- public:
-  explicit BlockBuilder(const Options* options);
-
-  // Reset the contents as if the BlockBuilder was just constructed.
-  void Reset();
-
-  // REQUIRES: Finish() has not been called since the last call to Reset().
-  // REQUIRES: key is larger than any previously added key
-  void Add(const Slice& key, const Slice& value);
-
-  // Finish building the block and return a slice that refers to the
-  // block contents.  The returned slice will remain valid for the
-  // lifetime of this builder or until Reset() is called.
-  Slice Finish();
-
-  // Returns an estimate of the current (uncompressed) size of the block
-  // we are building.
- size_t CurrentSizeEstimate() const; - - // Return true iff no entries have been added since the last Reset() - bool empty() const { - return buffer_.empty(); - } - - private: - const Options* options_; - std::string buffer_; // Destination buffer - std::vector restarts_; // Restart points - int counter_; // Number of entries emitted since restart - bool finished_; // Has Finish() been called? - std::string last_key_; - - // No copying allowed - BlockBuilder(const BlockBuilder&); - void operator=(const BlockBuilder&); -}; - -} - -#endif // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ diff --git a/table/format.cc b/table/format.cc deleted file mode 100644 index 8c6b0f3..0000000 --- a/table/format.cc +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/format.h" - -#include "leveldb/env.h" -#include "port/port.h" -#include "table/block.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { - -void BlockHandle::EncodeTo(std::string* dst) const { - // Sanity check that all fields have been set - assert(offset_ != ~static_cast(0)); - assert(size_ != ~static_cast(0)); - PutVarint64(dst, offset_); - PutVarint64(dst, size_); -} - -Status BlockHandle::DecodeFrom(Slice* input) { - if (GetVarint64(input, &offset_) && - GetVarint64(input, &size_)) { - return Status::OK(); - } else { - return Status::Corruption("bad block handle"); - } -} - -void Footer::EncodeTo(std::string* dst) const { -#ifndef NDEBUG - const size_t original_size = dst->size(); -#endif - metaindex_handle_.EncodeTo(dst); - index_handle_.EncodeTo(dst); - dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding - PutFixed32(dst, static_cast(kTableMagicNumber)); - PutFixed32(dst, static_cast(kTableMagicNumber >> 32)); - assert(dst->size() == original_size + kEncodedLength); -} - -Status Footer::DecodeFrom(Slice* input) { - const char* magic_ptr = input->data() + kEncodedLength - 8; - const uint32_t magic_lo = DecodeFixed32(magic_ptr); - const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); - const uint64_t magic = ((static_cast(magic_hi) << 32) | - (static_cast(magic_lo))); - if (magic != kTableMagicNumber) { - return Status::InvalidArgument("not an sstable (bad magic number)"); - } - - Status result = metaindex_handle_.DecodeFrom(input); - if (result.ok()) { - result = index_handle_.DecodeFrom(input); - } - if (result.ok()) { - // We skip over any leftover data (just padding for now) in "input" - const char* end = magic_ptr + 8; - *input = Slice(end, input->data() + input->size() - end); - } - return result; -} - -Status ReadBlock(RandomAccessFile* file, - const ReadOptions& options, - const BlockHandle& handle, - Block** block) { - *block = NULL; - - // Read the block contents as well as the type/crc footer. - // See table_builder.cc for the code that built this structure. 
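    // The read below fetches the fixed layout written by TableBuilder::WriteBlock
    // (kBlockTrailerSize in format.h counts the last two fields):
    //     block_data: uint8[n]
    //     type:       uint8    (compression tag)
    //     crc:        uint32   (masked crc32c covering block_data and type)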
- size_t n = handle.size(); - char* buf = new char[n + kBlockTrailerSize]; - Slice contents; - Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); - if (!s.ok()) { - delete[] buf; - return s; - } - if (contents.size() != n + kBlockTrailerSize) { - delete[] buf; - return Status::Corruption("truncated block read"); - } - - // Check the crc of the type and the block contents - const char* data = contents.data(); // Pointer to where Read put the data - if (options.verify_checksums) { - const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1)); - const uint32_t actual = crc32c::Value(data, n + 1); - if (actual != crc) { - delete[] buf; - s = Status::Corruption("block checksum mismatch"); - return s; - } - } - - switch (data[n]) { - case kNoCompression: - if (data != buf) { - // File implementation gave us pointer to some other data. - // Copy into buf[]. - memcpy(buf, data, n + kBlockTrailerSize); - } - - // Ok - break; - case kSnappyCompression: { - std::string decompressed; - if (!port::Snappy_Uncompress(data, n, &decompressed)) { - delete[] buf; - s = Status::Corruption("corrupted compressed block contents"); - return s; - } - delete[] buf; // Done with uncompressed data - buf = new char[decompressed.size()]; - memcpy(buf, decompressed.data(), decompressed.size()); - n = decompressed.size(); - break; - } - default: - delete[] buf; - return Status::Corruption("bad block type"); - } - - *block = new Block(buf, n); // Block takes ownership of buf[] - return Status::OK(); -} - -} diff --git a/table/format.h b/table/format.h deleted file mode 100644 index a6ab964..0000000 --- a/table/format.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_ -#define STORAGE_LEVELDB_TABLE_FORMAT_H_ - -#include -#include -#include "leveldb/slice.h" -#include "leveldb/status.h" -#include "leveldb/table_builder.h" - -namespace leveldb { - -class Block; -class RandomAccessFile; -struct ReadOptions; - -// BlockHandle is a pointer to the extent of a file that stores a data -// block or a meta block. -class BlockHandle { - public: - BlockHandle(); - - // The offset of the block in the file. - uint64_t offset() const { return offset_; } - void set_offset(uint64_t offset) { offset_ = offset; } - - // The size of the stored block - uint64_t size() const { return size_; } - void set_size(uint64_t size) { size_ = size; } - - void EncodeTo(std::string* dst) const; - Status DecodeFrom(Slice* input); - - // Maximum encoding length of a BlockHandle - enum { kMaxEncodedLength = 10 + 10 }; - - private: - uint64_t offset_; - uint64_t size_; -}; - -// Footer encapsulates the fixed information stored at the tail -// end of every table file. -class Footer { - public: - Footer() { } - - // The block handle for the metaindex block of the table - const BlockHandle& metaindex_handle() const { return metaindex_handle_; } - void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } - - // The block handle for the index block of the table - const BlockHandle& index_handle() const { - return index_handle_; - } - void set_index_handle(const BlockHandle& h) { - index_handle_ = h; - } - - void EncodeTo(std::string* dst) const; - Status DecodeFrom(Slice* input); - - // Encoded length of a Footer. 
Note that the serialization of a - // Footer will always occupy exactly this many bytes. It consists - // of two block handles and a magic number. - enum { - kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8 - }; - - private: - BlockHandle metaindex_handle_; - BlockHandle index_handle_; -}; - -// kTableMagicNumber was picked by running -// echo http://code.google.com/p/leveldb/ | sha1sum -// and taking the leading 64 bits. -static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; - -// 1-byte type + 32-bit crc -static const size_t kBlockTrailerSize = 5; - -// Read the block identified by "handle" from "file". On success, -// store a pointer to the heap-allocated result in *block and return -// OK. On failure store NULL in *block and return non-OK. -extern Status ReadBlock(RandomAccessFile* file, - const ReadOptions& options, - const BlockHandle& handle, - Block** block); - -// Implementation details follow. Clients should ignore, - -inline BlockHandle::BlockHandle() - : offset_(~static_cast(0)), - size_(~static_cast(0)) { -} - -} - -#endif // STORAGE_LEVELDB_TABLE_FORMAT_H_ diff --git a/table/iterator.cc b/table/iterator.cc deleted file mode 100644 index 4ddd55f..0000000 --- a/table/iterator.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/iterator.h" -#include "util/logging.h" - -namespace leveldb { - -Iterator::Iterator() { - cleanup_.function = NULL; - cleanup_.next = NULL; -} - -Iterator::~Iterator() { - if (cleanup_.function != NULL) { - (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); - for (Cleanup* c = cleanup_.next; c != NULL; ) { - (*c->function)(c->arg1, c->arg2); - Cleanup* next = c->next; - delete c; - c = next; - } - } -} - -void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { - assert(func != NULL); - Cleanup* c; - if (cleanup_.function == NULL) { - c = &cleanup_; - } else { - c = new Cleanup; - c->next = cleanup_.next; - cleanup_.next = c; - } - c->function = func; - c->arg1 = arg1; - c->arg2 = arg2; -} - -namespace { -class EmptyIterator : public Iterator { - public: - EmptyIterator(const Status& s) : status_(s) { } - virtual bool Valid() const { return false; } - virtual void Seek(const Slice& target) { } - virtual void SeekToFirst() { } - virtual void SeekToLast() { } - virtual void Next() { assert(false); } - virtual void Prev() { assert(false); } - Slice key() const { assert(false); return Slice(); } - Slice value() const { assert(false); return Slice(); } - virtual Status status() const { return status_; } - private: - Status status_; -}; -} - -Iterator* NewEmptyIterator() { - return new EmptyIterator(Status::OK()); -} - -Iterator* NewErrorIterator(const Status& status) { - return new EmptyIterator(status); -} - -} diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h deleted file mode 100644 index 158d3a7..0000000 --- a/table/iterator_wrapper.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
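(On the cleanup hook defined in iterator.cc above: a minimal usage sketch, with hypothetical names, of handing an iterator ownership of a heap buffer so it is released exactly once, from ~Iterator().)

    #include "leveldb/iterator.h"

    namespace {
    // Signature matches Iterator::CleanupFunction.
    void FreeBuffer(void* arg1, void* /*arg2*/) {
      delete[] reinterpret_cast<char*>(arg1);
    }
    }  // namespace

    // Hypothetical helper: after this call the iterator owns "owned".
    void AttachBuffer(leveldb::Iterator* iter, char* owned) {
      iter->RegisterCleanup(&FreeBuffer, owned, NULL);
    }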
- -#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ -#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ - -namespace leveldb { - -// A internal wrapper class with an interface similar to Iterator that -// caches the valid() and key() results for an underlying iterator. -// This can help avoid virtual function calls and also gives better -// cache locality. -class IteratorWrapper { - private: - Iterator* iter_; - bool valid_; - Slice key_; - public: - IteratorWrapper(): iter_(NULL), valid_(false) { } - explicit IteratorWrapper(Iterator* iter): iter_(NULL) { - Set(iter); - } - ~IteratorWrapper() { delete iter_; } - Iterator* iter() const { return iter_; } - - // Takes ownership of "iter" and will delete it when destroyed, or - // when Set() is invoked again. - void Set(Iterator* iter) { - delete iter_; - iter_ = iter; - if (iter_ == NULL) { - valid_ = false; - } else { - Update(); - } - } - - - // Iterator interface methods - bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } - Slice value() const { assert(Valid()); return iter_->value(); } - // Methods below require iter() != NULL - Status status() const { assert(iter_); return iter_->status(); } - void Next() { assert(iter_); iter_->Next(); Update(); } - void Prev() { assert(iter_); iter_->Prev(); Update(); } - void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } - void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } - void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } - - private: - void Update() { - valid_ = iter_->Valid(); - if (valid_) { - key_ = iter_->key(); - } - } -}; - -} - - -#endif // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ diff --git a/table/merger.cc b/table/merger.cc deleted file mode 100644 index 6ce06bb..0000000 --- a/table/merger.cc +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/merger.h" - -#include "leveldb/comparator.h" -#include "leveldb/iterator.h" -#include "table/iterator_wrapper.h" - -namespace leveldb { - -namespace { -class MergingIterator : public Iterator { - public: - MergingIterator(const Comparator* comparator, Iterator** children, int n) - : comparator_(comparator), - children_(new IteratorWrapper[n]), - n_(n), - current_(NULL), - direction_(kForward) { - for (int i = 0; i < n; i++) { - children_[i].Set(children[i]); - } - } - - virtual ~MergingIterator() { - delete[] children_; - } - - virtual bool Valid() const { - return (current_ != NULL); - } - - virtual void SeekToFirst() { - for (int i = 0; i < n_; i++) { - children_[i].SeekToFirst(); - } - FindSmallest(); - direction_ = kForward; - } - - virtual void SeekToLast() { - for (int i = 0; i < n_; i++) { - children_[i].SeekToLast(); - } - FindLargest(); - direction_ = kReverse; - } - - virtual void Seek(const Slice& target) { - for (int i = 0; i < n_; i++) { - children_[i].Seek(target); - } - FindSmallest(); - direction_ = kForward; - } - - virtual void Next() { - assert(Valid()); - - // Ensure that all children are positioned after key(). - // If we are moving in the forward direction, it is already - // true for all of the non-current_ children since current_ is - // the smallest child and key() == current_->key(). Otherwise, - // we explicitly position the non-current_ children. 
- if (direction_ != kForward) { - for (int i = 0; i < n_; i++) { - IteratorWrapper* child = &children_[i]; - if (child != current_) { - child->Seek(key()); - if (child->Valid() && - comparator_->Compare(key(), child->key()) == 0) { - child->Next(); - } - } - } - direction_ = kForward; - } - - current_->Next(); - FindSmallest(); - } - - virtual void Prev() { - assert(Valid()); - - // Ensure that all children are positioned before key(). - // If we are moving in the reverse direction, it is already - // true for all of the non-current_ children since current_ is - // the largest child and key() == current_->key(). Otherwise, - // we explicitly position the non-current_ children. - if (direction_ != kReverse) { - for (int i = 0; i < n_; i++) { - IteratorWrapper* child = &children_[i]; - if (child != current_) { - child->Seek(key()); - if (child->Valid()) { - // Child is at first entry >= key(). Step back one to be < key() - child->Prev(); - } else { - // Child has no entries >= key(). Position at last entry. - child->SeekToLast(); - } - } - } - direction_ = kReverse; - } - - current_->Prev(); - FindLargest(); - } - - virtual Slice key() const { - assert(Valid()); - return current_->key(); - } - - virtual Slice value() const { - assert(Valid()); - return current_->value(); - } - - virtual Status status() const { - Status status; - for (int i = 0; i < n_; i++) { - status = children_[i].status(); - if (!status.ok()) { - break; - } - } - return status; - } - - private: - void FindSmallest(); - void FindLargest(); - - // We might want to use a heap in case there are lots of children. - // For now we use a simple array since we expect a very small number - // of children in leveldb. - const Comparator* comparator_; - IteratorWrapper* children_; - int n_; - IteratorWrapper* current_; - - // Which direction is the iterator moving? - enum Direction { - kForward, - kReverse - }; - Direction direction_; -}; - -void MergingIterator::FindSmallest() { - IteratorWrapper* smallest = NULL; - for (int i = 0; i < n_; i++) { - IteratorWrapper* child = &children_[i]; - if (child->Valid()) { - if (smallest == NULL) { - smallest = child; - } else if (comparator_->Compare(child->key(), smallest->key()) < 0) { - smallest = child; - } - } - } - current_ = smallest; -} - -void MergingIterator::FindLargest() { - IteratorWrapper* largest = NULL; - for (int i = n_-1; i >= 0; i--) { - IteratorWrapper* child = &children_[i]; - if (child->Valid()) { - if (largest == NULL) { - largest = child; - } else if (comparator_->Compare(child->key(), largest->key()) > 0) { - largest = child; - } - } - } - current_ = largest; -} -} - -Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { - assert(n >= 0); - if (n == 0) { - return NewEmptyIterator(); - } else if (n == 1) { - return list[0]; - } else { - return new MergingIterator(cmp, list, n); - } -} - -} diff --git a/table/merger.h b/table/merger.h deleted file mode 100644 index 71d9dc5..0000000 --- a/table/merger.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_ -#define STORAGE_LEVELDB_TABLE_MERGER_H_ - -namespace leveldb { - -class Comparator; -class Iterator; - -// Return an iterator that provided the union of the data in -// children[0,n-1]. 
Takes ownership of the child iterators and -// will delete them when the result iterator is deleted. -// -// The result does no duplicate suppression. I.e., if a particular -// key is present in K child iterators, it will be yielded K times. -// -// REQUIRES: n >= 0 -extern Iterator* NewMergingIterator( - const Comparator* comparator, Iterator** children, int n); - -} - -#endif // STORAGE_LEVELDB_TABLE_MERGER_H_ diff --git a/table/table.cc b/table/table.cc deleted file mode 100644 index 9820753..0000000 --- a/table/table.cc +++ /dev/null @@ -1,175 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/table.h" - -#include "leveldb/cache.h" -#include "leveldb/env.h" -#include "table/block.h" -#include "table/format.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" - -namespace leveldb { - -struct Table::Rep { - ~Rep() { - delete index_block; - } - - Options options; - Status status; - RandomAccessFile* file; - uint64_t cache_id; - - BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer - Block* index_block; -}; - -Status Table::Open(const Options& options, - RandomAccessFile* file, - uint64_t size, - Table** table) { - *table = NULL; - if (size < Footer::kEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); - } - - char footer_space[Footer::kEncodedLength]; - Slice footer_input; - Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength, - &footer_input, footer_space); - if (!s.ok()) return s; - - Footer footer; - s = footer.DecodeFrom(&footer_input); - if (!s.ok()) return s; - - // Read the index block - Block* index_block = NULL; - if (s.ok()) { - s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block); - } - - if (s.ok()) { - // We've successfully read the footer and the index block: we're - // ready to serve requests. - Rep* rep = new Table::Rep; - rep->options = options; - rep->file = file; - rep->metaindex_handle = footer.metaindex_handle(); - rep->index_block = index_block; - rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0); - *table = new Table(rep); - } else { - if (index_block) delete index_block; - } - - return s; -} - -Table::~Table() { - delete rep_; -} - -static void DeleteBlock(void* arg, void* ignored) { - delete reinterpret_cast(arg); -} - -static void DeleteCachedBlock(const Slice& key, void* value) { - Block* block = reinterpret_cast(value); - delete block; -} - -static void ReleaseBlock(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - -// Convert an index iterator value (i.e., an encoded BlockHandle) -// into an iterator over the contents of the corresponding block. -Iterator* Table::BlockReader(void* arg, - const ReadOptions& options, - const Slice& index_value) { - Table* table = reinterpret_cast(arg); - Cache* block_cache = table->rep_->options.block_cache; - Block* block = NULL; - Cache::Handle* cache_handle = NULL; - - BlockHandle handle; - Slice input = index_value; - Status s = handle.DecodeFrom(&input); - // We intentionally allow extra stuff in index_value so that we - // can add more features in the future. 
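    // (The cache key assembled below is 16 bytes: the table's fixed64 cache_id
    // followed by the fixed64 block offset, so equal offsets in different
    // tables cannot collide in a shared block cache.)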
- - if (s.ok()) { - if (block_cache != NULL) { - char cache_key_buffer[16]; - EncodeFixed64(cache_key_buffer, table->rep_->cache_id); - EncodeFixed64(cache_key_buffer+8, handle.offset()); - Slice key(cache_key_buffer, sizeof(cache_key_buffer)); - cache_handle = block_cache->Lookup(key); - if (cache_handle != NULL) { - block = reinterpret_cast(block_cache->Value(cache_handle)); - } else { - s = ReadBlock(table->rep_->file, options, handle, &block); - if (s.ok() && options.fill_cache) { - cache_handle = block_cache->Insert( - key, block, block->size(), &DeleteCachedBlock); - } - } - } else { - s = ReadBlock(table->rep_->file, options, handle, &block); - } - } - - Iterator* iter; - if (block != NULL) { - iter = block->NewIterator(table->rep_->options.comparator); - if (cache_handle == NULL) { - iter->RegisterCleanup(&DeleteBlock, block, NULL); - } else { - iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle); - } - } else { - iter = NewErrorIterator(s); - } - return iter; -} - -Iterator* Table::NewIterator(const ReadOptions& options) const { - return NewTwoLevelIterator( - rep_->index_block->NewIterator(rep_->options.comparator), - &Table::BlockReader, const_cast(this), options); -} - -uint64_t Table::ApproximateOffsetOf(const Slice& key) const { - Iterator* index_iter = - rep_->index_block->NewIterator(rep_->options.comparator); - index_iter->Seek(key); - uint64_t result; - if (index_iter->Valid()) { - BlockHandle handle; - Slice input = index_iter->value(); - Status s = handle.DecodeFrom(&input); - if (s.ok()) { - result = handle.offset(); - } else { - // Strange: we can't decode the block handle in the index block. - // We'll just return the offset of the metaindex block, which is - // close to the whole file size for this case. - result = rep_->metaindex_handle.offset(); - } - } else { - // key is past the last key in the file. Approximate the offset - // by returning the offset of the metaindex block (which is - // right near the end of the file). - result = rep_->metaindex_handle.offset(); - } - delete index_iter; - return result; -} - -} diff --git a/table/table_builder.cc b/table/table_builder.cc deleted file mode 100644 index 7ec7ad2..0000000 --- a/table/table_builder.cc +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/table_builder.h" - -#include -#include -#include "leveldb/comparator.h" -#include "leveldb/env.h" -#include "table/block_builder.h" -#include "table/format.h" -#include "util/coding.h" -#include "util/crc32c.h" -#include "util/logging.h" - -namespace leveldb { - -struct TableBuilder::Rep { - Options options; - Options index_block_options; - WritableFile* file; - uint64_t offset; - Status status; - BlockBuilder data_block; - BlockBuilder index_block; - std::string last_key; - int64_t num_entries; - bool closed; // Either Finish() or Abandon() has been called. - - // We do not emit the index entry for a block until we have seen the - // first key for the next data block. This allows us to use shorter - // keys in the index block. For example, consider a block boundary - // between the keys "the quick brown fox" and "the who". We can use - // "the r" as the key for the index block entry since it is >= all - // entries in the first block and < all entries in subsequent - // blocks. 
- // - // Invariant: r->pending_index_entry is true only if data_block is empty. - bool pending_index_entry; - BlockHandle pending_handle; // Handle to add to index block - - std::string compressed_output; - - Rep(const Options& opt, WritableFile* f) - : options(opt), - index_block_options(opt), - file(f), - offset(0), - data_block(&options), - index_block(&index_block_options), - num_entries(0), - closed(false), - pending_index_entry(false) { - index_block_options.block_restart_interval = 1; - } -}; - -TableBuilder::TableBuilder(const Options& options, WritableFile* file) - : rep_(new Rep(options, file)) { -} - -TableBuilder::~TableBuilder() { - assert(rep_->closed); // Catch errors where caller forgot to call Finish() - delete rep_; -} - -Status TableBuilder::ChangeOptions(const Options& options) { - // Note: if more fields are added to Options, update - // this function to catch changes that should not be allowed to - // change in the middle of building a Table. - if (options.comparator != rep_->options.comparator) { - return Status::InvalidArgument("changing comparator while building table"); - } - - // Note that any live BlockBuilders point to rep_->options and therefore - // will automatically pick up the updated options. - rep_->options = options; - rep_->index_block_options = options; - rep_->index_block_options.block_restart_interval = 1; - return Status::OK(); -} - -void TableBuilder::Add(const Slice& key, const Slice& value) { - Rep* r = rep_; - assert(!r->closed); - if (!ok()) return; - if (r->num_entries > 0) { - assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); - } - - if (r->pending_index_entry) { - assert(r->data_block.empty()); - r->options.comparator->FindShortestSeparator(&r->last_key, key); - std::string handle_encoding; - r->pending_handle.EncodeTo(&handle_encoding); - r->index_block.Add(r->last_key, Slice(handle_encoding)); - r->pending_index_entry = false; - } - - r->last_key.assign(key.data(), key.size()); - r->num_entries++; - r->data_block.Add(key, value); - - const size_t estimated_block_size = r->data_block.CurrentSizeEstimate(); - if (estimated_block_size >= r->options.block_size) { - Flush(); - } -} - -void TableBuilder::Flush() { - Rep* r = rep_; - assert(!r->closed); - if (!ok()) return; - if (r->data_block.empty()) return; - assert(!r->pending_index_entry); - WriteBlock(&r->data_block, &r->pending_handle); - if (ok()) { - r->pending_index_entry = true; - r->status = r->file->Flush(); - } -} - -void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { - // File format contains a sequence of blocks where each block has: - // block_data: uint8[n] - // type: uint8 - // crc: uint32 - assert(ok()); - Rep* r = rep_; - Slice raw = block->Finish(); - - Slice block_contents; - CompressionType type = r->options.compression; - // TODO(postrelease): Support more compression options: zlib? 
- switch (type) { - case kNoCompression: - block_contents = raw; - break; - - case kSnappyCompression: { - std::string* compressed = &r->compressed_output; - if (port::Snappy_Compress(raw.data(), raw.size(), compressed) && - compressed->size() < raw.size() - (raw.size() / 8u)) { - block_contents = *compressed; - } else { - // Snappy not supported, or compressed less than 12.5%, so just - // store uncompressed form - block_contents = raw; - type = kNoCompression; - } - break; - } - } - handle->set_offset(r->offset); - handle->set_size(block_contents.size()); - r->status = r->file->Append(block_contents); - if (r->status.ok()) { - char trailer[kBlockTrailerSize]; - trailer[0] = type; - uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size()); - crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type - EncodeFixed32(trailer+1, crc32c::Mask(crc)); - r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); - if (r->status.ok()) { - r->offset += block_contents.size() + kBlockTrailerSize; - } - } - r->compressed_output.clear(); - block->Reset(); -} - -Status TableBuilder::status() const { - return rep_->status; -} - -Status TableBuilder::Finish() { - Rep* r = rep_; - Flush(); - assert(!r->closed); - r->closed = true; - BlockHandle metaindex_block_handle; - BlockHandle index_block_handle; - if (ok()) { - BlockBuilder meta_index_block(&r->options); - // TODO(postrelease): Add stats and other meta blocks - WriteBlock(&meta_index_block, &metaindex_block_handle); - } - if (ok()) { - if (r->pending_index_entry) { - r->options.comparator->FindShortSuccessor(&r->last_key); - std::string handle_encoding; - r->pending_handle.EncodeTo(&handle_encoding); - r->index_block.Add(r->last_key, Slice(handle_encoding)); - r->pending_index_entry = false; - } - WriteBlock(&r->index_block, &index_block_handle); - } - if (ok()) { - Footer footer; - footer.set_metaindex_handle(metaindex_block_handle); - footer.set_index_handle(index_block_handle); - std::string footer_encoding; - footer.EncodeTo(&footer_encoding); - r->status = r->file->Append(footer_encoding); - if (r->status.ok()) { - r->offset += footer_encoding.size(); - } - } - return r->status; -} - -void TableBuilder::Abandon() { - Rep* r = rep_; - assert(!r->closed); - r->closed = true; -} - -uint64_t TableBuilder::NumEntries() const { - return rep_->num_entries; -} - -uint64_t TableBuilder::FileSize() const { - return rep_->offset; -} - -} diff --git a/table/table_test.cc b/table/table_test.cc deleted file mode 100644 index 4b3e85e..0000000 --- a/table/table_test.cc +++ /dev/null @@ -1,841 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/table.h" - -#include -#include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "leveldb/table_builder.h" -#include "table/block.h" -#include "table/block_builder.h" -#include "table/format.h" -#include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -// Return reverse of "key". -// Used to test non-lexicographic comparators. 
-static std::string Reverse(const Slice& key) { - std::string str(key.ToString()); - std::string rev(str.rbegin(), str.rend()); - return rev; -} - -namespace { -class ReverseKeyComparator : public Comparator { - public: - virtual const char* Name() const { - return "leveldb.ReverseBytewiseComparator"; - } - - virtual int Compare(const Slice& a, const Slice& b) const { - return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); - } - - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const { - std::string s = Reverse(*start); - std::string l = Reverse(limit); - BytewiseComparator()->FindShortestSeparator(&s, l); - *start = Reverse(s); - } - - virtual void FindShortSuccessor(std::string* key) const { - std::string s = Reverse(*key); - BytewiseComparator()->FindShortSuccessor(&s); - *key = Reverse(s); - } -}; -} -static ReverseKeyComparator reverse_key_comparator; - -static void Increment(const Comparator* cmp, std::string* key) { - if (cmp == BytewiseComparator()) { - key->push_back('\0'); - } else { - assert(cmp == &reverse_key_comparator); - std::string rev = Reverse(*key); - rev.push_back('\0'); - *key = Reverse(rev); - } -} - -// An STL comparator that uses a Comparator -namespace { -struct STLLessThan { - const Comparator* cmp; - - STLLessThan() : cmp(BytewiseComparator()) { } - STLLessThan(const Comparator* c) : cmp(c) { } - bool operator()(const std::string& a, const std::string& b) const { - return cmp->Compare(Slice(a), Slice(b)) < 0; - } -}; -} - -class StringSink: public WritableFile { - public: - ~StringSink() { } - - const std::string& contents() const { return contents_; } - - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } - - virtual Status Append(const Slice& data) { - contents_.append(data.data(), data.size()); - return Status::OK(); - } - - private: - std::string contents_; -}; - - -class StringSource: public RandomAccessFile { - public: - StringSource(const Slice& contents) - : contents_(contents.data(), contents.size()) { - } - - virtual ~StringSource() { } - - uint64_t Size() const { return contents_.size(); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - if (offset > contents_.size()) { - return Status::InvalidArgument("invalid Read offset"); - } - if (offset + n > contents_.size()) { - n = contents_.size() - offset; - } - memcpy(scratch, &contents_[offset], n); - *result = Slice(scratch, n); - return Status::OK(); - } - - private: - std::string contents_; -}; - -typedef std::map KVMap; - -// Helper class for tests to unify the interface between -// BlockBuilder/TableBuilder and Block/Table. -class Constructor { - public: - explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { } - virtual ~Constructor() { } - - void Add(const std::string& key, const Slice& value) { - data_[key] = value.ToString(); - } - - // Finish constructing the data structure with all the keys that have - // been added so far. 
Returns the keys in sorted order in "*keys" - // and stores the key/value pairs in "*kvmap" - void Finish(const Options& options, - std::vector* keys, - KVMap* kvmap) { - *kvmap = data_; - keys->clear(); - for (KVMap::const_iterator it = data_.begin(); - it != data_.end(); - ++it) { - keys->push_back(it->first); - } - data_.clear(); - Status s = FinishImpl(options, *kvmap); - ASSERT_TRUE(s.ok()) << s.ToString(); - } - - // Construct the data structure from the data in "data" - virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; - - virtual size_t NumBytes() const = 0; - - virtual Iterator* NewIterator() const = 0; - - virtual const KVMap& data() { return data_; } - - virtual DB* db() const { return NULL; } // Overridden in DBConstructor - - private: - KVMap data_; -}; - -class BlockConstructor: public Constructor { - public: - explicit BlockConstructor(const Comparator* cmp) - : Constructor(cmp), - comparator_(cmp), - block_size_(-1), - block_(NULL) { } - ~BlockConstructor() { - delete block_; - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete block_; - block_ = NULL; - BlockBuilder builder(&options); - - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - builder.Add(it->first, it->second); - } - // Open the block - Slice block_data = builder.Finish(); - block_size_ = block_data.size(); - char* block_data_copy = new char[block_size_]; - memcpy(block_data_copy, block_data.data(), block_size_); - block_ = new Block(block_data_copy, block_size_); - return Status::OK(); - } - virtual size_t NumBytes() const { return block_size_; } - - virtual Iterator* NewIterator() const { - return block_->NewIterator(comparator_); - } - - private: - const Comparator* comparator_; - int block_size_; - Block* block_; - - BlockConstructor(); -}; - -class TableConstructor: public Constructor { - public: - TableConstructor(const Comparator* cmp) - : Constructor(cmp), - source_(NULL), table_(NULL) { - } - ~TableConstructor() { - Reset(); - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - Reset(); - StringSink sink; - TableBuilder builder(options, &sink); - - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - builder.Add(it->first, it->second); - ASSERT_TRUE(builder.status().ok()); - } - Status s = builder.Finish(); - ASSERT_TRUE(s.ok()) << s.ToString(); - - ASSERT_EQ(sink.contents().size(), builder.FileSize()); - - // Open the table - source_ = new StringSource(sink.contents()); - Options table_options; - table_options.comparator = options.comparator; - return Table::Open(table_options, source_, sink.contents().size(), &table_); - } - virtual size_t NumBytes() const { return source_->Size(); } - - virtual Iterator* NewIterator() const { - return table_->NewIterator(ReadOptions()); - } - - uint64_t ApproximateOffsetOf(const Slice& key) const { - return table_->ApproximateOffsetOf(key); - } - - private: - void Reset() { - delete table_; - delete source_; - table_ = NULL; - source_ = NULL; - } - - StringSource* source_; - Table* table_; - - TableConstructor(); -}; - -// A helper class that converts internal format keys into user keys -class KeyConvertingIterator: public Iterator { - public: - explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } - virtual ~KeyConvertingIterator() { delete iter_; } - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& target) { - ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); - 
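    // (kMaxSequenceNumber positions the lookup at the newest possible entry
    // for the target user key: internal keys sort by user key first, then by
    // decreasing sequence number.)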
std::string encoded; - AppendInternalKey(&encoded, ikey); - iter_->Seek(encoded); - } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } - - virtual Slice key() const { - assert(Valid()); - ParsedInternalKey key; - if (!ParseInternalKey(iter_->key(), &key)) { - status_ = Status::Corruption("malformed internal key"); - return Slice("corrupted key"); - } - return key.user_key; - } - - virtual Slice value() const { return iter_->value(); } - virtual Status status() const { - return status_.ok() ? iter_->status() : status_; - } - - private: - mutable Status status_; - Iterator* iter_; - - // No copying allowed - KeyConvertingIterator(const KeyConvertingIterator&); - void operator=(const KeyConvertingIterator&); -}; - -class MemTableConstructor: public Constructor { - public: - explicit MemTableConstructor(const Comparator* cmp) - : Constructor(cmp), - internal_comparator_(cmp) { - memtable_ = new MemTable(internal_comparator_); - } - ~MemTableConstructor() { - delete memtable_; - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete memtable_; - memtable_ = new MemTable(internal_comparator_); - int seq = 1; - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - memtable_->Add(seq, kTypeValue, it->first, it->second); - seq++; - } - return Status::OK(); - } - virtual size_t NumBytes() const { - return memtable_->ApproximateMemoryUsage(); - } - - virtual Iterator* NewIterator() const { - return new KeyConvertingIterator(memtable_->NewIterator()); - } - - private: - InternalKeyComparator internal_comparator_; - MemTable* memtable_; -}; - -class DBConstructor: public Constructor { - public: - explicit DBConstructor(const Comparator* cmp) - : Constructor(cmp), - comparator_(cmp) { - db_ = NULL; - NewDB(); - } - ~DBConstructor() { - delete db_; - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete db_; - db_ = NULL; - NewDB(); - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - WriteBatch batch; - batch.Put(it->first, it->second); - ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); - } - return Status::OK(); - } - virtual size_t NumBytes() const { - Range r("", "\xff\xff"); - uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); - return size; - } - - virtual Iterator* NewIterator() const { - return db_->NewIterator(ReadOptions()); - } - - virtual DB* db() const { return db_; } - - private: - void NewDB() { - std::string name = test::TmpDir() + "/table_testdb"; - - Options options; - options.comparator = comparator_; - Status status = DestroyDB(name, options); - ASSERT_TRUE(status.ok()) << status.ToString(); - - options.create_if_missing = true; - options.error_if_exists = true; - options.write_buffer_size = 10000; // Something small to force merging - status = DB::Open(options, name, &db_); - ASSERT_TRUE(status.ok()) << status.ToString(); - } - - const Comparator* comparator_; - DB* db_; -}; - -enum TestType { - TABLE_TEST, - BLOCK_TEST, - MEMTABLE_TEST, - DB_TEST, -}; - -struct TestArgs { - TestType type; - bool reverse_compare; - int restart_interval; -}; - -static const TestArgs kTestArgList[] = { - { TABLE_TEST, false, 16 }, - { TABLE_TEST, false, 1 }, - { TABLE_TEST, false, 1024 }, - { TABLE_TEST, true, 16 }, - { TABLE_TEST, true, 1 }, - { TABLE_TEST, true, 1024 }, - - { BLOCK_TEST, false, 16 }, - { BLOCK_TEST, false, 1 }, - { BLOCK_TEST, 
false, 1024 }, - { BLOCK_TEST, true, 16 }, - { BLOCK_TEST, true, 1 }, - { BLOCK_TEST, true, 1024 }, - - // Restart interval does not matter for memtables - { MEMTABLE_TEST, false, 16 }, - { MEMTABLE_TEST, true, 16 }, - - // Do not bother with restart interval variations for DB - { DB_TEST, false, 16 }, - { DB_TEST, true, 16 }, -}; -static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]); - -class Harness { - public: - Harness() : constructor_(NULL) { } - - void Init(const TestArgs& args) { - delete constructor_; - constructor_ = NULL; - options_ = Options(); - - options_.block_restart_interval = args.restart_interval; - // Use shorter block size for tests to exercise block boundary - // conditions more. - options_.block_size = 256; - if (args.reverse_compare) { - options_.comparator = &reverse_key_comparator; - } - switch (args.type) { - case TABLE_TEST: - constructor_ = new TableConstructor(options_.comparator); - break; - case BLOCK_TEST: - constructor_ = new BlockConstructor(options_.comparator); - break; - case MEMTABLE_TEST: - constructor_ = new MemTableConstructor(options_.comparator); - break; - case DB_TEST: - constructor_ = new DBConstructor(options_.comparator); - break; - } - } - - ~Harness() { - delete constructor_; - } - - void Add(const std::string& key, const std::string& value) { - constructor_->Add(key, value); - } - - void Test(Random* rnd) { - std::vector keys; - KVMap data; - constructor_->Finish(options_, &keys, &data); - - TestForwardScan(keys, data); - TestBackwardScan(keys, data); - TestRandomAccess(rnd, keys, data); - } - - void TestForwardScan(const std::vector& keys, - const KVMap& data) { - Iterator* iter = constructor_->NewIterator(); - ASSERT_TRUE(!iter->Valid()); - iter->SeekToFirst(); - for (KVMap::const_iterator model_iter = data.begin(); - model_iter != data.end(); - ++model_iter) { - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - iter->Next(); - } - ASSERT_TRUE(!iter->Valid()); - delete iter; - } - - void TestBackwardScan(const std::vector& keys, - const KVMap& data) { - Iterator* iter = constructor_->NewIterator(); - ASSERT_TRUE(!iter->Valid()); - iter->SeekToLast(); - for (KVMap::const_reverse_iterator model_iter = data.rbegin(); - model_iter != data.rend(); - ++model_iter) { - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - iter->Prev(); - } - ASSERT_TRUE(!iter->Valid()); - delete iter; - } - - void TestRandomAccess(Random* rnd, - const std::vector& keys, - const KVMap& data) { - static const bool kVerbose = false; - Iterator* iter = constructor_->NewIterator(); - ASSERT_TRUE(!iter->Valid()); - KVMap::const_iterator model_iter = data.begin(); - if (kVerbose) fprintf(stderr, "---\n"); - for (int i = 0; i < 200; i++) { - const int toss = rnd->Uniform(5); - switch (toss) { - case 0: { - if (iter->Valid()) { - if (kVerbose) fprintf(stderr, "Next\n"); - iter->Next(); - ++model_iter; - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - } - break; - } - - case 1: { - if (kVerbose) fprintf(stderr, "SeekToFirst\n"); - iter->SeekToFirst(); - model_iter = data.begin(); - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - break; - } - - case 2: { - std::string key = PickRandomKey(rnd, keys); - model_iter = data.lower_bound(key); - if (kVerbose) fprintf(stderr, "Seek '%s'\n", - EscapeString(key).c_str()); - iter->Seek(Slice(key)); - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - break; - } - - case 3: { - if (iter->Valid()) { - if (kVerbose) fprintf(stderr, "Prev\n"); - iter->Prev(); - if (model_iter 
== data.begin()) { - model_iter = data.end(); // Wrap around to invalid value - } else { - --model_iter; - } - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - } - break; - } - - case 4: { - if (kVerbose) fprintf(stderr, "SeekToLast\n"); - iter->SeekToLast(); - if (keys.empty()) { - model_iter = data.end(); - } else { - std::string last = data.rbegin()->first; - model_iter = data.lower_bound(last); - } - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - break; - } - } - } - delete iter; - } - - std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { - if (it == data.end()) { - return "END"; - } else { - return "'" + it->first + "->" + it->second + "'"; - } - } - - std::string ToString(const KVMap& data, - const KVMap::const_reverse_iterator& it) { - if (it == data.rend()) { - return "END"; - } else { - return "'" + it->first + "->" + it->second + "'"; - } - } - - std::string ToString(const Iterator* it) { - if (!it->Valid()) { - return "END"; - } else { - return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; - } - } - - std::string PickRandomKey(Random* rnd, const std::vector& keys) { - if (keys.empty()) { - return "foo"; - } else { - const int index = rnd->Uniform(keys.size()); - std::string result = keys[index]; - switch (rnd->Uniform(3)) { - case 0: - // Return an existing key - break; - case 1: { - // Attempt to return something smaller than an existing key - if (result.size() > 0 && result[result.size()-1] > '\0') { - result[result.size()-1]--; - } - break; - } - case 2: { - // Return something larger than an existing key - Increment(options_.comparator, &result); - break; - } - } - return result; - } - } - - // Returns NULL if not running against a DB - DB* db() const { return constructor_->db(); } - - private: - Options options_; - Constructor* constructor_; -}; - -// Test the empty key -TEST(Harness, SimpleEmptyKey) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 1); - Add("", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSingle) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 2); - Add("abc", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleMulti) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 3); - Add("abc", "v"); - Add("abcd", "v"); - Add("ac", "v2"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSpecialKey) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 4); - Add("\xff\xff", "v3"); - Test(&rnd); - } -} - -TEST(Harness, Randomized) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 5); - for (int num_entries = 0; num_entries < 2000; - num_entries += (num_entries < 50 ? 
1 : 200)) { - if ((num_entries % 10) == 0) { - fprintf(stderr, "case %d of %d: num_entries = %d\n", - (i + 1), int(kNumTestArgs), num_entries); - } - for (int e = 0; e < num_entries; e++) { - std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); - } - Test(&rnd); - } - } -} - -TEST(Harness, RandomizedLongDB) { - Random rnd(test::RandomSeed()); - TestArgs args = { DB_TEST, false, 16 }; - Init(args); - int num_entries = 100000; - for (int e = 0; e < num_entries; e++) { - std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); - } - Test(&rnd); - - // We must have created enough data to force merging - std::string l0_files, l1_files; - ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level0", &l0_files)); - ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level1", &l1_files)); - ASSERT_GT(atoi(l0_files.c_str()) + atoi(l1_files.c_str()), 0); - -} - -class MemTableTest { }; - -TEST(MemTableTest, Simple) { - InternalKeyComparator cmp(BytewiseComparator()); - MemTable memtable(cmp); - WriteBatch batch; - WriteBatchInternal::SetSequence(&batch, 100); - batch.Put(std::string("k1"), std::string("v1")); - batch.Put(std::string("k2"), std::string("v2")); - batch.Put(std::string("k3"), std::string("v3")); - batch.Put(std::string("largekey"), std::string("vlarge")); - ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok()); - - Iterator* iter = memtable.NewIterator(); - iter->SeekToFirst(); - while (iter->Valid()) { - fprintf(stderr, "key: '%s' -> '%s'\n", - iter->key().ToString().c_str(), - iter->value().ToString().c_str()); - iter->Next(); - } - - delete iter; -} - -static bool Between(uint64_t val, uint64_t low, uint64_t high) { - bool result = (val >= low) && (val <= high); - if (!result) { - fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", - (unsigned long long)(val), - (unsigned long long)(low), - (unsigned long long)(high)); - } - return result; -} - -class TableTest { }; - -TEST(TableTest, ApproximateOffsetOfPlain) { - TableConstructor c(BytewiseComparator()); - c.Add("k01", "hello"); - c.Add("k02", "hello2"); - c.Add("k03", std::string(10000, 'x')); - c.Add("k04", std::string(200000, 'x')); - c.Add("k05", std::string(300000, 'x')); - c.Add("k06", "hello3"); - c.Add("k07", std::string(100000, 'x')); - std::vector<std::string> keys; - KVMap kvmap; - Options options; - options.block_size = 1024; - options.compression = kNoCompression; - c.Finish(options, &keys, &kvmap); - - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000)); - -} - -static bool SnappyCompressionSupported() { - std::string out; - Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Snappy_Compress(in.data(), in.size(), &out); -} - -TEST(TableTest, ApproximateOffsetOfCompressed) { - if (!SnappyCompressionSupported()) { -
fprintf(stderr, "skipping compression tests\n"); - return; - } - - Random rnd(301); - TableConstructor c(BytewiseComparator()); - std::string tmp; - c.Add("k01", "hello"); - c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); - c.Add("k03", "hello3"); - c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); - std::vector<std::string> keys; - KVMap kvmap; - Options options; - options.block_size = 1024; - options.compression = kSnappyCompression; - c.Finish(options, &keys, &kvmap); - - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc deleted file mode 100644 index 24a1241..0000000 --- a/table/two_level_iterator.cc +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/two_level_iterator.h" - -#include "leveldb/table.h" -#include "table/block.h" -#include "table/format.h" -#include "table/iterator_wrapper.h" - -namespace leveldb { - -namespace { - -typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&); - -class TwoLevelIterator: public Iterator { - public: - TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options); - - virtual ~TwoLevelIterator(); - - virtual void Seek(const Slice& target); - virtual void SeekToFirst(); - virtual void SeekToLast(); - virtual void Next(); - virtual void Prev(); - - virtual bool Valid() const { - return data_iter_.Valid(); - } - virtual Slice key() const { - assert(Valid()); - return data_iter_.key(); - } - virtual Slice value() const { - assert(Valid()); - return data_iter_.value(); - } - virtual Status status() const { - // It'd be nice if status() returned a const Status& instead of a Status - if (!index_iter_.status().ok()) { - return index_iter_.status(); - } else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) { - return data_iter_.status(); - } else { - return status_; - } - } - - private: - void SaveError(const Status& s) { - if (status_.ok() && !s.ok()) status_ = s; - } - void SkipEmptyDataBlocksForward(); - void SkipEmptyDataBlocksBackward(); - void SetDataIterator(Iterator* data_iter); - void InitDataBlock(); - - BlockFunction block_function_; - void* arg_; - const ReadOptions options_; - Status status_; - IteratorWrapper index_iter_; - IteratorWrapper data_iter_; // May be NULL - // If data_iter_ is non-NULL, then "data_block_handle_" holds the - // "index_value" passed to block_function_ to create the data_iter_.
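 - // (InitDataBlock() below compares the current index value against this
 - // cached handle, so re-seeking within the same data block does not
 - // re-invoke block_function_ and re-read the block.)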
- std::string data_block_handle_; -}; - -TwoLevelIterator::TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options) - : block_function_(block_function), - arg_(arg), - options_(options), - index_iter_(index_iter), - data_iter_(NULL) { -} - -TwoLevelIterator::~TwoLevelIterator() { -} - -void TwoLevelIterator::Seek(const Slice& target) { - index_iter_.Seek(target); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.Seek(target); - SkipEmptyDataBlocksForward(); -} - -void TwoLevelIterator::SeekToFirst() { - index_iter_.SeekToFirst(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); - SkipEmptyDataBlocksForward(); -} - -void TwoLevelIterator::SeekToLast() { - index_iter_.SeekToLast(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); - SkipEmptyDataBlocksBackward(); -} - -void TwoLevelIterator::Next() { - assert(Valid()); - data_iter_.Next(); - SkipEmptyDataBlocksForward(); -} - -void TwoLevelIterator::Prev() { - assert(Valid()); - data_iter_.Prev(); - SkipEmptyDataBlocksBackward(); -} - - -void TwoLevelIterator::SkipEmptyDataBlocksForward() { - while (data_iter_.iter() == NULL || !data_iter_.Valid()) { - // Move to next block - if (!index_iter_.Valid()) { - SetDataIterator(NULL); - return; - } - index_iter_.Next(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); - } -} - -void TwoLevelIterator::SkipEmptyDataBlocksBackward() { - while (data_iter_.iter() == NULL || !data_iter_.Valid()) { - // Move to previous block - if (!index_iter_.Valid()) { - SetDataIterator(NULL); - return; - } - index_iter_.Prev(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); - } -} - -void TwoLevelIterator::SetDataIterator(Iterator* data_iter) { - if (data_iter_.iter() != NULL) SaveError(data_iter_.status()); - data_iter_.Set(data_iter); -} - -void TwoLevelIterator::InitDataBlock() { - if (!index_iter_.Valid()) { - SetDataIterator(NULL); - } else { - Slice handle = index_iter_.value(); - if (data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0) { - // data_iter_ is already constructed with this iterator, so - // no need to change anything - } else { - Iterator* iter = (*block_function_)(arg_, options_, handle); - data_block_handle_.assign(handle.data(), handle.size()); - SetDataIterator(iter); - } - } -} - -} - -Iterator* NewTwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options) { - return new TwoLevelIterator(index_iter, block_function, arg, options); -} - -} diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h deleted file mode 100644 index 5909e2b..0000000 --- a/table/two_level_iterator.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ -#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ - -#include "leveldb/iterator.h" - -namespace leveldb { - -struct ReadOptions; - -// Return a new two level iterator. A two-level iterator contains an -// index iterator whose values point to a sequence of blocks where -// each block is itself a sequence of key,value pairs. The returned -// two-level iterator yields the concatenation of all key/value pairs -// in the sequence of blocks.
Takes ownership of "index_iter" and -// will delete it when no longer needed. -// -// Uses a supplied function to convert an index_iter value into -// an iterator over the contents of the corresponding block. -extern Iterator* NewTwoLevelIterator( - Iterator* index_iter, - Iterator* (*block_function)( - void* arg, - const ReadOptions& options, - const Slice& index_value), - void* arg, - const ReadOptions& options); - -} - -#endif  // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ diff --git a/util/arena.cc b/util/arena.cc deleted file mode 100644 index 4bf6e36..0000000 --- a/util/arena.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/arena.h" -#include <assert.h> - -namespace leveldb { - -static const int kBlockSize = 4096; - -Arena::Arena() { - blocks_memory_ = 0; - alloc_ptr_ = NULL;  // First allocation will allocate a block - alloc_bytes_remaining_ = 0; -} - -Arena::~Arena() { - for (int i = 0; i < blocks_.size(); i++) { - delete[] blocks_[i]; - } -} - -char* Arena::AllocateFallback(size_t bytes) { - if (bytes > kBlockSize / 4) { - // Object is more than a quarter of our block size. Allocate it separately - // to avoid wasting too much space in leftover bytes. - char* result = AllocateNewBlock(bytes); - return result; - } - - // We waste the remaining space in the current block. - alloc_ptr_ = AllocateNewBlock(kBlockSize); - alloc_bytes_remaining_ = kBlockSize; - - char* result = alloc_ptr_; - alloc_ptr_ += bytes; - alloc_bytes_remaining_ -= bytes; - return result; -} - -char* Arena::AllocateAligned(size_t bytes) { - const int align = sizeof(void*);    // We'll align to pointer size - assert((align & (align-1)) == 0);   // Pointer size should be a power of 2 - size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1); - size_t slop = (current_mod == 0 ? 0 : align - current_mod); - size_t needed = bytes + slop; - char* result; - if (needed <= alloc_bytes_remaining_) { - result = alloc_ptr_ + slop; - alloc_ptr_ += needed; - alloc_bytes_remaining_ -= needed; - } else { - // AllocateFallback always returns aligned memory - result = AllocateFallback(bytes); - } - assert((reinterpret_cast<uintptr_t>(result) & (align-1)) == 0); - return result; -} - -char* Arena::AllocateNewBlock(size_t block_bytes) { - char* result = new char[block_bytes]; - blocks_memory_ += block_bytes; - blocks_.push_back(result); - return result; -} - -} diff --git a/util/arena.h b/util/arena.h deleted file mode 100644 index fcb5d5b..0000000 --- a/util/arena.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ -#define STORAGE_LEVELDB_UTIL_ARENA_H_ - -#include <cstddef> -#include <vector> -#include <assert.h> -#include <stdint.h> - -namespace leveldb { - -class Arena { - public: - Arena(); - ~Arena(); - - // Return a pointer to a newly allocated memory block of "bytes" bytes. - char* Allocate(size_t bytes); - - // Allocate memory with the normal alignment guarantees provided by malloc - char* AllocateAligned(size_t bytes); - - // Returns an estimate of the total memory usage of data allocated - // by the arena (including space allocated but not yet used for user - // allocations).
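 - // (The estimate below also counts the pointer array backing blocks_,
 - // hence the blocks_.capacity() * sizeof(char*) term.)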
- size_t MemoryUsage() const { - return blocks_memory_ + blocks_.capacity() * sizeof(char*); - } - - private: - char* AllocateFallback(size_t bytes); - char* AllocateNewBlock(size_t block_bytes); - - // Allocation state - char* alloc_ptr_; - size_t alloc_bytes_remaining_; - - // Array of new[] allocated memory blocks - std::vector<char*> blocks_; - - // Bytes of memory in blocks allocated so far - size_t blocks_memory_; - - // No copying allowed - Arena(const Arena&); - void operator=(const Arena&); -}; - -inline char* Arena::Allocate(size_t bytes) { - // The semantics of what to return are a bit messy if we allow - // 0-byte allocations, so we disallow them here (we don't need - // them for our internal use). - assert(bytes > 0); - if (bytes <= alloc_bytes_remaining_) { - char* result = alloc_ptr_; - alloc_ptr_ += bytes; - alloc_bytes_remaining_ -= bytes; - return result; - } - return AllocateFallback(bytes); -} - -} - -#endif  // STORAGE_LEVELDB_UTIL_ARENA_H_ diff --git a/util/arena_test.cc b/util/arena_test.cc deleted file mode 100644 index c33b552..0000000 --- a/util/arena_test.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/arena.h" - -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { - -class ArenaTest { }; - -TEST(ArenaTest, Empty) { - Arena arena; -} - -TEST(ArenaTest, Simple) { - std::vector<std::pair<size_t, char*> > allocated; - Arena arena; - const int N = 100000; - size_t bytes = 0; - Random rnd(301); - for (int i = 0; i < N; i++) { - size_t s; - if (i % (N / 10) == 0) { - s = i; - } else { - s = rnd.OneIn(4000) ? rnd.Uniform(6000) : - (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); - } - if (s == 0) { - // Our arena disallows size 0 allocations. - s = 1; - } - char* r; - if (rnd.OneIn(10)) { - r = arena.AllocateAligned(s); - } else { - r = arena.Allocate(s); - } - - for (int b = 0; b < s; b++) { - // Fill the "i"th allocation with a known bit pattern - r[b] = i % 256; - } - bytes += s; - allocated.push_back(std::make_pair(s, r)); - ASSERT_GE(arena.MemoryUsage(), bytes); - if (i > N/10) { - ASSERT_LE(arena.MemoryUsage(), bytes * 1.10); - } - } - for (int i = 0; i < allocated.size(); i++) { - size_t num_bytes = allocated[i].first; - const char* p = allocated[i].second; - for (int b = 0; b < num_bytes; b++) { - // Check the "i"th allocation for the known bit pattern - ASSERT_EQ(int(p[b]) & 0xff, i % 256); - } - } -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/util/cache.cc b/util/cache.cc deleted file mode 100644 index d8a4426..0000000 --- a/util/cache.cc +++ /dev/null @@ -1,253 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) -#include <unordered_set> -#elif defined(LEVELDB_PLATFORM_CHROMIUM) -#include "base/hash_tables.h" -#else -#include <hash_set> // TODO(sanjay): Switch to unordered_set when possible. -#endif - -#include <assert.h> - -#include "leveldb/cache.h" -#include "port/port.h" -#include "util/hash.h" -#include "util/mutexlock.h" - -namespace leveldb { - -Cache::~Cache() { -} - -namespace { - -// LRU cache implementation - -// An entry is a variable length heap-allocated structure.
Entries -// are kept in a circular doubly linked list ordered by access time. -struct LRUHandle { - void* value; - void (*deleter)(const Slice&, void* value); - LRUHandle* next; - LRUHandle* prev; - size_t charge;      // TODO(opt): Only allow uint32_t? - size_t key_length; - size_t refs;        // TODO(opt): Pack with "key_length"? - char key_data[1];   // Beginning of key - - Slice key() const { - // For cheaper lookups, we allow a temporary Handle object - // to store a pointer to a key in "value". - if (next == this) { - return *(reinterpret_cast<Slice*>(value)); - } else { - return Slice(key_data, key_length); - } - } -}; - -// Pick a platform specific hash_set instantiation -#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN) - // Microsoft's hash_set deviates from the standard. See - // http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx - // for details. Basically the 2 param () operator is a less than and - // the 1 param () operator is a hash function. - struct HandleHashCompare : public stdext::hash_compare<LRUHandle*> { - size_t operator() (LRUHandle* h) const { - Slice k = h->key(); - return Hash(k.data(), k.size(), 0); - } - bool operator() (LRUHandle* a, LRUHandle* b) const { - return a->key().compare(b->key()) < 0; - } - }; - typedef base::hash_set<LRUHandle*, HandleHashCompare> HandleTable; -#else - struct HandleHash { - inline size_t operator()(LRUHandle* h) const { - Slice k = h->key(); - return Hash(k.data(), k.size(), 0); - } - }; - - struct HandleEq { - inline bool operator()(LRUHandle* a, LRUHandle* b) const { - return a->key() == b->key(); - } - }; -#  if defined(LEVELDB_PLATFORM_CHROMIUM) - typedef base::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable; -#  elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) - typedef std::unordered_set<LRUHandle*, HandleHash, HandleEq> HandleTable; -#  else - typedef __gnu_cxx::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable; -#  endif -#endif - -class LRUCache : public Cache { - public: - explicit LRUCache(size_t capacity); - virtual ~LRUCache(); - - virtual Handle* Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value)); - virtual Handle* Lookup(const Slice& key); - virtual void Release(Handle* handle); - virtual void* Value(Handle* handle); - virtual void Erase(const Slice& key); - virtual uint64_t NewId(); - - private: - void LRU_Remove(LRUHandle* e); - void LRU_Append(LRUHandle* e); - void Unref(LRUHandle* e); - - // Constructor parameters - const size_t capacity_; - - // mutex_ protects the following state. - port::Mutex mutex_; - size_t usage_; - uint64_t last_id_; - - // Dummy head of LRU list. - // lru.prev is newest entry, lru.next is oldest entry.
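 - // (The list is circular: in an empty cache, lru_.next and lru_.prev both
 - // point back at lru_ itself, as set up in the constructor below.)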
- LRUHandle lru_; - - HandleTable table_; -}; - -LRUCache::LRUCache(size_t capacity) - : capacity_(capacity), - usage_(0), - last_id_(0) { - // Make empty circular linked list - lru_.next = &lru_; - lru_.prev = &lru_; -} - -LRUCache::~LRUCache() { - table_.clear(); - for (LRUHandle* e = lru_.next; e != &lru_; ) { - LRUHandle* next = e->next; - assert(e->refs == 1);  // Error if caller has an unreleased handle - Unref(e); - e = next; - } -} - -void LRUCache::Unref(LRUHandle* e) { - assert(e->refs > 0); - e->refs--; - if (e->refs <= 0) { - usage_ -= e->charge; - (*e->deleter)(e->key(), e->value); - free(e); - } -} - -void LRUCache::LRU_Remove(LRUHandle* e) { - e->next->prev = e->prev; - e->prev->next = e->next; -} - -void LRUCache::LRU_Append(LRUHandle* e) { - // Make "e" newest entry by inserting just before lru_ - e->next = &lru_; - e->prev = lru_.prev; - e->prev->next = e; - e->next->prev = e; -} - -Cache::Handle* LRUCache::Lookup(const Slice& key) { - MutexLock l(&mutex_); - - LRUHandle dummy; - dummy.next = &dummy; - dummy.value = const_cast<Slice*>(&key); - HandleTable::iterator iter = table_.find(&dummy); - if (iter == table_.end()) { - return NULL; - } else { - LRUHandle* e = const_cast<LRUHandle*>(*iter); - e->refs++; - LRU_Remove(e); - LRU_Append(e); - return reinterpret_cast<Cache::Handle*>(e); - } -} - -void* LRUCache::Value(Handle* handle) { - return reinterpret_cast<LRUHandle*>(handle)->value; -} - -void LRUCache::Release(Handle* handle) { - MutexLock l(&mutex_); - Unref(reinterpret_cast<LRUHandle*>(handle)); -} - -Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value)) { - MutexLock l(&mutex_); - - LRUHandle* e = reinterpret_cast<LRUHandle*>( - malloc(sizeof(LRUHandle)-1 + key.size())); - e->value = value; - e->deleter = deleter; - e->charge = charge; - e->key_length = key.size(); - e->refs = 2;  // One from LRUCache, one for the returned handle - memcpy(e->key_data, key.data(), key.size()); - LRU_Append(e); - usage_ += charge; - - std::pair<HandleTable::iterator, bool> p = table_.insert(e); - if (!p.second) { - // Kill existing entry - LRUHandle* old = const_cast<LRUHandle*>(*(p.first)); - LRU_Remove(old); - table_.erase(p.first); - table_.insert(e); - Unref(old); - } - - while (usage_ > capacity_ && lru_.next != &lru_) { - LRUHandle* old = lru_.next; - LRU_Remove(old); - table_.erase(old); - Unref(old); - } - - return reinterpret_cast<Cache::Handle*>(e); -} - -void LRUCache::Erase(const Slice& key) { - MutexLock l(&mutex_); - - LRUHandle dummy; - dummy.next = &dummy; - dummy.value = const_cast<Slice*>(&key); - HandleTable::iterator iter = table_.find(&dummy); - if (iter != table_.end()) { - LRUHandle* e = const_cast<LRUHandle*>(*iter); - LRU_Remove(e); - table_.erase(iter); - Unref(e); - } -} - -uint64_t LRUCache::NewId() { - MutexLock l(&mutex_); - return ++(last_id_); -} - -}  // end anonymous namespace - -Cache* NewLRUCache(size_t capacity) { - return new LRUCache(capacity); -} - -} diff --git a/util/cache_test.cc b/util/cache_test.cc deleted file mode 100644 index dbab988..0000000 --- a/util/cache_test.cc +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/cache.h" - -#include <vector> -#include "util/coding.h" -#include "util/testharness.h" - -namespace leveldb { - -// Conversions between numeric keys/values and the types expected by Cache.
-static std::string EncodeKey(int k) { - std::string result; - PutFixed32(&result, k); - return result; -} -static int DecodeKey(const Slice& k) { - assert(k.size() == 4); - return DecodeFixed32(k.data()); -} -static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); } -static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); } - -class CacheTest { - public: - static CacheTest* current_; - - static void Deleter(const Slice& key, void* v) { - current_->deleted_keys_.push_back(DecodeKey(key)); - current_->deleted_values_.push_back(DecodeValue(v)); - } - - static const int kCacheSize = 100; - std::vector<int> deleted_keys_; - std::vector<int> deleted_values_; - Cache* cache_; - - CacheTest() : cache_(NewLRUCache(kCacheSize)) { - current_ = this; - } - - ~CacheTest() { - delete cache_; - } - - int Lookup(int key) { - Cache::Handle* handle = cache_->Lookup(EncodeKey(key)); - const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle)); - if (handle != NULL) { - cache_->Release(handle); - } - return r; - } - - void Insert(int key, int value, int charge = 1) { - cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge, - &CacheTest::Deleter)); - } - - void Erase(int key) { - cache_->Erase(EncodeKey(key)); - } -}; -CacheTest* CacheTest::current_; - -TEST(CacheTest, HitAndMiss) { - ASSERT_EQ(-1, Lookup(100)); - - Insert(100, 101); - ASSERT_EQ(101, Lookup(100)); - ASSERT_EQ(-1, Lookup(200)); - ASSERT_EQ(-1, Lookup(300)); - - Insert(200, 201); - ASSERT_EQ(101, Lookup(100)); - ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(-1, Lookup(300)); - - Insert(100, 102); - ASSERT_EQ(102, Lookup(100)); - ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(-1, Lookup(300)); - - ASSERT_EQ(1, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); - ASSERT_EQ(101, deleted_values_[0]); -} - -TEST(CacheTest, Erase) { - Erase(200); - ASSERT_EQ(0, deleted_keys_.size()); - - Insert(100, 101); - Insert(200, 201); - Erase(100); - ASSERT_EQ(-1, Lookup(100)); - ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(1, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); - ASSERT_EQ(101, deleted_values_[0]); - - Erase(100); - ASSERT_EQ(-1, Lookup(100)); - ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(1, deleted_keys_.size()); -} - -TEST(CacheTest, EntriesArePinned) { - Insert(100, 101); - Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); - ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); - - Insert(100, 102); - Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); - ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); - ASSERT_EQ(0, deleted_keys_.size()); - - cache_->Release(h1); - ASSERT_EQ(1, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); - ASSERT_EQ(101, deleted_values_[0]); - - Erase(100); - ASSERT_EQ(-1, Lookup(100)); - ASSERT_EQ(1, deleted_keys_.size()); - - cache_->Release(h2); - ASSERT_EQ(2, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[1]); - ASSERT_EQ(102, deleted_values_[1]); -} - -TEST(CacheTest, EvictionPolicy) { - Insert(100, 101); - Insert(200, 201); - - // Frequently used entry must be kept around - for (int i = 0; i < kCacheSize; i++) { - Insert(1000+i, 2000+i); - ASSERT_EQ(2000+i, Lookup(1000+i)); - ASSERT_EQ(101, Lookup(100)); - } - ASSERT_EQ(101, Lookup(100)); - ASSERT_EQ(2, deleted_keys_.size()); - ASSERT_EQ(200, deleted_keys_[0]); - ASSERT_EQ(201, deleted_values_[0]); -} - -TEST(CacheTest, HeavyEntry) { - Insert(100, 101); - Insert(200, 201, kCacheSize); - ASSERT_EQ(1, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); - ASSERT_EQ(101, deleted_values_[0]); -} -
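The eviction tests above hinge on the charge argument to Insert(): the cache evicts from the cold end of the LRU list as soon as the sum of charges exceeds the capacity passed to NewLRUCache(), and a looked-up handle pins its entry until Release(). A minimal stand-alone sketch of that usage pattern against the public Cache interface (the value plumbing and NoopDeleter here are illustrative, not part of the API):

#include <stdio.h>
#include "leveldb/cache.h"

static void NoopDeleter(const leveldb::Slice& key, void* value) {
  // Invoked only once an entry is both evicted (or erased) and unpinned.
}

int main() {
  leveldb::Cache* cache = leveldb::NewLRUCache(100);  // capacity == total charge
  int value = 42;
  // Insert() returns a handle pinned on the caller's behalf; release it so
  // the entry becomes evictable once usage exceeds capacity.
  cache->Release(cache->Insert(leveldb::Slice("k"), &value, 1, &NoopDeleter));
  leveldb::Cache::Handle* h = cache->Lookup(leveldb::Slice("k"));
  if (h != NULL) {
    printf("k -> %d\n", *reinterpret_cast<int*>(cache->Value(h)));
    cache->Release(h);  // unpin; the entry may now be evicted
  }
  delete cache;
  return 0;
}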
-TEST(CacheTest, NewId) { - uint64_t a = cache_->NewId(); - uint64_t b = cache_->NewId(); - ASSERT_NE(a, b); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/util/coding.cc b/util/coding.cc deleted file mode 100644 index 680e2ad..0000000 --- a/util/coding.cc +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/coding.h" - -namespace leveldb { - -void EncodeFixed32(char* buf, uint32_t value) { -#if __BYTE_ORDER == __LITTLE_ENDIAN - memcpy(buf, &value, sizeof(value)); -#else - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; -#endif -} - -void EncodeFixed64(char* buf, uint64_t value) { -#if __BYTE_ORDER == __LITTLE_ENDIAN - memcpy(buf, &value, sizeof(value)); -#else - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; - buf[4] = (value >> 32) & 0xff; - buf[5] = (value >> 40) & 0xff; - buf[6] = (value >> 48) & 0xff; - buf[7] = (value >> 56) & 0xff; -#endif -} - -void PutFixed32(std::string* dst, uint32_t value) { - char buf[sizeof(value)]; - EncodeFixed32(buf, value); - dst->append(buf, sizeof(buf)); -} - -void PutFixed64(std::string* dst, uint64_t value) { - char buf[sizeof(value)]; - EncodeFixed64(buf, value); - dst->append(buf, sizeof(buf)); -} - -char* EncodeVarint32(char* dst, uint32_t v) { - // Operate on characters as unsigneds - unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); - static const int B = 128; - if (v < (1<<7)) { - *(ptr++) = v; - } else if (v < (1<<14)) { - *(ptr++) = v | B; - *(ptr++) = v>>7; - } else if (v < (1<<21)) { - *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = v>>14; - } else if (v < (1<<28)) { - *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = (v>>14) | B; - *(ptr++) = v>>21; - } else { - *(ptr++) = v | B; - *(ptr++) = (v>>7) | B; - *(ptr++) = (v>>14) | B; - *(ptr++) = (v>>21) | B; - *(ptr++) = v>>28; - } - return reinterpret_cast<char*>(ptr); -} - -void PutVarint32(std::string* dst, uint32_t v) { - char buf[5]; - char* ptr = EncodeVarint32(buf, v); - dst->append(buf, ptr - buf); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; - unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); - while (v >= B) { - *(ptr++) = (v & (B-1)) | B; - v >>= 7; - } - *(ptr++) = v; - return reinterpret_cast<char*>(ptr); -} - -void PutVarint64(std::string* dst, uint64_t v) { - char buf[10]; - char* ptr = EncodeVarint64(buf, v); - dst->append(buf, ptr - buf); -} - -void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { - PutVarint32(dst, value.size()); - dst->append(value.data(), value.size()); -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -const char* GetVarint32PtrFallback(const char* p, - const char* limit, - uint32_t* value) { - uint32_t result = 0; - for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { - uint32_t byte = *(reinterpret_cast<const unsigned char*>(p)); - p++; - if (byte & 128) { - // More bytes are present - result |= ((byte & 127) << shift); - } else { - result |= (byte << shift); - *value = result; - return reinterpret_cast<const char*>(p); - } - } - return NULL; -} - -bool GetVarint32(Slice* input, uint32_t* value) { - const char* p = input->data(); - const char* limit = p +
input->size(); - const char* q = GetVarint32Ptr(p, limit, value); - if (q == NULL) { - return false; - } else { - *input = Slice(q, limit - q); - return true; - } -} - -const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { - uint64_t result = 0; - for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { - uint64_t byte = *(reinterpret_cast<const unsigned char*>(p)); - p++; - if (byte & 128) { - // More bytes are present - result |= ((byte & 127) << shift); - } else { - result |= (byte << shift); - *value = result; - return reinterpret_cast<const char*>(p); - } - } - return NULL; -} - -bool GetVarint64(Slice* input, uint64_t* value) { - const char* p = input->data(); - const char* limit = p + input->size(); - const char* q = GetVarint64Ptr(p, limit, value); - if (q == NULL) { - return false; - } else { - *input = Slice(q, limit - q); - return true; - } -} - -const char* GetLengthPrefixedSlice(const char* p, const char* limit, - Slice* result) { - uint32_t len; - p = GetVarint32Ptr(p, limit, &len); - if (p == NULL) return NULL; - if (p + len > limit) return NULL; - *result = Slice(p, len); - return p + len; -} - -bool GetLengthPrefixedSlice(Slice* input, Slice* result) { - uint32_t len; - if (GetVarint32(input, &len) && - input->size() >= len) { - *result = Slice(input->data(), len); - input->remove_prefix(len); - return true; - } else { - return false; - } -} - -} diff --git a/util/coding.h b/util/coding.h deleted file mode 100644 index 8755968..0000000 --- a/util/coding.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Endian-neutral encoding: -// * Fixed-length numbers are encoded with least-significant byte first -// * In addition we support variable length "varint" encoding -// * Strings are encoded prefixed by their length in varint format - -#ifndef STORAGE_LEVELDB_UTIL_CODING_H_ -#define STORAGE_LEVELDB_UTIL_CODING_H_ - -#include <stdint.h> -#include <string.h> -#include <string> -#include "leveldb/slice.h" -#include "port/port.h" - -namespace leveldb { - -// Standard Put... routines append to a string -extern void PutFixed32(std::string* dst, uint32_t value); -extern void PutFixed64(std::string* dst, uint64_t value); -extern void PutVarint32(std::string* dst, uint32_t value); -extern void PutVarint64(std::string* dst, uint64_t value); -extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); - -// Standard Get... routines parse a value from the beginning of a Slice -// and advance the slice past the parsed value. -extern bool GetVarint32(Slice* input, uint32_t* value); -extern bool GetVarint64(Slice* input, uint64_t* value); -extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); - -// Pointer-based variants of GetVarint... These either store a value -// in *v and return a pointer just past the parsed value, or return -// NULL on error. These routines only look at bytes in the range -// [p..limit-1] -extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); -extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); - -// Returns the length of the varint32 or varint64 encoding of "v" -extern int VarintLength(uint64_t v); - -// Lower-level versions of Put...
that write directly into a character buffer -// REQUIRES: dst has enough space for the value being written -extern void EncodeFixed32(char* dst, uint32_t value); -extern void EncodeFixed64(char* dst, uint64_t value); - -// Lower-level versions of Put... that write directly into a character buffer -// and return a pointer just past the last byte written. -// REQUIRES: dst has enough space for the value being written -extern char* EncodeVarint32(char* dst, uint32_t value); -extern char* EncodeVarint64(char* dst, uint64_t value); - -// Lower-level versions of Get... that read directly from a character buffer -// without any bounds checking. - -inline uint32_t DecodeFixed32(const char* ptr) { - if (port::kLittleEndian) { - // Load the raw bytes - uint32_t result; - memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load - return result; - } else { - return ((static_cast<uint32_t>(ptr[0])) - | (static_cast<uint32_t>(ptr[1]) << 8) - | (static_cast<uint32_t>(ptr[2]) << 16) - | (static_cast<uint32_t>(ptr[3]) << 24)); - } -} - -inline uint64_t DecodeFixed64(const char* ptr) { - if (port::kLittleEndian) { - // Load the raw bytes - uint64_t result; - memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load - return result; - } else { - uint64_t lo = DecodeFixed32(ptr); - uint64_t hi = DecodeFixed32(ptr + 4); - return (hi << 32) | lo; - } -} - -// Internal routine for use by fallback path of GetVarint32Ptr -extern const char* GetVarint32PtrFallback(const char* p, - const char* limit, - uint32_t* value); -inline const char* GetVarint32Ptr(const char* p, - const char* limit, - uint32_t* value) { - if (p < limit) { - uint32_t result = *(reinterpret_cast<const unsigned char*>(p)); - if ((result & 128) == 0) { - *value = result; - return p + 1; - } - } - return GetVarint32PtrFallback(p, limit, value); -} - -} - -#endif  // STORAGE_LEVELDB_UTIL_CODING_H_ diff --git a/util/coding_test.cc b/util/coding_test.cc deleted file mode 100644 index a8dba04..0000000 --- a/util/coding_test.cc +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors.
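The varint format documented above stores seven payload bits per byte, least-significant group first, with the high bit of each byte acting as a continuation flag: 300 (binary 100101100) therefore encodes as the two bytes 0xac 0x02. A short round trip through the helpers declared above, exercised by the tests that follow (a sketch; the function name is ours):

#include <assert.h>
#include <string>
#include "util/coding.h"

void VarintRoundTripExample() {
  std::string buf;
  leveldb::PutVarint32(&buf, 300);   // appends "\xac\x02"
  assert(buf.size() == 2);
  leveldb::Slice input(buf);
  uint32_t decoded = 0;
  // GetVarint32 parses the value and advances the slice past it.
  assert(leveldb::GetVarint32(&input, &decoded));
  assert(decoded == 300 && input.empty());
}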
- -#include "util/coding.h" - -#include "util/testharness.h" - -namespace leveldb { - -class Coding { }; - -TEST(Coding, Fixed32) { - std::string s; - for (uint32_t v = 0; v < 100000; v++) { - PutFixed32(&s, v); - } - - const char* p = s.data(); - for (uint32_t v = 0; v < 100000; v++) { - uint32_t actual = DecodeFixed32(p); - ASSERT_EQ(v, actual); - p += sizeof(uint32_t); - } -} - -TEST(Coding, Fixed64) { - std::string s; - for (int power = 0; power <= 63; power++) { - uint64_t v = static_cast<uint64_t>(1) << power; - PutFixed64(&s, v - 1); - PutFixed64(&s, v + 0); - PutFixed64(&s, v + 1); - } - - const char* p = s.data(); - for (int power = 0; power <= 63; power++) { - uint64_t v = static_cast<uint64_t>(1) << power; - uint64_t actual; - actual = DecodeFixed64(p); - ASSERT_EQ(v-1, actual); - p += sizeof(uint64_t); - - actual = DecodeFixed64(p); - ASSERT_EQ(v+0, actual); - p += sizeof(uint64_t); - - actual = DecodeFixed64(p); - ASSERT_EQ(v+1, actual); - p += sizeof(uint64_t); - } -} - -TEST(Coding, Varint32) { - std::string s; - for (uint32_t i = 0; i < (32 * 32); i++) { - uint32_t v = (i / 32) << (i % 32); - PutVarint32(&s, v); - } - - const char* p = s.data(); - const char* limit = p + s.size(); - for (uint32_t i = 0; i < (32 * 32); i++) { - uint32_t expected = (i / 32) << (i % 32); - uint32_t actual; - const char* start = p; - p = GetVarint32Ptr(p, limit, &actual); - ASSERT_TRUE(p != NULL); - ASSERT_EQ(expected, actual); - ASSERT_EQ(VarintLength(actual), p - start); - } - ASSERT_EQ(p, s.data() + s.size()); -} - -TEST(Coding, Varint64) { - // Construct the list of values to check - std::vector<uint64_t> values; - // Some special values - values.push_back(0); - values.push_back(100); - values.push_back(~static_cast<uint64_t>(0)); - values.push_back(~static_cast<uint64_t>(0) - 1); - for (uint32_t k = 0; k < 64; k++) { - // Test values near powers of two - const uint64_t power = 1ull << k; - values.push_back(power); - values.push_back(power-1); - values.push_back(power+1); - }; - - std::string s; - for (int i = 0; i < values.size(); i++) { - PutVarint64(&s, values[i]); - } - - const char* p = s.data(); - const char* limit = p + s.size(); - for (int i = 0; i < values.size(); i++) { - ASSERT_TRUE(p < limit); - uint64_t actual; - const char* start = p; - p = GetVarint64Ptr(p, limit, &actual); - ASSERT_TRUE(p != NULL); - ASSERT_EQ(values[i], actual); - ASSERT_EQ(VarintLength(actual), p - start); - } - ASSERT_EQ(p, limit); - -} - -TEST(Coding, Varint32Overflow) { - uint32_t result; - std::string input("\x81\x82\x83\x84\x85\x11"); - ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) - == NULL); -} - -TEST(Coding, Varint32Truncation) { - uint32_t large_value = (1u << 31) + 100; - std::string s; - PutVarint32(&s, large_value); - uint32_t result; - for (int len = 0; len < s.size() - 1; len++) { - ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL); - } - ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL); - ASSERT_EQ(large_value, result); -} - -TEST(Coding, Varint64Overflow) { - uint64_t result; - std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); - ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result) - == NULL); -} - -TEST(Coding, Varint64Truncation) { - uint64_t large_value = (1ull << 63) + 100ull; - std::string s; - PutVarint64(&s, large_value); - uint64_t result; - for (int len = 0; len < s.size() - 1; len++) { - ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL); - } - ASSERT_TRUE(GetVarint64Ptr(s.data(),
s.data() + s.size(), &result) != NULL); - ASSERT_EQ(large_value, result); -} - -TEST(Coding, Strings) { - std::string s; - PutLengthPrefixedSlice(&s, Slice("")); - PutLengthPrefixedSlice(&s, Slice("foo")); - PutLengthPrefixedSlice(&s, Slice("bar")); - PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); - - Slice input(s); - Slice v; - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ("", v.ToString()); - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ("foo", v.ToString()); - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ("bar", v.ToString()); - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ(std::string(200, 'x'), v.ToString()); - ASSERT_EQ("", input.ToString()); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/util/comparator.cc b/util/comparator.cc deleted file mode 100644 index e2b27e3..0000000 --- a/util/comparator.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include <algorithm> -#include "leveldb/comparator.h" -#include "leveldb/slice.h" -#include "util/logging.h" - -namespace leveldb { - -Comparator::~Comparator() { } - -namespace { -class BytewiseComparatorImpl : public Comparator { - public: - BytewiseComparatorImpl() { } - - virtual const char* Name() const { - return "leveldb.BytewiseComparator"; - } - - virtual int Compare(const Slice& a, const Slice& b) const { - return a.compare(b); - } - - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const { - // Find length of common prefix - size_t min_length = std::min(start->size(), limit.size()); - size_t diff_index = 0; - while ((diff_index < min_length) && - ((*start)[diff_index] == limit[diff_index])) { - diff_index++; - } - - if (diff_index >= min_length) { - // Do not shorten if one string is a prefix of the other - } else { - uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]); - if (diff_byte < static_cast<uint8_t>(0xff) && - diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) { - (*start)[diff_index]++; - start->resize(diff_index + 1); - assert(Compare(*start, limit) < 0); - } - } - } - - virtual void FindShortSuccessor(std::string* key) const { - // Find first character that can be incremented - size_t n = key->size(); - for (int i = 0; i < n; i++) { - const uint8_t byte = (*key)[i]; - if (byte != static_cast<uint8_t>(0xff)) { - (*key)[i] = byte + 1; - key->resize(i+1); - return; - } - } - // *key is a run of 0xffs. Leave it alone. - } -}; -} -static const BytewiseComparatorImpl bytewise; - -const Comparator* BytewiseComparator() { - return &bytewise; -} - -} diff --git a/util/crc32c.cc b/util/crc32c.cc deleted file mode 100644 index 28c2401..0000000 --- a/util/crc32c.cc +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A portable implementation of crc32c, optimized to handle -// four bytes at a time.
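The four 256-entry tables that follow implement "slicing-by-4": table0_ is the classic byte-at-a-time table for the reflected CRC-32C (Castagnoli) polynomial 0x82f63b78, and table1_ through table3_ advance the remainder by one additional zero byte each, so Extend() can fold four input bytes per step. A sketch of how such tables can be generated (this generator is illustrative and not part of the checked-in source):

#include <stdint.h>

static uint32_t table0[256], table1[256], table2[256], table3[256];

static void BuildCrc32cTables() {
  for (uint32_t i = 0; i < 256; i++) {
    uint32_t crc = i;
    for (int j = 0; j < 8; j++) {
      // One bit of the reflected CRC-32C polynomial per iteration.
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78u : 0u);
    }
    table0[i] = crc;  // e.g. table0[1] == 0xf26b8303, matching table0_ below
  }
  for (uint32_t i = 0; i < 256; i++) {
    // table_k[i] advances the CRC of byte i by one more zero byte.
    table1[i] = (table0[i] >> 8) ^ table0[table0[i] & 0xff];
    table2[i] = (table1[i] >> 8) ^ table0[table1[i] & 0xff];
    table3[i] = (table2[i] >> 8) ^ table0[table2[i] & 0xff];
  }
}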
- -#include "util/crc32c.h" - -#include -#include "util/coding.h" - -namespace leveldb { -namespace crc32c { - -static const uint32_t table0_[256] = { - 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, - 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, - 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, - 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, - 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, - 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, - 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, - 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, - 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, - 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, - 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, - 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, - 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, - 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, - 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, - 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, - 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, - 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, - 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, - 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, - 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, - 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, - 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, - 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, - 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, - 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, - 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, - 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, - 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, - 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, - 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, - 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, - 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, - 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, - 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, - 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, - 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, - 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, - 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, - 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, - 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, - 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, - 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, - 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, - 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, - 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, - 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, - 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, - 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, - 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, - 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, - 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, - 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, - 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, - 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, - 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, - 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, - 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, - 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, - 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, - 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, - 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, - 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, - 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 -}; -static const uint32_t table1_[256] = { - 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, - 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, - 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, - 
0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, - 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, - 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, - 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, - 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, - 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, - 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, - 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, - 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, - 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, - 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, - 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, - 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, - 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, - 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, - 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, - 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, - 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, - 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, - 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, - 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, - 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, - 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, - 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, - 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, - 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, - 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, - 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, - 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, - 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, - 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, - 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, - 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, - 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, - 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, - 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, - 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, - 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, - 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, - 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, - 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, - 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, - 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, - 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, - 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, - 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, - 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, - 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, - 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, - 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, - 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, - 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, - 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, - 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, - 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, - 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, - 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, - 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, - 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, - 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, - 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 -}; -static const uint32_t table2_[256] = { - 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, - 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, - 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, - 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, - 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, - 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, - 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, - 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, - 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, - 0x7f9a5f0e, 
0xdadbcd70, 0x30f50d03, 0x95b49f7d, - 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, - 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, - 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, - 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, - 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, - 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, - 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, - 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, - 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, - 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, - 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, - 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, - 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, - 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, - 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, - 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, - 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, - 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, - 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, - 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, - 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, - 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, - 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, - 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, - 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, - 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, - 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, - 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, - 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, - 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, - 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, - 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, - 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, - 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, - 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, - 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, - 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, - 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, - 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, - 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, - 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, - 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, - 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, - 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, - 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, - 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, - 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, - 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, - 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, - 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, - 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, - 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, - 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, - 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 -}; -static const uint32_t table3_[256] = { - 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, - 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, - 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, - 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, - 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, - 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, - 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, - 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, - 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, - 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, - 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, - 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, - 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, - 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, - 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, - 0xb327f7a3, 0x6e625d1b, 
0x0c40d422, 0xd1057e9a, - 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, - 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, - 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, - 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, - 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, - 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, - 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, - 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, - 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, - 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, - 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, - 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, - 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, - 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, - 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, - 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, - 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, - 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, - 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, - 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, - 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, - 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, - 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, - 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, - 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, - 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, - 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, - 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, - 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, - 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, - 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, - 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, - 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, - 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, - 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, - 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, - 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, - 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, - 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, - 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, - 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, - 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, - 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, - 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, - 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, - 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, - 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, - 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 -}; - -// Used to fetch a naturally-aligned 32-bit word in little endian byte-order -static inline uint32_t LE_LOAD32(const uint8_t *p) { - return DecodeFixed32(reinterpret_cast(p)); -} - -uint32_t Extend(uint32_t crc, const char* buf, size_t size) { - const uint8_t *p = reinterpret_cast(buf); - const uint8_t *e = p + size; - uint32_t l = crc ^ 0xffffffffu; - -#define STEP1 do { \ - int c = (l & 0xff) ^ *p++; \ - l = table0_[c] ^ (l >> 8); \ -} while (0) -#define STEP4 do { \ - uint32_t c = l ^ LE_LOAD32(p); \ - p += 4; \ - l = table3_[c & 0xff] ^ \ - table2_[(c >> 8) & 0xff] ^ \ - table1_[(c >> 16) & 0xff] ^ \ - table0_[c >> 24]; \ -} while (0) - - // Point x at first 4-byte aligned byte in string. This might be - // just past the end of the string. 
- const uintptr_t pval = reinterpret_cast<uintptr_t>(p); - const uint8_t* x = reinterpret_cast<const uint8_t*>(((pval + 3) >> 2) << 2); - if (x <= e) { - // Process bytes until finished or p is 4-byte aligned - while (p != x) { - STEP1; - } - } - // Process bytes 16 at a time - while ((e-p) >= 16) { - STEP4; STEP4; STEP4; STEP4; - } - // Process bytes 4 at a time - while ((e-p) >= 4) { - STEP4; - } - // Process the last few bytes - while (p != e) { - STEP1; - } -#undef STEP4 -#undef STEP1 - return l ^ 0xffffffffu; -} - -} -} diff --git a/util/crc32c.h b/util/crc32c.h deleted file mode 100644 index 938d8ff..0000000 --- a/util/crc32c.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_ -#define STORAGE_LEVELDB_UTIL_CRC32C_H_ - -#include <stddef.h> -#include <stdint.h> - -namespace leveldb { -namespace crc32c { - -// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the -// crc32c of some string A. Extend() is often used to maintain the -// crc32c of a stream of data. -extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); - -// Return the crc32c of data[0,n-1] -inline uint32_t Value(const char* data, size_t n) { - return Extend(0, data, n); -} - -static const uint32_t kMaskDelta = 0xa282ead8ul; - -// Return a masked representation of crc. -// -// Motivation: it is problematic to compute the CRC of a string that -// contains embedded CRCs. Therefore we recommend that CRCs stored -// somewhere (e.g., in files) should be masked before being stored. -inline uint32_t Mask(uint32_t crc) { - // Rotate right by 15 bits and add a constant. - return ((crc >> 15) | (crc << 17)) + kMaskDelta; -} - -// Return the crc whose masked representation is masked_crc. -inline uint32_t Unmask(uint32_t masked_crc) { - uint32_t rot = masked_crc - kMaskDelta; - return ((rot >> 17) | (rot << 15)); -} - -} -} - -#endif  // STORAGE_LEVELDB_UTIL_CRC32C_H_ diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc deleted file mode 100644 index ba9e804..0000000 --- a/util/crc32c_test.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/crc32c.h" -#include "util/testharness.h" - -namespace leveldb { -namespace crc32c { - -class CRC { }; - -TEST(CRC, StandardResults) { - // From rfc3720 section B.4.
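 - // (The expected values below are the CRC-32C check values given there for
 - // 32 zero bytes, 32 0xff bytes, an ascending and a descending byte
 - // sequence, and an iSCSI Read command PDU.)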
- char buf[32]; - - memset(buf, 0, sizeof(buf)); - ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf))); - - memset(buf, 0xff, sizeof(buf)); - ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf))); - - for (int i = 0; i < 32; i++) { - buf[i] = i; - } - ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf))); - - for (int i = 0; i < 32; i++) { - buf[i] = 31 - i; - } - ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf))); - - unsigned char data[48] = { - 0x01, 0xc0, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x04, 0x00, - 0x00, 0x00, 0x00, 0x14, - 0x00, 0x00, 0x00, 0x18, - 0x28, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - }; - ASSERT_EQ(0xd9963a56, Value(reinterpret_cast(data), sizeof(data))); -} - -TEST(CRC, Values) { - ASSERT_NE(Value("a", 1), Value("foo", 3)); -} - -TEST(CRC, Extend) { - ASSERT_EQ(Value("hello world", 11), - Extend(Value("hello ", 6), "world", 5)); -} - -TEST(CRC, Mask) { - uint32_t crc = Value("foo", 3); - ASSERT_NE(crc, Mask(crc)); - ASSERT_NE(crc, Mask(Mask(crc))); - ASSERT_EQ(crc, Unmask(Mask(crc))); - ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc))))); -} - -} -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/util/env.cc b/util/env.cc deleted file mode 100644 index e5297e7..0000000 --- a/util/env.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/env.h" - -namespace leveldb { - -Env::~Env() { -} - -SequentialFile::~SequentialFile() { -} - -RandomAccessFile::~RandomAccessFile() { -} - -WritableFile::~WritableFile() { -} - -FileLock::~FileLock() { -} - -void Log(Env* env, WritableFile* info_log, const char* format, ...) { - va_list ap; - va_start(ap, format); - env->Logv(info_log, format, ap); - va_end(ap); -} - -Status WriteStringToFile(Env* env, const Slice& data, - const std::string& fname) { - WritableFile* file; - Status s = env->NewWritableFile(fname, &file); - if (!s.ok()) { - return s; - } - s = file->Append(data); - if (s.ok()) { - s = file->Close(); - } - delete file; // Will auto-close if we did not close above - if (!s.ok()) { - env->DeleteFile(fname); - } - return s; -} - -Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { - data->clear(); - SequentialFile* file; - Status s = env->NewSequentialFile(fname, &file); - if (!s.ok()) { - return s; - } - static const int kBufferSize = 8192; - char* space = new char[kBufferSize]; - while (true) { - Slice fragment; - s = file->Read(kBufferSize, &fragment, space); - if (!s.ok()) { - break; - } - data->append(fragment.data(), fragment.size()); - if (fragment.empty()) { - break; - } - } - delete[] space; - delete file; - return s; -} - -EnvWrapper::~EnvWrapper() { -} - -} diff --git a/util/env_chromium.cc b/util/env_chromium.cc deleted file mode 100644 index 7edc7a9..0000000 --- a/util/env_chromium.cc +++ /dev/null @@ -1,603 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
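Before util/env_chromium.cc's body below, a usage note on the helpers in util/env.cc above. They show the intended Env calling pattern: check the Status after every step, and treat an empty fragment from SequentialFile::Read() as end-of-file rather than an error. A hedged sketch (the file path is hypothetical):

  #include <string>
  #include "leveldb/env.h"

  leveldb::Status RoundTrip() {
    leveldb::Env* env = leveldb::Env::Default();
    // Write, then read back through the same Env.
    leveldb::Status s = leveldb::WriteStringToFile(env, "payload", "/tmp/env-demo");
    if (!s.ok()) return s;
    std::string contents;
    s = leveldb::ReadFileToString(env, "/tmp/env-demo", &contents);
    // On success contents == "payload"; the empty fragment inside
    // ReadFileToString signalled EOF, not an error.
    return s;
  }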
- -#include -#include -#include -#include "base/at_exit.h" -#include "base/file_path.h" -#include "base/file_util.h" -#include "base/lazy_instance.h" -#include "base/memory/ref_counted.h" -#include "base/message_loop.h" -#include "base/platform_file.h" -#include "base/process_util.h" -#include "base/synchronization/lock.h" -#include "base/sys_info.h" -#include "base/task.h" -#include "base/threading/platform_thread.h" -#include "base/threading/thread.h" -#include "base/utf_string_conversions.h" -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "port/port.h" -#include "util/logging.h" - -#if defined(OS_WIN) -#include -#include "base/win/win_util.h" -#endif - -#if defined(OS_MACOSX) || defined(OS_WIN) -// The following are glibc-specific -extern "C" { -size_t fread_unlocked(void *ptr, size_t size, size_t n, FILE *file) { - return fread(ptr, size, n, file); -} - -size_t fwrite_unlocked(const void *ptr, size_t size, size_t n, FILE *file) { - return fwrite(ptr, size, n, file); -} - -int fflush_unlocked(FILE *file) { - return fflush(file); -} - -int fdatasync(int fildes) { -#if defined(OS_WIN) - return _commit(fildes); -#else - return fsync(fildes); -#endif -} -} -#endif - -namespace leveldb { - -namespace { - -class Thread; - -static const ::FilePath::CharType kLevelDBTestDirectoryPrefix[] - = FILE_PATH_LITERAL("leveldb-test-"); - -::FilePath CreateFilePath(const std::string& file_path) { -#if defined(OS_WIN) - return FilePath(UTF8ToUTF16(file_path)); -#else - return FilePath(file_path); -#endif -} - -std::string FilePathToString(const ::FilePath& file_path) { -#if defined(OS_WIN) - return UTF16ToUTF8(file_path.value()); -#else - return file_path.value(); -#endif -} - -// TODO(jorlow): This should be moved into Chromium's base. -const char* PlatformFileErrorString(const ::base::PlatformFileError& error) { - switch (error) { - case ::base::PLATFORM_FILE_ERROR_FAILED: - return "Opening file failed."; - case ::base::PLATFORM_FILE_ERROR_IN_USE: - return "File currently in use."; - case ::base::PLATFORM_FILE_ERROR_EXISTS: - return "File already exists."; - case ::base::PLATFORM_FILE_ERROR_NOT_FOUND: - return "File not found."; - case ::base::PLATFORM_FILE_ERROR_ACCESS_DENIED: - return "Access denied."; - case ::base::PLATFORM_FILE_ERROR_TOO_MANY_OPENED: - return "Too many files open."; - case ::base::PLATFORM_FILE_ERROR_NO_MEMORY: - return "Out of memory."; - case ::base::PLATFORM_FILE_ERROR_NO_SPACE: - return "No space left on drive."; - case ::base::PLATFORM_FILE_ERROR_NOT_A_DIRECTORY: - return "Not a directory."; - case ::base::PLATFORM_FILE_ERROR_INVALID_OPERATION: - return "Invalid operation."; - case ::base::PLATFORM_FILE_ERROR_SECURITY: - return "Security error."; - case ::base::PLATFORM_FILE_ERROR_ABORT: - return "File operation aborted."; - case ::base::PLATFORM_FILE_ERROR_NOT_A_FILE: - return "The supplied path was not a file."; - case ::base::PLATFORM_FILE_ERROR_NOT_EMPTY: - return "The file was not empty."; - } - NOTIMPLEMENTED(); - return "Unknown error."; -} - -class ChromiumSequentialFile: public SequentialFile { - private: - std::string filename_; - FILE* file_; - - public: - ChromiumSequentialFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - virtual ~ChromiumSequentialFile() { fclose(file_); } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - Status s; - size_t r = fread_unlocked(scratch, 1, n, file_); - *result = Slice(scratch, r); - if (r < n) { - if (feof(file_)) { - // We leave status as ok if we hit the end of the file 
- } else { - // A partial read with an error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); - } - } - return s; - } -}; - -class ChromiumRandomAccessFile: public RandomAccessFile { - private: - std::string filename_; - ::base::PlatformFile file_; - - public: - ChromiumRandomAccessFile(const std::string& fname, ::base::PlatformFile file) - : filename_(fname), file_(file) { } - virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; - int r = ::base::ReadPlatformFile(file_, offset, scratch, n); - *result = Slice(scratch, (r < 0) ? 0 : r); - if (r < 0) { - // An error: return a non-ok status - s = Status::IOError(filename_, "Could not preform read"); - } - return s; - } -}; - -class ChromiumWritableFile : public WritableFile { - private: - std::string filename_; - FILE* file_; - - public: - ChromiumWritableFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - - ~ChromiumWritableFile() { - if (file_ != NULL) { - // Ignoring any potential errors - fclose(file_); - } - } - - virtual Status Append(const Slice& data) { - size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_); - Status result; - if (r != data.size()) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } - - virtual Status Close() { - Status result; - if (fclose(file_) != 0) { - result = Status::IOError(filename_, strerror(errno)); - } - file_ = NULL; - return result; - } - - virtual Status Flush() { - Status result; - if (fflush_unlocked(file_) != 0) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } - - virtual Status Sync() { - Status result; - if ((fflush_unlocked(file_) != 0) || - (fdatasync(fileno(file_)) != 0)) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } -}; - -class ChromiumFileLock : public FileLock { - public: - ::base::PlatformFile file_; -}; - -class ChromiumEnv : public Env { - public: - ChromiumEnv(); - virtual ~ChromiumEnv() { - fprintf(stderr, "Destroying Env::Default()\n"); - exit(1); - } - - virtual Status NewSequentialFile(const std::string& fname, - SequentialFile** result) { - FILE* f = fopen(fname.c_str(), "rb"); - if (f == NULL) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } else { - *result = new ChromiumSequentialFile(fname, f); - return Status::OK(); - } - } - - virtual Status NewRandomAccessFile(const std::string& fname, - RandomAccessFile** result) { - int flags = ::base::PLATFORM_FILE_READ | ::base::PLATFORM_FILE_OPEN; - bool created; - ::base::PlatformFileError error_code; - ::base::PlatformFile file = ::base::CreatePlatformFile( - CreateFilePath(fname), flags, &created, &error_code); - if (error_code != ::base::PLATFORM_FILE_OK) { - *result = NULL; - return Status::IOError(fname, PlatformFileErrorString(error_code)); - } - *result = new ChromiumRandomAccessFile(fname, file); - return Status::OK(); - } - - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { - *result = NULL; - FILE* f = fopen(fname.c_str(), "wb"); - if (f == NULL) { - return Status::IOError(fname, strerror(errno)); - } else { - *result = new ChromiumWritableFile(fname, f); - return Status::OK(); - } - } - - virtual bool FileExists(const std::string& fname) { - return ::file_util::PathExists(CreateFilePath(fname)); - } - - virtual Status GetChildren(const std::string& dir, - std::vector* result) { - 
result->clear(); - ::file_util::FileEnumerator iter( - CreateFilePath(dir), false, ::file_util::FileEnumerator::FILES); - ::FilePath current = iter.Next(); - while (!current.empty()) { - result->push_back(FilePathToString(current.BaseName())); - current = iter.Next(); - } - // TODO(jorlow): Unfortunately, the FileEnumerator swallows errors, so - // we'll always return OK. Maybe manually check for error - // conditions like the file not existing? - return Status::OK(); - } - - virtual Status DeleteFile(const std::string& fname) { - Status result; - // TODO(jorlow): Should we assert this is a file? - if (!::file_util::Delete(CreateFilePath(fname), false)) { - result = Status::IOError(fname, "Could not delete file."); - } - return result; - }; - - virtual Status CreateDir(const std::string& name) { - Status result; - if (!::file_util::CreateDirectory(CreateFilePath(name))) { - result = Status::IOError(name, "Could not create directory."); - } - return result; - }; - - virtual Status DeleteDir(const std::string& name) { - Status result; - // TODO(jorlow): Should we assert this is a directory? - if (!::file_util::Delete(CreateFilePath(name), false)) { - result = Status::IOError(name, "Could not delete directory."); - } - return result; - }; - - virtual Status GetFileSize(const std::string& fname, uint64_t* size) { - Status s; - int64_t signed_size; - if (!::file_util::GetFileSize(CreateFilePath(fname), &signed_size)) { - *size = 0; - s = Status::IOError(fname, "Could not determine file size."); - } else { - *size = static_cast(signed_size); - } - return s; - } - - virtual Status RenameFile(const std::string& src, const std::string& dst) { - Status result; - if (!::file_util::ReplaceFile(CreateFilePath(src), CreateFilePath(dst))) { - result = Status::IOError(src, "Could not rename file."); - } - return result; - } - - virtual Status LockFile(const std::string& fname, FileLock** lock) { - *lock = NULL; - Status result; - int flags = ::base::PLATFORM_FILE_OPEN_ALWAYS | - ::base::PLATFORM_FILE_READ | - ::base::PLATFORM_FILE_WRITE | - ::base::PLATFORM_FILE_EXCLUSIVE_READ | - ::base::PLATFORM_FILE_EXCLUSIVE_WRITE; - bool created; - ::base::PlatformFileError error_code; - ::base::PlatformFile file = ::base::CreatePlatformFile( - CreateFilePath(fname), flags, &created, &error_code); - if (error_code != ::base::PLATFORM_FILE_OK) { - result = Status::IOError(fname, PlatformFileErrorString(error_code)); - } else { - ChromiumFileLock* my_lock = new ChromiumFileLock; - my_lock->file_ = file; - *lock = my_lock; - } - return result; - } - - virtual Status UnlockFile(FileLock* lock) { - ChromiumFileLock* my_lock = reinterpret_cast(lock); - Status result; - if (!::base::ClosePlatformFile(my_lock->file_)) { - result = Status::IOError("Could not close lock file."); - } - delete my_lock; - return result; - } - - virtual void Schedule(void (*function)(void*), void* arg); - - virtual void StartThread(void (*function)(void* arg), void* arg); - - virtual std::string UserIdentifier() { -#if defined(OS_WIN) - std::wstring user_sid; - bool ret = ::base::win::GetUserSidString(&user_sid); - DCHECK(ret); - return UTF16ToUTF8(user_sid); -#else - char buf[100]; - snprintf(buf, sizeof(buf), "%d", int(geteuid())); - return buf; -#endif - } - - virtual Status GetTestDirectory(std::string* path) { - mu_.Acquire(); - if (test_directory_.empty()) { - if (!::file_util::CreateNewTempDirectory(kLevelDBTestDirectoryPrefix, - &test_directory_)) { - mu_.Release(); - return Status::IOError("Could not create temp directory."); - } - } - 
*path = FilePathToString(test_directory_); - mu_.Release(); - return Status::OK(); - } - - virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { - // TODO(jorlow): We may want to just use Chromium's built in logging. - - uint64_t thread_id = 0; - // Coppied from base/logging.cc. -#if defined(OS_WIN) - thread_id = GetCurrentThreadId(); -#elif defined(OS_MACOSX) - thread_id = mach_thread_self(); -#elif defined(OS_LINUX) - thread_id = syscall(__NR_gettid); -#elif defined(OS_FREEBSD) || defined(OS_NACL) - // TODO(BSD): find a better thread ID - pthread_t tid = pthread_self(); - memcpy(&thread_id, &tid, min(sizeof(r), sizeof(tid))); -#endif - - // We try twice: the first time with a fixed-size stack allocated buffer, - // and the second time with a much larger dynamically allocated buffer. - char buffer[500]; - for (int iter = 0; iter < 2; iter++) { - char* base; - int bufsize; - if (iter == 0) { - bufsize = sizeof(buffer); - base = buffer; - } else { - bufsize = 30000; - base = new char[bufsize]; - } - char* p = base; - char* limit = base + bufsize; - - ::base::Time::Exploded t; - ::base::Time::Now().LocalExplode(&t); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - t.year, - t.month, - t.day_of_month, - t.hour, - t.minute, - t.second, - static_cast(t.millisecond) * 1000, - static_cast(thread_id)); - - // Print the message - if (p < limit) { - va_list backup_ap; - va_copy(backup_ap, ap); - p += vsnprintf(p, limit - p, format, backup_ap); - va_end(backup_ap); - } - - // Truncate to available space if necessary - if (p >= limit) { - if (iter == 0) { - continue; // Try again with larger buffer - } else { - p = limit - 1; - } - } - - // Add newline if necessary - if (p == base || p[-1] != '\n') { - *p++ = '\n'; - } - - assert(p <= limit); - info_log->Append(Slice(base, p - base)); - info_log->Flush(); - if (base != buffer) { - delete[] base; - } - break; - } - } - - virtual int AppendLocalTimeToBuffer(char* buffer, size_t size) { - ::base::Time::Exploded t; - ::base::Time::Now().LocalExplode(&t); - return snprintf(buffer, size, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d", - t.year, - t.month, - t.day_of_month, - t.hour, - t.minute, - t.second, - static_cast(t.millisecond) * 1000); - } - - virtual uint64_t NowMicros() { - return ::base::TimeTicks::HighResNow().ToInternalValue(); - } - - virtual void SleepForMicroseconds(int micros) { - // Round up to the next millisecond. 
- ::base::PlatformThread::Sleep((micros + 999) / 1000); - } - - private: - // BGThread() is the body of the background thread - void BGThread(); - static void BGThreadWrapper(void* arg) { - reinterpret_cast(arg)->BGThread(); - } - - FilePath test_directory_; - - size_t page_size_; - ::base::Lock mu_; - ::base::ConditionVariable bgsignal_; - bool started_bgthread_; - - // Entry per Schedule() call - struct BGItem { void* arg; void (*function)(void*); }; - typedef std::deque BGQueue; - BGQueue queue_; -}; - -ChromiumEnv::ChromiumEnv() - : page_size_(::base::SysInfo::VMAllocationGranularity()), - bgsignal_(&mu_), - started_bgthread_(false) { -#if defined(OS_MACOSX) - ::base::EnableTerminationOnHeapCorruption(); - ::base::EnableTerminationOnOutOfMemory(); -#endif // OS_MACOSX -} - -class Thread : public ::base::PlatformThread::Delegate { - public: - Thread(void (*function)(void* arg), void* arg) - : function_(function), arg_(arg) { - ::base::PlatformThreadHandle handle; - bool success = ::base::PlatformThread::Create(0, this, &handle); - DCHECK(success); - } - virtual ~Thread() {} - virtual void ThreadMain() { - (*function_)(arg_); - delete this; - } - - private: - void (*function_)(void* arg); - void* arg_; -}; - -void ChromiumEnv::Schedule(void (*function)(void*), void* arg) { - mu_.Acquire(); - - // Start background thread if necessary - if (!started_bgthread_) { - started_bgthread_ = true; - StartThread(&ChromiumEnv::BGThreadWrapper, this); - } - - // If the queue is currently empty, the background thread may currently be - // waiting. - if (queue_.empty()) { - bgsignal_.Signal(); - } - - // Add to priority queue - queue_.push_back(BGItem()); - queue_.back().function = function; - queue_.back().arg = arg; - - mu_.Release(); -} - -void ChromiumEnv::BGThread() { - while (true) { - // Wait until there is an item that is ready to run - mu_.Acquire(); - while (queue_.empty()) { - bgsignal_.Wait(); - } - - void (*function)(void*) = queue_.front().function; - void* arg = queue_.front().arg; - queue_.pop_front(); - - mu_.Release(); - (*function)(arg); - } -} - -void ChromiumEnv::StartThread(void (*function)(void* arg), void* arg) { - new Thread(function, arg); // Will self-delete. -} - -::base::LazyInstance > - default_env(::base::LINKER_INITIALIZED); - -} - -Env* Env::Default() { - return default_env.Pointer(); -} - -} diff --git a/util/env_posix.cc b/util/env_posix.cc deleted file mode 100644 index 5cddb0c..0000000 --- a/util/env_posix.cc +++ /dev/null @@ -1,599 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
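Before util/env_posix.cc below, a note on Logv(): both the Chromium implementation above and the POSIX one that follows format log lines with the same two-pass trick, trying a 500-byte stack buffer first and retrying once with a 30000-byte heap buffer only if vsnprintf() reports truncation. A stripped-down sketch of just that pattern (the function name is illustrative, not the library's API):

  #include <stdarg.h>
  #include <stdio.h>
  #include <string>

  // Format with a stack buffer first; retry once on truncation.
  std::string FormatLogLine(const char* format, ...) {
    char stack_buf[500];
    std::string result;
    for (int iter = 0; iter < 2; iter++) {
      const int bufsize = (iter == 0) ? static_cast<int>(sizeof(stack_buf)) : 30000;
      char* base = (iter == 0) ? stack_buf : new char[bufsize];
      va_list ap;
      va_start(ap, format);
      int n = vsnprintf(base, bufsize, format, ap);
      va_end(ap);
      if (n >= 0 && n < bufsize) {
        result.assign(base, n);          // it fit; done
        if (iter != 0) delete[] base;
        break;
      }
      if (iter != 0) {                   // still too big: keep the truncation
        result.assign(base, bufsize - 1);
        delete[] base;
      }
    }
    return result;
  }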
- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(LEVELDB_PLATFORM_ANDROID) -#include -#endif -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "port/port.h" -#include "util/logging.h" - -namespace leveldb { - -namespace { - -class PosixSequentialFile: public SequentialFile { - private: - std::string filename_; - FILE* file_; - - public: - PosixSequentialFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - virtual ~PosixSequentialFile() { fclose(file_); } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - Status s; - size_t r = fread_unlocked(scratch, 1, n, file_); - *result = Slice(scratch, r); - if (r < n) { - if (feof(file_)) { - // We leave status as ok if we hit the end of the file - } else { - // A partial read with an error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); - } - } - return s; - } -}; - -class PosixRandomAccessFile: public RandomAccessFile { - private: - std::string filename_; - int fd_; - - public: - PosixRandomAccessFile(const std::string& fname, int fd) - : filename_(fname), fd_(fd) { } - virtual ~PosixRandomAccessFile() { close(fd_); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; - ssize_t r = pread(fd_, scratch, n, static_cast(offset)); - *result = Slice(scratch, (r < 0) ? 0 : r); - if (r < 0) { - // An error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); - } - return s; - } -}; - -// We preallocate up to an extra megabyte and use memcpy to append new -// data to the file. This is safe since we either properly close the -// file before reading from it, or for log files, the reading code -// knows enough to skip zero suffixes. -class PosixMmapFile : public WritableFile { - private: - std::string filename_; - int fd_; - size_t page_size_; - size_t map_size_; // How much extra memory to map at a time - char* base_; // The mapped region - char* limit_; // Limit of the mapped region - char* dst_; // Where to write next (in range [base_,limit_]) - char* last_sync_; // Where have we synced up to - uint64_t file_offset_; // Offset of base_ in file - - // Have we done an munmap of unsynced data? 
- bool pending_sync_; - - // Roundup x to a multiple of y - static size_t Roundup(size_t x, size_t y) { - return ((x + y - 1) / y) * y; - } - - size_t TruncateToPageBoundary(size_t s) { - s -= (s & (page_size_ - 1)); - assert((s % page_size_) == 0); - return s; - } - - void UnmapCurrentRegion() { - if (base_ != NULL) { - if (last_sync_ < limit_) { - // Defer syncing this data until next Sync() call, if any - pending_sync_ = true; - } - munmap(base_, limit_ - base_); - file_offset_ += limit_ - base_; - base_ = NULL; - limit_ = NULL; - last_sync_ = NULL; - dst_ = NULL; - - // Increase the amount we map the next time, but capped at 1MB - if (map_size_ < (1<<20)) { - map_size_ *= 2; - } - } - } - - bool MapNewRegion() { - assert(base_ == NULL); - if (ftruncate(fd_, file_offset_ + map_size_) < 0) { - return false; - } - void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, - fd_, file_offset_); - if (ptr == MAP_FAILED) { - return false; - } - base_ = reinterpret_cast(ptr); - limit_ = base_ + map_size_; - dst_ = base_; - last_sync_ = base_; - return true; - } - - public: - PosixMmapFile(const std::string& fname, int fd, size_t page_size) - : filename_(fname), - fd_(fd), - page_size_(page_size), - map_size_(Roundup(65536, page_size)), - base_(NULL), - limit_(NULL), - dst_(NULL), - last_sync_(NULL), - file_offset_(0), - pending_sync_(false) { - assert((page_size & (page_size - 1)) == 0); - } - - - ~PosixMmapFile() { - if (fd_ >= 0) { - PosixMmapFile::Close(); - } - } - - virtual Status Append(const Slice& data) { - const char* src = data.data(); - size_t left = data.size(); - while (left > 0) { - assert(base_ <= dst_); - assert(dst_ <= limit_); - size_t avail = limit_ - dst_; - if (avail == 0) { - UnmapCurrentRegion(); - MapNewRegion(); - } - - size_t n = (left <= avail) ? left : avail; - memcpy(dst_, src, n); - dst_ += n; - src += n; - left -= n; - } - return Status::OK(); - } - - virtual Status Close() { - Status s; - size_t unused = limit_ - dst_; - UnmapCurrentRegion(); - if (unused > 0) { - // Trim the extra space at the end of the file - if (ftruncate(fd_, file_offset_ - unused) < 0) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - if (close(fd_) < 0) { - if (s.ok()) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - fd_ = -1; - base_ = NULL; - limit_ = NULL; - return s; - } - - virtual Status Flush() { - return Status::OK(); - } - - virtual Status Sync() { - Status s; - - if (pending_sync_) { - // Some unmapped data was not synced - pending_sync_ = false; - if (fdatasync(fd_) < 0) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - if (dst_ > last_sync_) { - // Find the beginnings of the pages that contain the first and last - // bytes to be synced. - size_t p1 = TruncateToPageBoundary(last_sync_ - base_); - size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); - last_sync_ = dst_; - if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - return s; - } -}; - -static int LockOrUnlock(int fd, bool lock) { - errno = 0; - struct flock f; - memset(&f, 0, sizeof(f)); - f.l_type = (lock ? 
F_WRLCK : F_UNLCK); - f.l_whence = SEEK_SET; - f.l_start = 0; - f.l_len = 0; // Lock/unlock entire file - return fcntl(fd, F_SETLK, &f); -} - -class PosixFileLock : public FileLock { - public: - int fd_; -}; - -class PosixEnv : public Env { - public: - PosixEnv(); - virtual ~PosixEnv() { - fprintf(stderr, "Destroying Env::Default()\n"); - exit(1); - } - - virtual Status NewSequentialFile(const std::string& fname, - SequentialFile** result) { - FILE* f = fopen(fname.c_str(), "r"); - if (f == NULL) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } else { - *result = new PosixSequentialFile(fname, f); - return Status::OK(); - } - } - - virtual Status NewRandomAccessFile(const std::string& fname, - RandomAccessFile** result) { - int fd = open(fname.c_str(), O_RDONLY); - if (fd < 0) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } - *result = new PosixRandomAccessFile(fname, fd); - return Status::OK(); - } - - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { - Status s; - const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); - if (fd < 0) { - *result = NULL; - s = Status::IOError(fname, strerror(errno)); - } else { - *result = new PosixMmapFile(fname, fd, page_size_); - } - return s; - } - - virtual bool FileExists(const std::string& fname) { - return access(fname.c_str(), F_OK) == 0; - } - - virtual Status GetChildren(const std::string& dir, - std::vector* result) { - result->clear(); - DIR* d = opendir(dir.c_str()); - if (d == NULL) { - return Status::IOError(dir, strerror(errno)); - } - struct dirent* entry; - while ((entry = readdir(d)) != NULL) { - result->push_back(entry->d_name); - } - closedir(d); - return Status::OK(); - } - - virtual Status DeleteFile(const std::string& fname) { - Status result; - if (unlink(fname.c_str()) != 0) { - result = Status::IOError(fname, strerror(errno)); - } - return result; - }; - - virtual Status CreateDir(const std::string& name) { - Status result; - if (mkdir(name.c_str(), 0755) != 0) { - result = Status::IOError(name, strerror(errno)); - } - return result; - }; - - virtual Status DeleteDir(const std::string& name) { - Status result; - if (rmdir(name.c_str()) != 0) { - result = Status::IOError(name, strerror(errno)); - } - return result; - }; - - virtual Status GetFileSize(const std::string& fname, uint64_t* size) { - Status s; - struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - *size = 0; - s = Status::IOError(fname, strerror(errno)); - } else { - *size = sbuf.st_size; - } - return s; - } - - virtual Status RenameFile(const std::string& src, const std::string& target) { - Status result; - if (rename(src.c_str(), target.c_str()) != 0) { - result = Status::IOError(src, strerror(errno)); - } - return result; - } - - virtual Status LockFile(const std::string& fname, FileLock** lock) { - *lock = NULL; - Status result; - int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); - if (fd < 0) { - result = Status::IOError(fname, strerror(errno)); - } else if (LockOrUnlock(fd, true) == -1) { - result = Status::IOError("lock " + fname, strerror(errno)); - close(fd); - } else { - PosixFileLock* my_lock = new PosixFileLock; - my_lock->fd_ = fd; - *lock = my_lock; - } - return result; - } - - virtual Status UnlockFile(FileLock* lock) { - PosixFileLock* my_lock = reinterpret_cast(lock); - Status result; - if (LockOrUnlock(my_lock->fd_, false) == -1) { - result = Status::IOError(strerror(errno)); - } - close(my_lock->fd_); - delete my_lock; - return result; - } 
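An aside on the LockFile()/UnlockFile() pair just above: fcntl(F_SETLK) gives advisory locking, which is what guards a leveldb directory against two processes opening the same database. A hedged usage sketch (the lock-file path is hypothetical; the real caller lives in db/):

  #include "leveldb/env.h"

  leveldb::Status WithExclusiveAccess(leveldb::Env* env) {
    leveldb::FileLock* lock = NULL;
    leveldb::Status s = env->LockFile("/tmp/demo-db/LOCK", &lock);
    if (!s.ok()) return s;         // e.g. another process owns the lock
    // ... work that must be single-process ...
    return env->UnlockFile(lock);  // releases the fcntl lock and frees the object
  }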
- - virtual void Schedule(void (*function)(void*), void* arg); - - virtual void StartThread(void (*function)(void* arg), void* arg); - - virtual Status GetTestDirectory(std::string* result) { - const char* env = getenv("TEST_TMPDIR"); - if (env && env[0] != '\0') { - *result = env; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid())); - *result = buf; - } - // Directory may already exist - CreateDir(*result); - return Status::OK(); - } - - virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { - pthread_t tid = pthread_self(); - uint64_t thread_id = 0; - memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); - - // We try twice: the first time with a fixed-size stack allocated buffer, - // and the second time with a much larger dynamically allocated buffer. - char buffer[500]; - for (int iter = 0; iter < 2; iter++) { - char* base; - int bufsize; - if (iter == 0) { - bufsize = sizeof(buffer); - base = buffer; - } else { - bufsize = 30000; - base = new char[bufsize]; - } - char* p = base; - char* limit = base + bufsize; - - struct timeval now_tv; - gettimeofday(&now_tv, NULL); - const time_t seconds = now_tv.tv_sec; - struct tm t; - localtime_r(&seconds, &t); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec, - static_cast(now_tv.tv_usec), - static_cast(thread_id)); - - // Print the message - if (p < limit) { - va_list backup_ap; - va_copy(backup_ap, ap); - p += vsnprintf(p, limit - p, format, backup_ap); - va_end(backup_ap); - } - - // Truncate to available space if necessary - if (p >= limit) { - if (iter == 0) { - continue; // Try again with larger buffer - } else { - p = limit - 1; - } - } - - // Add newline if necessary - if (p == base || p[-1] != '\n') { - *p++ = '\n'; - } - - assert(p <= limit); - info_log->Append(Slice(base, p - base)); - info_log->Flush(); - if (base != buffer) { - delete[] base; - } - break; - } - } - - virtual uint64_t NowMicros() { - struct timeval tv; - gettimeofday(&tv, NULL); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; - } - - virtual void SleepForMicroseconds(int micros) { - usleep(micros); - } - - private: - void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - exit(1); - } - } - - // BGThread() is the body of the background thread - void BGThread(); - static void* BGThreadWrapper(void* arg) { - reinterpret_cast(arg)->BGThread(); - return NULL; - } - - size_t page_size_; - pthread_mutex_t mu_; - pthread_cond_t bgsignal_; - pthread_t bgthread_; - bool started_bgthread_; - - // Entry per Schedule() call - struct BGItem { void* arg; void (*function)(void*); }; - typedef std::deque BGQueue; - BGQueue queue_; -}; - -PosixEnv::PosixEnv() : page_size_(getpagesize()), - started_bgthread_(false) { - PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); - PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); -} - -void PosixEnv::Schedule(void (*function)(void*), void* arg) { - PthreadCall("lock", pthread_mutex_lock(&mu_)); - - // Start background thread if necessary - if (!started_bgthread_) { - started_bgthread_ = true; - PthreadCall( - "create thread", - pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this)); - } - - // If the queue is currently empty, the background thread may currently be - // waiting. 
- if (queue_.empty()) { - PthreadCall("signal", pthread_cond_signal(&bgsignal_)); - } - - // Add to priority queue - queue_.push_back(BGItem()); - queue_.back().function = function; - queue_.back().arg = arg; - - PthreadCall("unlock", pthread_mutex_unlock(&mu_)); -} - -void PosixEnv::BGThread() { - while (true) { - // Wait until there is an item that is ready to run - PthreadCall("lock", pthread_mutex_lock(&mu_)); - while (queue_.empty()) { - PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); - } - - void (*function)(void*) = queue_.front().function; - void* arg = queue_.front().arg; - queue_.pop_front(); - - PthreadCall("unlock", pthread_mutex_unlock(&mu_)); - (*function)(arg); - } -} - -namespace { -struct StartThreadState { - void (*user_function)(void*); - void* arg; -}; -} -static void* StartThreadWrapper(void* arg) { - StartThreadState* state = reinterpret_cast(arg); - state->user_function(state->arg); - delete state; - return NULL; -} - -void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { - pthread_t t; - StartThreadState* state = new StartThreadState; - state->user_function = function; - state->arg = arg; - PthreadCall("start thread", - pthread_create(&t, NULL, &StartThreadWrapper, state)); -} - -} - -static pthread_once_t once = PTHREAD_ONCE_INIT; -static Env* default_env; -static void InitDefaultEnv() { default_env = new PosixEnv; } - -Env* Env::Default() { - pthread_once(&once, InitDefaultEnv); - return default_env; -} - -} diff --git a/util/env_test.cc b/util/env_test.cc deleted file mode 100644 index 3c253be..0000000 --- a/util/env_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
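Before the Env tests below, a sketch of the Schedule()/BGThread() hand-off implemented above: one mutex guards a deque of function/arg pairs, and a condition variable wakes the single worker only when the queue transitions from empty. This is a self-contained illustration of the same pattern, not the library's interface:

  #include <deque>
  #include <pthread.h>

  struct Task { void (*function)(void*); void* arg; };

  static pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t nonempty = PTHREAD_COND_INITIALIZER;
  static std::deque<Task> queue;

  void Schedule(void (*function)(void*), void* arg) {
    pthread_mutex_lock(&mu);
    if (queue.empty()) {
      pthread_cond_signal(&nonempty);  // a worker may be asleep
    }
    Task t;
    t.function = function;
    t.arg = arg;
    queue.push_back(t);
    pthread_mutex_unlock(&mu);
  }

  void* Worker(void*) {
    while (true) {
      pthread_mutex_lock(&mu);
      while (queue.empty()) {
        pthread_cond_wait(&nonempty, &mu);  // re-checks after every wakeup
      }
      Task t = queue.front();
      queue.pop_front();
      pthread_mutex_unlock(&mu);
      (*t.function)(t.arg);                 // run outside the lock
    }
    return NULL;
  }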
- -#include "leveldb/env.h" - -#include "port/port.h" -#include "util/testharness.h" - -namespace leveldb { - -static const int kDelayMicros = 100000; - -class EnvPosixTest { - private: - port::Mutex mu_; - std::string events_; - - public: - Env* env_; - EnvPosixTest() : env_(Env::Default()) { } -}; - -static void SetBool(void* ptr) { - *(reinterpret_cast(ptr)) = true; -} - -TEST(EnvPosixTest, RunImmediately) { - bool called = false; - env_->Schedule(&SetBool, &called); - Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_TRUE(called); -} - -TEST(EnvPosixTest, RunMany) { - int last_id = 0; - - struct CB { - int* last_id_ptr; // Pointer to shared slot - int id; // Order# for the execution of this callback - - CB(int* p, int i) : last_id_ptr(p), id(i) { } - - static void Run(void* v) { - CB* cb = reinterpret_cast(v); - ASSERT_EQ(cb->id-1, *cb->last_id_ptr); - *cb->last_id_ptr = cb->id; - } - }; - - // Schedule in different order than start time - CB cb1(&last_id, 1); - CB cb2(&last_id, 2); - CB cb3(&last_id, 3); - CB cb4(&last_id, 4); - env_->Schedule(&CB::Run, &cb1); - env_->Schedule(&CB::Run, &cb2); - env_->Schedule(&CB::Run, &cb3); - env_->Schedule(&CB::Run, &cb4); - - Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_EQ(4, last_id); -} - -struct State { - port::Mutex mu; - int val; - int num_running; -}; - -static void ThreadBody(void* arg) { - State* s = reinterpret_cast(arg); - s->mu.Lock(); - s->val += 1; - s->num_running -= 1; - s->mu.Unlock(); -} - -TEST(EnvPosixTest, StartThread) { - State state; - state.val = 0; - state.num_running = 3; - for (int i = 0; i < 3; i++) { - env_->StartThread(&ThreadBody, &state); - } - while (true) { - state.mu.Lock(); - int num = state.num_running; - state.mu.Unlock(); - if (num == 0) { - break; - } - Env::Default()->SleepForMicroseconds(kDelayMicros); - } - ASSERT_EQ(state.val, 3); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/util/hash.cc b/util/hash.cc deleted file mode 100644 index d19afd1..0000000 --- a/util/hash.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include "util/coding.h" -#include "util/hash.h" - -namespace leveldb { - -uint32_t Hash(const char* data, size_t n, uint32_t seed) { - // Similar to murmur hash - const uint32_t m = 0xc6a4a793; - const uint32_t r = 24; - const char* limit = data + n; - uint32_t h = seed ^ (n * m); - - // Pick up four bytes at a time - while (data + 4 <= limit) { - uint32_t w = DecodeFixed32(data); - data += 4; - h += w; - h *= m; - h ^= (h >> 16); - } - - // Pick up remaining bytes - switch (limit - data) { - case 3: - h += data[2] << 16; - // fall through - case 2: - h += data[1] << 8; - // fall through - case 1: - h += data[0]; - h *= m; - h ^= (h >> r); - break; - } - return h; -} - - -} diff --git a/util/hash.h b/util/hash.h deleted file mode 100644 index 8889d56..0000000 --- a/util/hash.h +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
-//
-// Simple hash function used for internal data structures
-
-#ifndef STORAGE_LEVELDB_UTIL_HASH_H_
-#define STORAGE_LEVELDB_UTIL_HASH_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-namespace leveldb {
-
-extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
-
-}
-
-#endif  // STORAGE_LEVELDB_UTIL_HASH_H_
diff --git a/util/histogram.cc b/util/histogram.cc
deleted file mode 100644
index c5178ef..0000000
--- a/util/histogram.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include <math.h>
-#include <stdio.h>
-#include "port/port.h"
-#include "util/histogram.h"
-
-namespace leveldb {
-
-const double Histogram::kBucketLimit[kNumBuckets] = {
-  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45,
-  50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450,
-  500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000,
-  3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000,
-  16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000,
-  70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000,
-  250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000,
-  900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000,
-  3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000,
-  9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000,
-  25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000,
-  70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000,
-  180000000, 200000000, 250000000, 300000000, 350000000, 400000000,
-  450000000, 500000000, 600000000, 700000000, 800000000, 900000000,
-  1000000000, 1200000000, 1400000000, 1600000000, 1800000000, 2000000000,
-  2500000000.0, 3000000000.0, 3500000000.0, 4000000000.0, 4500000000.0,
-  5000000000.0, 6000000000.0, 7000000000.0, 8000000000.0, 9000000000.0,
-  1e200,
-};
-
-void Histogram::Clear() {
-  min_ = kBucketLimit[kNumBuckets-1];
-  max_ = 0;
-  num_ = 0;
-  sum_ = 0;
-  sum_squares_ = 0;
-  for (int i = 0; i < kNumBuckets; i++) {
-    buckets_[i] = 0;
-  }
-}
-
-void Histogram::Add(double value) {
-  // Linear search is fast enough for our usage in db_bench
-  int b = 0;
-  while (b < kNumBuckets - 1 && kBucketLimit[b] <= value) {
-    b++;
-  }
-  buckets_[b] += 1.0;
-  if (min_ > value) min_ = value;
-  if (max_ < value) max_ = value;
-  num_++;
-  sum_ += value;
-  sum_squares_ += (value * value);
-}
-
-double Histogram::Median() const {
-  return Percentile(50.0);
-}
-
-double Histogram::Percentile(double p) const {
-  double threshold = num_ * (p / 100.0);
-  double sum = 0;
-  for (int b = 0; b < kNumBuckets; b++) {
-    sum += buckets_[b];
-    if (sum >= threshold) {
-      // Scale linearly within this bucket
-      double left_point = (b == 0) ?
0 : kBucketLimit[b-1];
-      double right_point = kBucketLimit[b];
-      double left_sum = sum - buckets_[b];
-      double right_sum = sum;
-      double pos = (threshold - left_sum) / (right_sum - left_sum);
-      double r = left_point + (right_point - left_point) * pos;
-      if (r < min_) r = min_;
-      if (r > max_) r = max_;
-      return r;
-    }
-  }
-  return max_;
-}
-
-double Histogram::Average() const {
-  if (num_ == 0.0) return 0;
-  return sum_ / num_;
-}
-
-double Histogram::StandardDeviation() const {
-  if (num_ == 0.0) return 0;
-  double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_);
-  return sqrt(variance);
-}
-
-std::string Histogram::ToString() const {
-  std::string r;
-  char buf[200];
-  snprintf(buf, sizeof(buf),
-           "Count: %.0f  Average: %.4f  StdDev: %.2f\n",
-           num_, Average(), StandardDeviation());
-  r.append(buf);
-  snprintf(buf, sizeof(buf),
-           "Min: %.4f  Median: %.4f  Max: %.4f\n",
-           (num_ == 0.0 ? 0.0 : min_), Median(), max_);
-  r.append(buf);
-  r.append("------------------------------------------------------\n");
-  const double mult = 100.0 / num_;
-  double sum = 0;
-  for (int b = 0; b < kNumBuckets; b++) {
-    if (buckets_[b] <= 0.0) continue;
-    sum += buckets_[b];
-    snprintf(buf, sizeof(buf),
-             "[ %7.0f, %7.0f ) %7.0f %7.3f%% %7.3f%% ",
-             ((b == 0) ? 0.0 : kBucketLimit[b-1]),   // left
-             kBucketLimit[b],                        // right
-             buckets_[b],                            // count
-             mult * buckets_[b],                     // percentage
-             mult * sum);                            // cumulative percentage
-    r.append(buf);
-
-    // Add hash marks based on percentage; 20 marks for 100%.
-    int marks = static_cast<int>(20*(buckets_[b] / num_) + 0.5);
-    r.append(marks, '#');
-    r.push_back('\n');
-  }
-  return r;
-}
-
-}
diff --git a/util/histogram.h b/util/histogram.h
deleted file mode 100644
index f72f122..0000000
--- a/util/histogram.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
-#define STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
-
-#include <string>
-
-namespace leveldb {
-
-class Histogram {
- public:
-  Histogram() { }
-  ~Histogram() { }
-
-  void Clear();
-  void Add(double value);
-
-  std::string ToString() const;
-
- private:
-  double min_;
-  double max_;
-  double num_;
-  double sum_;
-  double sum_squares_;
-
-  enum { kNumBuckets = 154 };
-  static const double kBucketLimit[kNumBuckets];
-  double buckets_[kNumBuckets];
-
-  double Median() const;
-  double Percentile(double p) const;
-  double Average() const;
-  double StandardDeviation() const;
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
diff --git a/util/logging.cc b/util/logging.cc
deleted file mode 100644
index 5c9bd4a..0000000
--- a/util/logging.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
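Before util/logging.cc below, a usage sketch for the Histogram class above. Its public surface is just Clear(), Add(), and ToString(); percentiles are computed privately by linear interpolation inside the matching kBucketLimit bucket. The values here are synthetic, not a benchmark:

  #include <stdio.h>
  #include "util/histogram.h"

  int main() {
    leveldb::Histogram hist;
    hist.Clear();                  // db_bench also clears before the first Add()
    for (int i = 0; i < 1000; i++) {
      hist.Add((i % 97) + 1.0);    // pretend these are latencies in micros
    }
    fprintf(stdout, "%s", hist.ToString().c_str());
    return 0;
  }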
- -#include "util/logging.h" - -#include -#include -#include -#include -#include "leveldb/env.h" -#include "leveldb/slice.h" - -namespace leveldb { - -void AppendNumberTo(std::string* str, uint64_t num) { - char buf[30]; - snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); - str->append(buf); -} - -void AppendEscapedStringTo(std::string* str, const Slice& value) { - for (int i = 0; i < value.size(); i++) { - char c = value[i]; - if (c >= ' ' && c <= '~') { - str->push_back(c); - } else { - char buf[10]; - snprintf(buf, sizeof(buf), "\\x%02x", - static_cast(c) & 0xff); - str->append(buf); - } - } -} - -std::string NumberToString(uint64_t num) { - std::string r; - AppendNumberTo(&r, num); - return r; -} - -std::string EscapeString(const Slice& value) { - std::string r; - AppendEscapedStringTo(&r, value); - return r; -} - -bool ConsumeChar(Slice* in, char c) { - if (!in->empty() && (*in)[0] == c) { - in->remove_prefix(1); - return true; - } else { - return false; - } -} - -bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { - uint64_t v = 0; - int digits = 0; - while (!in->empty()) { - char c = (*in)[0]; - if (c >= '0' && c <= '9') { - ++digits; - const int delta = (c - '0'); - static const uint64_t kMaxUint64 = ~static_cast(0); - if (v > kMaxUint64/10 || - (v == kMaxUint64/10 && delta > kMaxUint64%10)) { - // Overflow - return false; - } - v = (v * 10) + delta; - in->remove_prefix(1); - } else { - break; - } - } - *val = v; - return (digits > 0); -} - -} diff --git a/util/logging.h b/util/logging.h deleted file mode 100644 index 1cd0a4b..0000000 --- a/util/logging.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Must not be included from any .h files to avoid polluting the namespace -// with macros. - -#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_ -#define STORAGE_LEVELDB_UTIL_LOGGING_H_ - -#include -#include -#include -#include "port/port.h" - -namespace leveldb { - -class Slice; -class WritableFile; - -// Append a human-readable printout of "num" to *str -extern void AppendNumberTo(std::string* str, uint64_t num); - -// Append a human-readable printout of "value" to *str. -// Escapes any non-printable characters found in "value". -extern void AppendEscapedStringTo(std::string* str, const Slice& value); - -// Return a human-readable printout of "num" -extern std::string NumberToString(uint64_t num); - -// Return a human-readable version of "value". -// Escapes any non-printable characters found in "value". -extern std::string EscapeString(const Slice& value); - -// If *in starts with "c", advances *in past the first character and -// returns true. Otherwise, returns false. -extern bool ConsumeChar(Slice* in, char c); - -// Parse a human-readable number from "*in" into *value. On success, -// advances "*in" past the consumed number and sets "*val" to the -// numeric value. Otherwise, returns false and leaves *in in an -// unspecified state. -extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); - -} - -#endif // STORAGE_LEVELDB_UTIL_LOGGING_H_ diff --git a/util/mutexlock.h b/util/mutexlock.h deleted file mode 100644 index 05fe279..0000000 --- a/util/mutexlock.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ -#define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ - -#include "port/port.h" - -namespace leveldb { - -// Helper class that locks a mutex on construction and unlocks the mutex when -// the destructor of the MutexLock object is invoked. -// -// Typical usage: -// -// void MyClass::MyMethod() { -// MutexLock l(&mu_); // mu_ is an instance variable -// ... some complex code, possibly with multiple return paths ... -// } - -class MutexLock { - public: - explicit MutexLock(port::Mutex *mu) : mu_(mu) { - this->mu_->Lock(); - } - ~MutexLock() { this->mu_->Unlock(); } - - private: - port::Mutex *const mu_; - // No copying allowed - MutexLock(const MutexLock&); - void operator=(const MutexLock&); -}; - -} - - -#endif // STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ diff --git a/util/options.cc b/util/options.cc deleted file mode 100644 index 29272fe..0000000 --- a/util/options.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/options.h" - -#include "leveldb/comparator.h" -#include "leveldb/env.h" - -namespace leveldb { - -Options::Options() - : comparator(BytewiseComparator()), - create_if_missing(false), - error_if_exists(false), - paranoid_checks(false), - env(Env::Default()), - info_log(NULL), - write_buffer_size(4<<20), - max_open_files(1000), - large_value_threshold(65536), - block_cache(NULL), - block_size(4096), - block_restart_interval(16), - compression(kSnappyCompression) { -} - - -} diff --git a/util/random.h b/util/random.h deleted file mode 100644 index 2d458e8..0000000 --- a/util/random.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_ -#define STORAGE_LEVELDB_UTIL_RANDOM_H_ - -#include - -namespace leveldb { - -// A very simple random number generator. Not especially good at -// generating truly random bits, but good enough for our needs in this -// package. -class Random { - private: - uint32_t seed_; - public: - explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } - uint32_t Next() { - static const uint32_t M = 2147483647L; // 2^31-1 - static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 - // We are computing - // seed_ = (seed_ * A) % M, where M = 2^31-1 - // - // seed_ must not be zero or M, or else all subsequent computed values - // will be zero or M respectively. For all other values, seed_ will end - // up cycling through every number in [1,M-1] - uint64_t product = seed_ * A; - - // Compute (product % M) using the fact that ((x << 31) % M) == x. - seed_ = (product >> 31) + (product & M); - // The first reduction may overflow by 1 bit, so we may need to - // repeat. mod == M is not possible; using > allows the faster - // sign-bit-based test. - if (seed_ > M) { - seed_ -= M; - } - return seed_; - } - // Returns a uniformly distributed value in the range [0..n-1] - // REQUIRES: n > 0 - uint32_t Uniform(int n) { return Next() % n; } - - // Randomly returns true ~"1/n" of the time, and false otherwise. 
- // REQUIRES: n > 0 - bool OneIn(int n) { return (Next() % n) == 0; } - - // Skewed: pick "base" uniformly from range [0,max_log] and then - // return "base" random bits. The effect is to pick a number in the - // range [0,2^max_log-1] with exponential bias towards smaller numbers. - uint32_t Skewed(int max_log) { - return Uniform(1 << Uniform(max_log + 1)); - } -}; - -} - -#endif // STORAGE_LEVELDB_UTIL_RANDOM_H_ diff --git a/util/status.cc b/util/status.cc deleted file mode 100644 index d9b7195..0000000 --- a/util/status.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include "port/port.h" -#include "leveldb/status.h" - -namespace leveldb { - -Status::Status(Code code, const Slice& msg, const Slice& msg2) { - assert(code != kOk); - state_ = new State(make_pair(code, std::string(msg.data(), msg.size()))); - if (!msg2.empty()) { - state_->second.append(": "); - state_->second.append(msg2.data(), msg2.size()); - } -} - -std::string Status::ToString() const { - if (state_ == NULL) { - return "OK"; - } else { - char tmp[30]; - const char* type; - switch (state_->first) { - case kOk: - type = "OK"; - break; - case kNotFound: - type = "NotFound"; - break; - case kCorruption: - type = "Corruption: "; - break; - case kNotSupported: - type = "Not implemented: "; - break; - case kInvalidArgument: - type = "Invalid argument: "; - break; - case kIOError: - type = "IO error: "; - break; - default: - snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", - static_cast(state_->first)); - type = tmp; - break; - } - std::string result(type); - if (!state_->second.empty()) { - result.append(state_->second); - } - return result; - } -} - -} diff --git a/util/testharness.cc b/util/testharness.cc deleted file mode 100644 index b686ac3..0000000 --- a/util/testharness.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/testharness.h" - -#include -#include - -namespace leveldb { -namespace test { - -namespace { -struct Test { - const char* base; - const char* name; - void (*func)(); -}; -std::vector* tests; -} - -bool RegisterTest(const char* base, const char* name, void (*func)()) { - if (tests == NULL) { - tests = new std::vector; - } - Test t; - t.base = base; - t.name = name; - t.func = func; - tests->push_back(t); - return true; -} - -int RunAllTests() { - int num = 0; - if (tests != NULL) { - for (int i = 0; i < tests->size(); i++) { - const Test& t = (*tests)[i]; - fprintf(stderr, "==== Test %s.%s\n", t.base, t.name); - (*t.func)(); - ++num; - } - } - fprintf(stderr, "==== PASSED %d tests\n", num); - return 0; -} - -std::string TmpDir() { - std::string dir; - Status s = Env::Default()->GetTestDirectory(&dir); - ASSERT_TRUE(s.ok()) << s.ToString(); - return dir; -} - -int RandomSeed() { - const char* env = getenv("TEST_RANDOM_SEED"); - int result = (env != NULL ? atoi(env) : 301); - if (result <= 0) { - result = 301; - } - return result; -} - -} -} diff --git a/util/testharness.h b/util/testharness.h deleted file mode 100644 index 13ab914..0000000 --- a/util/testharness.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
-#define STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sstream>
-#include "leveldb/env.h"
-#include "leveldb/slice.h"
-#include "util/random.h"
-
-namespace leveldb {
-namespace test {
-
-// Run all tests registered by the TEST() macro.
-// Returns 0 if all tests pass.
-// Dies or returns a non-zero value if some test fails.
-extern int RunAllTests();
-
-// Return the directory to use for temporary storage.
-extern std::string TmpDir();
-
-// Return a randomization seed for this run.  Typically returns the
-// same number on repeated invocations of this binary, but automated
-// runs may be able to vary the seed.
-extern int RandomSeed();
-
-// An instance of Tester is allocated to hold temporary state during
-// the execution of an assertion.
-class Tester {
- private:
-  bool ok_;
-  const char* fname_;
-  int line_;
-  std::stringstream ss_;
-
- public:
-  Tester(const char* f, int l)
-      : ok_(true), fname_(f), line_(l) {
-  }
-
-  ~Tester() {
-    if (!ok_) {
-      fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str());
-      exit(1);
-    }
-  }
-
-  Tester& Is(bool b, const char* msg) {
-    if (!b) {
-      ss_ << " Assertion failure " << msg;
-      ok_ = false;
-    }
-    return *this;
-  }
-
-  Tester& IsOk(const Status& s) {
-    if (!s.ok()) {
-      ss_ << " " << s.ToString();
-      ok_ = false;
-    }
-    return *this;
-  }
-
-#define BINARY_OP(name,op)                              \
-  template <class X, class Y>                           \
-  Tester& name(const X& x, const Y& y) {                \
-    if (! (x op y)) {                                   \
-      ss_ << " failed: " << x << (" " #op " ") << y;    \
-      ok_ = false;                                      \
-    }                                                   \
-    return *this;                                       \
-  }
-
-  BINARY_OP(IsEq, ==)
-  BINARY_OP(IsNe, !=)
-  BINARY_OP(IsGe, >=)
-  BINARY_OP(IsGt, >)
-  BINARY_OP(IsLe, <=)
-  BINARY_OP(IsLt, <)
-#undef BINARY_OP
-
-  // Attach the specified value to the error message if an error has occurred
-  template <class V>
-  Tester& operator<<(const V& value) {
-    if (!ok_) {
-      ss_ << " " << value;
-    }
-    return *this;
-  }
-};
-
-#define ASSERT_TRUE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is((c), #c)
-#define ASSERT_OK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsOk((s))
-#define ASSERT_EQ(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsEq((a),(b))
-#define ASSERT_NE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsNe((a),(b))
-#define ASSERT_GE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGe((a),(b))
-#define ASSERT_GT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGt((a),(b))
-#define ASSERT_LE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLe((a),(b))
-#define ASSERT_LT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLt((a),(b))
-
-#define TCONCAT(a,b) TCONCAT1(a,b)
-#define TCONCAT1(a,b) a##b
-
-#define TEST(base,name)                                                 \
-class TCONCAT(_Test_,name) : public base {                              \
- public:                                                                \
-  void _Run();                                                          \
-  static void _RunIt() {                                                \
-    TCONCAT(_Test_,name) t;                                             \
-    t._Run();                                                           \
-  }                                                                     \
-};                                                                      \
-bool TCONCAT(_Test_ignored_,name) =                                     \
-  ::leveldb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \
-void TCONCAT(_Test_,name)::_Run()
-
-// Register the specified test.  Typically not used directly, but
-// invoked via the macro expansion of TEST.
-extern bool RegisterTest(const char* base, const char* name, void (*func)());
-
-
-}
-}
-
-#endif  // STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
diff --git a/util/testutil.cc b/util/testutil.cc
deleted file mode 100644
index 8d6cf3c..0000000
--- a/util/testutil.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "util/testutil.h"
-
-#include "util/random.h"
-
-namespace leveldb {
-namespace test {
-
-Slice RandomString(Random* rnd, int len, std::string* dst) {
-  dst->resize(len);
-  for (int i = 0; i < len; i++) {
-    (*dst)[i] = static_cast<char>(' ' + rnd->Uniform(95));   // ' ' .. '~'
-  }
-  return Slice(*dst);
-}
-
-std::string RandomKey(Random* rnd, int len) {
-  // Make sure to generate a wide variety of characters so we
-  // test the boundary conditions for short-key optimizations.
-  static const char kTestChars[] = {
-    '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff'
-  };
-  std::string result;
-  for (int i = 0; i < len; i++) {
-    result += kTestChars[rnd->Uniform(sizeof(kTestChars))];
-  }
-  return result;
-}
-
-
-extern Slice CompressibleString(Random* rnd, double compressed_fraction,
-                                int len, std::string* dst) {
-  int raw = static_cast<int>(len * compressed_fraction);
-  if (raw < 1) raw = 1;
-  std::string raw_data;
-  RandomString(rnd, raw, &raw_data);
-
-  // Duplicate the random data until we have filled "len" bytes
-  dst->clear();
-  while (dst->size() < len) {
-    dst->append(raw_data);
-  }
-  dst->resize(len);
-  return Slice(*dst);
-}
-
-}
-}
diff --git a/util/testutil.h b/util/testutil.h
deleted file mode 100644
index a150c1a..0000000
--- a/util/testutil.h
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_UTIL_TESTUTIL_H_
-#define STORAGE_LEVELDB_UTIL_TESTUTIL_H_
-
-#include "leveldb/env.h"
-#include "leveldb/slice.h"
-#include "util/random.h"
-
-namespace leveldb {
-namespace test {
-
-// Store in *dst a random string of length "len" and return a Slice that
-// references the generated data.
-extern Slice RandomString(Random* rnd, int len, std::string* dst);
-
-// Return a random key with the specified length that may contain interesting
-// characters (e.g. \x00, \xff, etc.).
-extern std::string RandomKey(Random* rnd, int len);
-
-// Store in *dst a string of length "len" that will compress to
-// "N*compressed_fraction" bytes and return a Slice that references
-// the generated data.
-extern Slice CompressibleString(Random* rnd, double compressed_fraction,
-                                int len, std::string* dst);
-
-// A wrapper that allows injection of errors.
-class ErrorEnv : public EnvWrapper { - public: - bool writable_file_error_; - int num_writable_file_errors_; - - ErrorEnv() : EnvWrapper(Env::Default()), - writable_file_error_(false), - num_writable_file_errors_(0) { } - - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { - if (writable_file_error_) { - ++num_writable_file_errors_; - *result = NULL; - return Status::IOError(fname, "fake error"); - } - return target()->NewWritableFile(fname, result); - } -}; - -} -} - -#endif // STORAGE_LEVELDB_UTIL_TESTUTIL_H_ -- cgit v1.2.3 From 80d4a4601a20e75a6c4245244e3cac2cf444e012 Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Tue, 19 Apr 2011 23:11:15 +0000 Subject: reverting disastrous MOE commit, returning to r21 git-svn-id: http://leveldb.googlecode.com/svn/trunk@23 62dab493-f737-651d-591e-8d6aee1b9529 --- AUTHORS | 8 + LICENSE | 27 + Makefile | 134 +++ README | 51 ++ TODO | 14 + db/builder.cc | 99 +++ db/builder.h | 36 + db/corruption_test.cc | 378 +++++++++ db/db_bench.cc | 635 +++++++++++++++ db/db_impl.cc | 1345 +++++++++++++++++++++++++++++++ db/db_impl.h | 207 +++++ db/db_iter.cc | 397 +++++++++ db/db_iter.h | 26 + db/db_test.cc | 1211 ++++++++++++++++++++++++++++ db/dbformat.cc | 152 ++++ db/dbformat.h | 204 +++++ db/dbformat_test.cc | 127 +++ db/filename.cc | 154 ++++ db/filename.h | 92 +++ db/filename_test.cc | 156 ++++ db/log_format.h | 35 + db/log_reader.cc | 176 ++++ db/log_reader.h | 75 ++ db/log_test.cc | 361 +++++++++ db/log_writer.cc | 102 +++ db/log_writer.h | 48 ++ db/memtable.cc | 109 +++ db/memtable.h | 69 ++ db/repair.cc | 396 +++++++++ db/skiplist.h | 378 +++++++++ db/skiplist_test.cc | 378 +++++++++ db/snapshot.h | 66 ++ db/table_cache.cc | 95 +++ db/table_cache.h | 50 ++ db/version_edit.cc | 301 +++++++ db/version_edit.h | 124 +++ db/version_edit_test.cc | 50 ++ db/version_set.cc | 1120 +++++++++++++++++++++++++ db/version_set.h | 332 ++++++++ db/write_batch.cc | 164 ++++ db/write_batch_internal.h | 73 ++ db/write_batch_test.cc | 110 +++ doc/doc.css | 89 ++ doc/impl.html | 228 ++++++ doc/index.html | 509 ++++++++++++ doc/log_format.txt | 75 ++ doc/table_format.txt | 61 ++ include/leveldb/cache.h | 99 +++ include/leveldb/comparator.h | 61 ++ include/leveldb/db.h | 142 ++++ include/leveldb/env.h | 290 +++++++ include/leveldb/iterator.h | 95 +++ include/leveldb/options.h | 208 +++++ include/leveldb/slice.h | 104 +++ include/leveldb/status.h | 86 ++ include/leveldb/table.h | 69 ++ include/leveldb/table_builder.h | 86 ++ include/leveldb/write_batch.h | 49 ++ leveldb.gyp | 327 ++++++++ leveldb/AUTHORS | 8 - leveldb/LICENSE | 27 - leveldb/Makefile | 129 --- leveldb/README | 51 -- leveldb/TODO | 14 - leveldb/db/builder.cc | 90 --- leveldb/db/builder.h | 36 - leveldb/db/corruption_test.cc | 354 -------- leveldb/db/db_bench.cc | 613 -------------- leveldb/db/db_impl.cc | 1188 --------------------------- leveldb/db/db_impl.h | 184 ----- leveldb/db/db_iter.cc | 298 ------- leveldb/db/db_iter.h | 26 - leveldb/db/db_test.cc | 1030 ----------------------- leveldb/db/dbformat.cc | 87 -- leveldb/db/dbformat.h | 155 ---- leveldb/db/dbformat_test.cc | 112 --- leveldb/db/filename.cc | 135 ---- leveldb/db/filename.h | 80 -- leveldb/db/filename_test.cc | 122 --- leveldb/db/log_format.h | 35 - leveldb/db/log_reader.cc | 176 ---- leveldb/db/log_reader.h | 75 -- leveldb/db/log_test.cc | 361 --------- leveldb/db/log_writer.cc | 102 --- leveldb/db/log_writer.h | 48 -- leveldb/db/memtable.cc | 109 --- leveldb/db/memtable.h | 69 -- leveldb/db/repair.cc 
| 380 --------- leveldb/db/skiplist.h | 378 --------- leveldb/db/skiplist_test.cc | 378 --------- leveldb/db/snapshot.h | 66 -- leveldb/db/table_cache.cc | 95 --- leveldb/db/table_cache.h | 50 -- leveldb/db/version_edit.cc | 268 ------ leveldb/db/version_edit.h | 106 --- leveldb/db/version_edit_test.cc | 46 -- leveldb/db/version_set.cc | 1027 ----------------------- leveldb/db/version_set.h | 308 ------- leveldb/db/write_batch.cc | 148 ---- leveldb/db/write_batch_internal.h | 69 -- leveldb/db/write_batch_test.cc | 87 -- leveldb/doc/doc.css | 89 -- leveldb/doc/impl.html | 217 ----- leveldb/doc/index.html | 498 ------------ leveldb/doc/log_format.txt | 75 -- leveldb/doc/table_format.txt | 61 -- leveldb/include/leveldb/cache.h | 99 --- leveldb/include/leveldb/comparator.h | 61 -- leveldb/include/leveldb/db.h | 142 ---- leveldb/include/leveldb/env.h | 290 ------- leveldb/include/leveldb/iterator.h | 95 --- leveldb/include/leveldb/options.h | 198 ----- leveldb/include/leveldb/slice.h | 104 --- leveldb/include/leveldb/status.h | 86 -- leveldb/include/leveldb/table.h | 69 -- leveldb/include/leveldb/table_builder.h | 86 -- leveldb/include/leveldb/write_batch.h | 49 -- leveldb/leveldb.gyp | 315 -------- leveldb/port/README | 10 - leveldb/port/port.h | 21 - leveldb/port/port_android.cc | 64 -- leveldb/port/port_android.h | 150 ---- leveldb/port/port_chromium.cc | 80 -- leveldb/port/port_chromium.h | 97 --- leveldb/port/port_example.h | 115 --- leveldb/port/port_posix.cc | 50 -- leveldb/port/port_posix.h | 94 --- leveldb/port/win/stdint.h | 24 - leveldb/table/block.cc | 263 ------ leveldb/table/block.h | 43 - leveldb/table/block_builder.cc | 109 --- leveldb/table/block_builder.h | 57 -- leveldb/table/format.cc | 131 --- leveldb/table/format.h | 103 --- leveldb/table/iterator.cc | 68 -- leveldb/table/iterator_wrapper.h | 64 -- leveldb/table/merger.cc | 197 ----- leveldb/table/merger.h | 26 - leveldb/table/table.cc | 175 ---- leveldb/table/table_builder.cc | 227 ------ leveldb/table/table_test.cc | 841 ------------------- leveldb/table/two_level_iterator.cc | 182 ----- leveldb/table/two_level_iterator.h | 34 - leveldb/util/arena.cc | 68 -- leveldb/util/arena.h | 68 -- leveldb/util/arena_test.cc | 68 -- leveldb/util/cache.cc | 253 ------ leveldb/util/cache_test.cc | 169 ---- leveldb/util/coding.cc | 194 ----- leveldb/util/coding.h | 104 --- leveldb/util/coding_test.cc | 173 ---- leveldb/util/comparator.cc | 72 -- leveldb/util/crc32c.cc | 332 -------- leveldb/util/crc32c.h | 45 -- leveldb/util/crc32c_test.cc | 72 -- leveldb/util/env.cc | 77 -- leveldb/util/env_chromium.cc | 603 -------------- leveldb/util/env_posix.cc | 599 -------------- leveldb/util/env_test.cc | 102 --- leveldb/util/hash.cc | 45 -- leveldb/util/hash.h | 19 - leveldb/util/histogram.cc | 128 --- leveldb/util/histogram.h | 41 - leveldb/util/logging.cc | 81 -- leveldb/util/logging.h | 47 -- leveldb/util/mutexlock.h | 39 - leveldb/util/options.cc | 28 - leveldb/util/random.h | 59 -- leveldb/util/status.cc | 59 -- leveldb/util/testharness.cc | 65 -- leveldb/util/testharness.h | 129 --- leveldb/util/testutil.cc | 51 -- leveldb/util/testutil.h | 53 -- port/README | 10 + port/port.h | 21 + port/port_android.cc | 64 ++ port/port_android.h | 158 ++++ port/port_chromium.cc | 80 ++ port/port_chromium.h | 104 +++ port/port_example.h | 120 +++ port/port_posix.cc | 50 ++ port/port_posix.h | 99 +++ port/sha1_portable.cc | 298 +++++++ port/sha1_portable.h | 25 + port/sha1_test.cc | 39 + port/win/stdint.h | 24 + table/block.cc | 261 ++++++ table/block.h 
| 43 + table/block_builder.cc | 109 +++ table/block_builder.h | 57 ++ table/format.cc | 131 +++ table/format.h | 103 +++ table/iterator.cc | 68 ++ table/iterator_wrapper.h | 64 ++ table/merger.cc | 197 +++++ table/merger.h | 26 + table/table.cc | 175 ++++ table/table_builder.cc | 227 ++++++ table/table_test.cc | 841 +++++++++++++++++++ table/two_level_iterator.cc | 182 +++++ table/two_level_iterator.h | 34 + util/arena.cc | 68 ++ util/arena.h | 68 ++ util/arena_test.cc | 68 ++ util/cache.cc | 253 ++++++ util/cache_test.cc | 169 ++++ util/coding.cc | 194 +++++ util/coding.h | 104 +++ util/coding_test.cc | 173 ++++ util/comparator.cc | 72 ++ util/crc32c.cc | 332 ++++++++ util/crc32c.h | 45 ++ util/crc32c_test.cc | 72 ++ util/env.cc | 77 ++ util/env_chromium.cc | 603 ++++++++++++++ util/env_posix.cc | 599 ++++++++++++++ util/env_test.cc | 102 +++ util/hash.cc | 45 ++ util/hash.h | 19 + util/histogram.cc | 128 +++ util/histogram.h | 41 + util/logging.cc | 81 ++ util/logging.h | 47 ++ util/mutexlock.h | 39 + util/options.cc | 29 + util/random.h | 59 ++ util/status.cc | 59 ++ util/testharness.cc | 65 ++ util/testharness.h | 129 +++ util/testutil.cc | 51 ++ util/testutil.h | 53 ++ 231 files changed, 20097 insertions(+), 18722 deletions(-) create mode 100644 AUTHORS create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README create mode 100644 TODO create mode 100644 db/builder.cc create mode 100644 db/builder.h create mode 100644 db/corruption_test.cc create mode 100644 db/db_bench.cc create mode 100644 db/db_impl.cc create mode 100644 db/db_impl.h create mode 100644 db/db_iter.cc create mode 100644 db/db_iter.h create mode 100644 db/db_test.cc create mode 100644 db/dbformat.cc create mode 100644 db/dbformat.h create mode 100644 db/dbformat_test.cc create mode 100644 db/filename.cc create mode 100644 db/filename.h create mode 100644 db/filename_test.cc create mode 100644 db/log_format.h create mode 100644 db/log_reader.cc create mode 100644 db/log_reader.h create mode 100644 db/log_test.cc create mode 100644 db/log_writer.cc create mode 100644 db/log_writer.h create mode 100644 db/memtable.cc create mode 100644 db/memtable.h create mode 100644 db/repair.cc create mode 100644 db/skiplist.h create mode 100644 db/skiplist_test.cc create mode 100644 db/snapshot.h create mode 100644 db/table_cache.cc create mode 100644 db/table_cache.h create mode 100644 db/version_edit.cc create mode 100644 db/version_edit.h create mode 100644 db/version_edit_test.cc create mode 100644 db/version_set.cc create mode 100644 db/version_set.h create mode 100644 db/write_batch.cc create mode 100644 db/write_batch_internal.h create mode 100644 db/write_batch_test.cc create mode 100644 doc/doc.css create mode 100644 doc/impl.html create mode 100644 doc/index.html create mode 100644 doc/log_format.txt create mode 100644 doc/table_format.txt create mode 100644 include/leveldb/cache.h create mode 100644 include/leveldb/comparator.h create mode 100644 include/leveldb/db.h create mode 100644 include/leveldb/env.h create mode 100644 include/leveldb/iterator.h create mode 100644 include/leveldb/options.h create mode 100644 include/leveldb/slice.h create mode 100644 include/leveldb/status.h create mode 100644 include/leveldb/table.h create mode 100644 include/leveldb/table_builder.h create mode 100644 include/leveldb/write_batch.h create mode 100644 leveldb.gyp delete mode 100644 leveldb/AUTHORS delete mode 100644 leveldb/LICENSE delete mode 100644 leveldb/Makefile delete mode 100644 leveldb/README delete mode 
100644 leveldb/TODO delete mode 100644 leveldb/db/builder.cc delete mode 100644 leveldb/db/builder.h delete mode 100644 leveldb/db/corruption_test.cc delete mode 100644 leveldb/db/db_bench.cc delete mode 100644 leveldb/db/db_impl.cc delete mode 100644 leveldb/db/db_impl.h delete mode 100644 leveldb/db/db_iter.cc delete mode 100644 leveldb/db/db_iter.h delete mode 100644 leveldb/db/db_test.cc delete mode 100644 leveldb/db/dbformat.cc delete mode 100644 leveldb/db/dbformat.h delete mode 100644 leveldb/db/dbformat_test.cc delete mode 100644 leveldb/db/filename.cc delete mode 100644 leveldb/db/filename.h delete mode 100644 leveldb/db/filename_test.cc delete mode 100644 leveldb/db/log_format.h delete mode 100644 leveldb/db/log_reader.cc delete mode 100644 leveldb/db/log_reader.h delete mode 100644 leveldb/db/log_test.cc delete mode 100644 leveldb/db/log_writer.cc delete mode 100644 leveldb/db/log_writer.h delete mode 100644 leveldb/db/memtable.cc delete mode 100644 leveldb/db/memtable.h delete mode 100644 leveldb/db/repair.cc delete mode 100644 leveldb/db/skiplist.h delete mode 100644 leveldb/db/skiplist_test.cc delete mode 100644 leveldb/db/snapshot.h delete mode 100644 leveldb/db/table_cache.cc delete mode 100644 leveldb/db/table_cache.h delete mode 100644 leveldb/db/version_edit.cc delete mode 100644 leveldb/db/version_edit.h delete mode 100644 leveldb/db/version_edit_test.cc delete mode 100644 leveldb/db/version_set.cc delete mode 100644 leveldb/db/version_set.h delete mode 100644 leveldb/db/write_batch.cc delete mode 100644 leveldb/db/write_batch_internal.h delete mode 100644 leveldb/db/write_batch_test.cc delete mode 100644 leveldb/doc/doc.css delete mode 100644 leveldb/doc/impl.html delete mode 100644 leveldb/doc/index.html delete mode 100644 leveldb/doc/log_format.txt delete mode 100644 leveldb/doc/table_format.txt delete mode 100644 leveldb/include/leveldb/cache.h delete mode 100644 leveldb/include/leveldb/comparator.h delete mode 100644 leveldb/include/leveldb/db.h delete mode 100644 leveldb/include/leveldb/env.h delete mode 100644 leveldb/include/leveldb/iterator.h delete mode 100644 leveldb/include/leveldb/options.h delete mode 100644 leveldb/include/leveldb/slice.h delete mode 100644 leveldb/include/leveldb/status.h delete mode 100644 leveldb/include/leveldb/table.h delete mode 100644 leveldb/include/leveldb/table_builder.h delete mode 100644 leveldb/include/leveldb/write_batch.h delete mode 100644 leveldb/leveldb.gyp delete mode 100644 leveldb/port/README delete mode 100644 leveldb/port/port.h delete mode 100644 leveldb/port/port_android.cc delete mode 100644 leveldb/port/port_android.h delete mode 100644 leveldb/port/port_chromium.cc delete mode 100644 leveldb/port/port_chromium.h delete mode 100644 leveldb/port/port_example.h delete mode 100644 leveldb/port/port_posix.cc delete mode 100644 leveldb/port/port_posix.h delete mode 100644 leveldb/port/win/stdint.h delete mode 100644 leveldb/table/block.cc delete mode 100644 leveldb/table/block.h delete mode 100644 leveldb/table/block_builder.cc delete mode 100644 leveldb/table/block_builder.h delete mode 100644 leveldb/table/format.cc delete mode 100644 leveldb/table/format.h delete mode 100644 leveldb/table/iterator.cc delete mode 100644 leveldb/table/iterator_wrapper.h delete mode 100644 leveldb/table/merger.cc delete mode 100644 leveldb/table/merger.h delete mode 100644 leveldb/table/table.cc delete mode 100644 leveldb/table/table_builder.cc delete mode 100644 leveldb/table/table_test.cc delete mode 100644 
leveldb/table/two_level_iterator.cc delete mode 100644 leveldb/table/two_level_iterator.h delete mode 100644 leveldb/util/arena.cc delete mode 100644 leveldb/util/arena.h delete mode 100644 leveldb/util/arena_test.cc delete mode 100644 leveldb/util/cache.cc delete mode 100644 leveldb/util/cache_test.cc delete mode 100644 leveldb/util/coding.cc delete mode 100644 leveldb/util/coding.h delete mode 100644 leveldb/util/coding_test.cc delete mode 100644 leveldb/util/comparator.cc delete mode 100644 leveldb/util/crc32c.cc delete mode 100644 leveldb/util/crc32c.h delete mode 100644 leveldb/util/crc32c_test.cc delete mode 100644 leveldb/util/env.cc delete mode 100644 leveldb/util/env_chromium.cc delete mode 100644 leveldb/util/env_posix.cc delete mode 100644 leveldb/util/env_test.cc delete mode 100644 leveldb/util/hash.cc delete mode 100644 leveldb/util/hash.h delete mode 100644 leveldb/util/histogram.cc delete mode 100644 leveldb/util/histogram.h delete mode 100644 leveldb/util/logging.cc delete mode 100644 leveldb/util/logging.h delete mode 100644 leveldb/util/mutexlock.h delete mode 100644 leveldb/util/options.cc delete mode 100644 leveldb/util/random.h delete mode 100644 leveldb/util/status.cc delete mode 100644 leveldb/util/testharness.cc delete mode 100644 leveldb/util/testharness.h delete mode 100644 leveldb/util/testutil.cc delete mode 100644 leveldb/util/testutil.h create mode 100644 port/README create mode 100644 port/port.h create mode 100644 port/port_android.cc create mode 100644 port/port_android.h create mode 100644 port/port_chromium.cc create mode 100644 port/port_chromium.h create mode 100644 port/port_example.h create mode 100644 port/port_posix.cc create mode 100644 port/port_posix.h create mode 100644 port/sha1_portable.cc create mode 100644 port/sha1_portable.h create mode 100644 port/sha1_test.cc create mode 100644 port/win/stdint.h create mode 100644 table/block.cc create mode 100644 table/block.h create mode 100644 table/block_builder.cc create mode 100644 table/block_builder.h create mode 100644 table/format.cc create mode 100644 table/format.h create mode 100644 table/iterator.cc create mode 100644 table/iterator_wrapper.h create mode 100644 table/merger.cc create mode 100644 table/merger.h create mode 100644 table/table.cc create mode 100644 table/table_builder.cc create mode 100644 table/table_test.cc create mode 100644 table/two_level_iterator.cc create mode 100644 table/two_level_iterator.h create mode 100644 util/arena.cc create mode 100644 util/arena.h create mode 100644 util/arena_test.cc create mode 100644 util/cache.cc create mode 100644 util/cache_test.cc create mode 100644 util/coding.cc create mode 100644 util/coding.h create mode 100644 util/coding_test.cc create mode 100644 util/comparator.cc create mode 100644 util/crc32c.cc create mode 100644 util/crc32c.h create mode 100644 util/crc32c_test.cc create mode 100644 util/env.cc create mode 100644 util/env_chromium.cc create mode 100644 util/env_posix.cc create mode 100644 util/env_test.cc create mode 100644 util/hash.cc create mode 100644 util/hash.h create mode 100644 util/histogram.cc create mode 100644 util/histogram.h create mode 100644 util/logging.cc create mode 100644 util/logging.h create mode 100644 util/mutexlock.h create mode 100644 util/options.cc create mode 100644 util/random.h create mode 100644 util/status.cc create mode 100644 util/testharness.cc create mode 100644 util/testharness.h create mode 100644 util/testutil.cc create mode 100644 util/testutil.h diff --git a/AUTHORS b/AUTHORS new 
file mode 100644
index 0000000..27a9407
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,8 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
+
+# Initial version authors:
+Jeffrey Dean <jeff@google.com>
+Sanjay Ghemawat <sanjay@google.com>
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8e80208
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7569701
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,134 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+CC = g++
+
+# Uncomment one of the following to switch between debug and opt mode
+#OPT = -O2 -DNDEBUG
+OPT = -g2
+
+CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I.
-I./include -std=c++0x $(OPT) + +LDFLAGS=-lpthread + +LIBOBJECTS = \ + ./db/builder.o \ + ./db/db_impl.o \ + ./db/db_iter.o \ + ./db/filename.o \ + ./db/dbformat.o \ + ./db/log_reader.o \ + ./db/log_writer.o \ + ./db/memtable.o \ + ./db/repair.o \ + ./db/table_cache.o \ + ./db/version_edit.o \ + ./db/version_set.o \ + ./db/write_batch.o \ + ./port/port_posix.o \ + ./port/sha1_portable.o \ + ./table/block.o \ + ./table/block_builder.o \ + ./table/format.o \ + ./table/iterator.o \ + ./table/merger.o \ + ./table/table.o \ + ./table/table_builder.o \ + ./table/two_level_iterator.o \ + ./util/arena.o \ + ./util/cache.o \ + ./util/coding.o \ + ./util/comparator.o \ + ./util/crc32c.o \ + ./util/env.o \ + ./util/env_posix.o \ + ./util/hash.o \ + ./util/histogram.o \ + ./util/logging.o \ + ./util/options.o \ + ./util/status.o + +TESTUTIL = ./util/testutil.o +TESTHARNESS = ./util/testharness.o $(TESTUTIL) + +TESTS = \ + arena_test \ + cache_test \ + coding_test \ + corruption_test \ + crc32c_test \ + db_test \ + dbformat_test \ + env_test \ + filename_test \ + log_test \ + sha1_test \ + skiplist_test \ + table_test \ + version_edit_test \ + write_batch_test + +PROGRAMS = db_bench $(TESTS) + +all: $(PROGRAMS) + +check: $(TESTS) + for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done + +clean: + rm -f $(PROGRAMS) */*.o + +db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + +arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +sha1_test: port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + +.cc.o: + $(CC) $(CFLAGS) $< -o $@ + +# 
TODO(gabor): dependencies for .o files +# TODO(gabor): Build library diff --git a/README b/README new file mode 100644 index 0000000..c97e43c --- /dev/null +++ b/README @@ -0,0 +1,51 @@ +leveldb: A key-value store +Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) + +The code under this directory implements a system for maintaining a +persistent key/value store. + +See doc/index.html for more explanation. +See doc/db_layout.txt for a brief overview of the implementation. + +The public interface is in include/*.h. Callers should not include or +rely on the details of any other header files in this package. Those +internal APIs may be changed without warning. + +Guide to header files: + +include/db.h + Main interface to the DB: Start here + +include/options.h + Control over the behavior of an entire database, and also + control over the behavior of individual reads and writes. + +include/comparator.h + Abstraction for user-specified comparison function. If you want + just bytewise comparison of keys, you can use the default comparator, + but clients can write their own comparator implementations if they + want custom ordering (e.g. to handle different character + encodings, etc.) + +include/iterator.h + Interface for iterating over data. You can get an iterator + from a DB object. + +include/write_batch.h + Interface for atomically applying multiple updates to a database. + +include/slice.h + A simple module for maintaining a pointer and a length into some + other byte array. + +include/status.h + Status is returned from many of the public interfaces and is used + to report success and various kinds of errors. + +include/env.h + Abstraction of the OS environment. A posix implementation of + this interface is in util/env_posix.cc + +include/table.h +include/table_builder.h + Lower-level modules that most clients probably won't use directly diff --git a/TODO b/TODO new file mode 100644 index 0000000..2f848b8 --- /dev/null +++ b/TODO @@ -0,0 +1,14 @@ +ss +- Stats + +db +- Maybe implement DB::BulkDeleteForRange(start_key, end_key) + that would blow away files whose ranges are entirely contained + within [start_key..end_key]? For Chrome, deletion of obsolete + object stores, etc. can be done in the background anyway, so + probably not that important. + +api changes? +- Efficient large value reading and writing + +Faster Get implementation diff --git a/db/builder.cc b/db/builder.cc new file mode 100644 index 0000000..6c8e6b8 --- /dev/null +++ b/db/builder.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
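(For illustration, to go with the README's "Start here" pointer at include/db.h earlier in this commit: a minimal sketch of the public API, opening a database, writing one key, and reading it back. The path "/tmp/testdb" is invented for the example.)

    #include <cassert>
    #include <string>
    #include "leveldb/db.h"

    int main() {
      leveldb::DB* db;
      leveldb::Options options;
      options.create_if_missing = true;
      leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
      assert(s.ok());
      s = db->Put(leveldb::WriteOptions(), "key1", "value1");
      assert(s.ok());
      std::string value;
      s = db->Get(leveldb::ReadOptions(), "key1", &value);
      assert(s.ok() && value == "value1");
      delete db;   // closes the database
      return 0;
    }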
+ +#include "db/builder.h" + +#include "db/filename.h" +#include "db/dbformat.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" + +namespace leveldb { + +Status BuildTable(const std::string& dbname, + Env* env, + const Options& options, + TableCache* table_cache, + Iterator* iter, + FileMetaData* meta, + VersionEdit* edit) { + Status s; + meta->file_size = 0; + iter->SeekToFirst(); + + std::string fname = TableFileName(dbname, meta->number); + if (iter->Valid()) { + WritableFile* file; + s = env->NewWritableFile(fname, &file); + if (!s.ok()) { + return s; + } + + TableBuilder* builder = new TableBuilder(options, file); + meta->smallest.DecodeFrom(iter->key()); + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + meta->largest.DecodeFrom(key); + if (ExtractValueType(key) == kTypeLargeValueRef) { + if (iter->value().size() != LargeValueRef::ByteSize()) { + s = Status::Corruption("invalid indirect reference hash value (L0)"); + break; + } + edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()), + meta->number, + iter->key()); + } + builder->Add(key, iter->value()); + } + + // Finish and check for builder errors + if (s.ok()) { + s = builder->Finish(); + if (s.ok()) { + meta->file_size = builder->FileSize(); + assert(meta->file_size > 0); + } + } else { + builder->Abandon(); + } + delete builder; + + // Finish and check for file errors + if (s.ok()) { + s = file->Sync(); + } + if (s.ok()) { + s = file->Close(); + } + delete file; + file = NULL; + + if (s.ok()) { + // Verify that the table is usable + Iterator* it = table_cache->NewIterator(ReadOptions(), + meta->number, + meta->file_size); + s = it->status(); + delete it; + } + } + + // Check for input iterator errors + if (!iter->status().ok()) { + s = iter->status(); + } + + if (s.ok() && meta->file_size > 0) { + edit->AddFile(0, meta->number, meta->file_size, + meta->smallest, meta->largest); + } else { + env->DeleteFile(fname); + } + return s; +} + +} diff --git a/db/builder.h b/db/builder.h new file mode 100644 index 0000000..4efcb04 --- /dev/null +++ b/db/builder.h @@ -0,0 +1,36 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_BUILDER_H_ +#define STORAGE_LEVELDB_DB_BUILDER_H_ + +#include "leveldb/status.h" + +namespace leveldb { + +struct Options; +struct FileMetaData; + +class Env; +class Iterator; +class TableCache; +class VersionEdit; + +// Build a Table file from the contents of *iter. The generated file +// will be named according to meta->number. On success, the rest of +// *meta will be filled with metadata about the generated table, and +// large value refs and the added file information will be added to +// *edit. If no data is present in *iter, meta->file_size will be set +// to zero, and no Table file will be produced. +extern Status BuildTable(const std::string& dbname, + Env* env, + const Options& options, + TableCache* table_cache, + Iterator* iter, + FileMetaData* meta, + VersionEdit* edit); + +} + +#endif // STORAGE_LEVELDB_DB_BUILDER_H_ diff --git a/db/corruption_test.cc b/db/corruption_test.cc new file mode 100644 index 0000000..63d8d8b --- /dev/null +++ b/db/corruption_test.cc @@ -0,0 +1,378 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "leveldb/write_batch.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static const int kValueSize = 1000;
+
+class CorruptionTest {
+ public:
+  test::ErrorEnv env_;
+  Random rnd_;
+  std::string dbname_;
+  Cache* tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  CorruptionTest() : rnd_(test::RandomSeed()) {
+    tiny_cache_ = NewLRUCache(100);
+    options_.env = &env_;
+    dbname_ = test::TmpDir() + "/db_test";
+    DestroyDB(dbname_, options_);
+
+    db_ = NULL;
+    options_.create_if_missing = true;
+    Reopen();
+    options_.create_if_missing = false;
+  }
+
+  ~CorruptionTest() {
+    delete db_;
+    DestroyDB(dbname_, Options());
+    delete tiny_cache_;
+  }
+
+  Status TryReopen(Options* options = NULL) {
+    delete db_;
+    db_ = NULL;
+    Options opt = (options ? *options : options_);
+    opt.env = &env_;
+    opt.block_cache = tiny_cache_;
+    return DB::Open(opt, dbname_, &db_);
+  }
+
+  void Reopen(Options* options = NULL) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void RepairDB() {
+    delete db_;
+    db_ = NULL;
+    ASSERT_OK(::leveldb::RepairDB(dbname_, options_));
+  }
+
+  void Build(int n) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = 0; i < n; i++) {
+      //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      batch.Put(key, Value(i, &value_space));
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    }
+  }
+
+  void Check(int min_expected, int max_expected) {
+    int next_expected = 0;
+    int missed = 0;
+    int bad_keys = 0;
+    int bad_values = 0;
+    int correct = 0;
+    std::string value_space;
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      uint64_t key;
+      Slice in(iter->key());
+      if (!ConsumeDecimalNumber(&in, &key) ||
+          !in.empty() ||
+          key < next_expected) {
+        bad_keys++;
+        continue;
+      }
+      missed += (key - next_expected);
+      next_expected = key + 1;
+      if (iter->value() != Value(key, &value_space)) {
+        bad_values++;
+      } else {
+        correct++;
+      }
+    }
+    delete iter;
+
+    fprintf(stderr,
+            "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
+            min_expected, max_expected, correct, bad_keys, bad_values, missed);
+    ASSERT_LE(min_expected, correct);
+    ASSERT_GE(max_expected, correct);
+  }
+
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+    // Pick file to corrupt
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+    uint64_t number;
+    LargeValueRef large_ref;
+    FileType type;
+    std::vector<std::string> candidates;
+    for (int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
+          type == filetype) {
+        candidates.push_back(dbname_ + "/" + filenames[i]);
+      }
+    }
+    ASSERT_TRUE(!candidates.empty()) << filetype;
+    std::string fname = candidates[rnd_.Uniform(candidates.size())];
+
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      const char* msg = strerror(errno);
+      ASSERT_TRUE(false) << fname << ": " << msg;
+    }
+
+    if (offset < 0) {
+      // Relative to end of file; make it absolute
+      if (-offset > sbuf.st_size) {
offset = 0; + } else { + offset = sbuf.st_size + offset; + } + } + if (offset > sbuf.st_size) { + offset = sbuf.st_size; + } + if (offset + bytes_to_corrupt > sbuf.st_size) { + bytes_to_corrupt = sbuf.st_size - offset; + } + + // Do it + std::string contents; + Status s = ReadFileToString(Env::Default(), fname, &contents); + ASSERT_TRUE(s.ok()) << s.ToString(); + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(Env::Default(), contents, fname); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + int Property(const std::string& name) { + std::string property; + int result; + if (db_->GetProperty(name, &property) && + sscanf(property.c_str(), "%d", &result) == 1) { + return result; + } else { + return -1; + } + } + + // Return the ith key + Slice Key(int i, std::string* storage) { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } +}; + +TEST(CorruptionTest, Recovery) { + Build(100); + Check(100, 100); + Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record + Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + Reopen(); + + // The 64 records in the first two log blocks are completely lost. + Check(36, 36); +} + +TEST(CorruptionTest, RecoverWriteError) { + env_.writable_file_error_ = true; + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); +} + +TEST(CorruptionTest, NewFileErrorDuringWrite) { + // Do enough writing to force minor compaction + env_.writable_file_error_ = true; + const int num = 3 + (Options().write_buffer_size / kValueSize); + std::string value_storage; + Status s; + for (int i = 0; s.ok() && i < num; i++) { + WriteBatch batch; + batch.Put("a", Value(100, &value_storage)); + s = db_->Write(WriteOptions(), &batch); + } + ASSERT_TRUE(!s.ok()); + ASSERT_GE(env_.num_writable_file_errors_, 1); + env_.writable_file_error_ = false; + Reopen(); +} + +TEST(CorruptionTest, TableFile) { + Build(100); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + dbi->TEST_CompactRange(1, "", "~"); + + Corrupt(kTableFile, 100, 1); + Check(99, 99); +} + +TEST(CorruptionTest, TableFileIndexData) { + Build(10000); // Enough to build multiple Tables + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + dbi->TEST_CompactRange(1, "", "~"); + + Corrupt(kTableFile, -2000, 500); + Reopen(); + Check(5000, 9999); +} + +TEST(CorruptionTest, MissingDescriptor) { + Build(1000); + RepairDB(); + Reopen(); + Check(1000, 1000); +} + +TEST(CorruptionTest, SequenceNumberRecovery) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v5", v); + // Write something. If sequence number was not recovered properly, + // it will be hidden by an earlier write. 
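+  // (Why a new write could be "hidden": each write is stamped with an
+  // increasing sequence number, and a read resolves a key to the entry
+  // with the largest visible sequence number. If repair recovered too
+  // small a last-sequence value, the new "v6" would be stamped below the
+  // existing "v5" entry and Get() would keep returning "v5".)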
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); +} + +TEST(CorruptionTest, LargeValueRecovery) { + Options options; + options.large_value_threshold = 10000; + Reopen(&options); + + Random rnd(301); + std::string big; + ASSERT_OK(db_->Put(WriteOptions(), + "foo", test::RandomString(&rnd, 100000, &big))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ(big, v); + + RepairDB(); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ(big, v); + + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ(big, v); +} + +TEST(CorruptionTest, CorruptedDescriptor) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + dbi->TEST_CompactRange(0, "", "~"); + + Corrupt(kDescriptorFile, 0, 1000); + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); + + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("hello", v); +} + +TEST(CorruptionTest, CompactionInputError) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Force compactions by writing lots of values + Build(10000); + Check(10000, 10000); + dbi->TEST_CompactRange(0, "", "~"); + ASSERT_EQ(0, Property("leveldb.num-files-at-level0")); +} + +TEST(CorruptionTest, CompactionInputErrorParanoid) { + Options options; + options.paranoid_checks = true; + options.write_buffer_size = 1048576; + Reopen(&options); + + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Write must eventually fail because of corrupted table + Status s; + std::string tmp1, tmp2; + for (int i = 0; i < 10000 && s.ok(); i++) { + s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); + } + ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; +} + +TEST(CorruptionTest, UnrelatedKeys) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + Corrupt(kTableFile, 100, 1); + + std::string tmp1, tmp2; + ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); + dbi->TEST_CompactMemTable(); + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/db_bench.cc b/db/db_bench.cc new file mode 100644 index 0000000..849ebfa --- /dev/null +++ b/db/db_bench.cc @@ -0,0 +1,635 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
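(For illustration: the size estimates printed by PrintHeader() in the benchmark below are simple arithmetic. Assuming the default flags defined below, num=1000000, value_size=100, compression_ratio=0.5, and 16-byte keys:

    RawSize  = (16 + 100)       * 1000000 / 1048576  ~= 110.6 MB
    FileSize = (16 + 100 * 0.5) * 1000000 / 1048576  ~=  62.9 MB
)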
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "leveldb/cache.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/write_batch.h"
+#include "port/port.h"
+#include "util/crc32c.h"
+#include "util/histogram.h"
+#include "util/random.h"
+#include "util/testutil.h"
+
+// Comma-separated list of operations to run in the specified order
+//   Actual benchmarks:
+//      fillseq      -- write N values in sequential key order in async mode
+//      fillrandom   -- write N values in random key order in async mode
+//      overwrite    -- overwrite N values in random key order in async mode
+//      fillsync     -- write N/100 values in random key order in sync mode
+//      fill100K     -- write N/1000 100K values in random order in async mode
+//      readseq      -- read N values sequentially
+//      readreverse  -- read N values in reverse order
+//      readrandom   -- read N values in random order
+//      crc32c       -- repeated crc32c of 4K of data
+//      sha1         -- repeated SHA1 computation over 4K of data
+//   Meta operations:
+//      compact      -- Compact the entire DB
+//      stats        -- Print DB stats
+//      heapprofile  -- Dump a heap profile (if supported by this port)
+static const char* FLAGS_benchmarks =
+    "fillseq,"
+    "fillsync,"
+    "fillrandom,"
+    "overwrite,"
+    "readrandom,"
+    "readrandom,"  // Extra run to allow previous compactions to quiesce
+    "readseq,"
+    "readreverse,"
+    "compact,"
+    "readrandom,"
+    "readseq,"
+    "readreverse,"
+    "fill100K,"
+    "crc32c,"
+    "sha1,"
+    "snappycomp,"
+    "snappyuncomp,"
+    ;
+
+// Number of key/values to place in database
+static int FLAGS_num = 1000000;
+
+// Size of each value
+static int FLAGS_value_size = 100;
+
+// Arrange to generate values that shrink to this fraction of
+// their original size after compression
+static double FLAGS_compression_ratio = 0.5;
+
+// Print histogram of operation timings
+static bool FLAGS_histogram = false;
+
+// Number of bytes to buffer in memtable before compacting
+// (initialized to default value by "main")
+static int FLAGS_write_buffer_size = 0;
+
+// Number of bytes to use as a cache of uncompressed data.
+// Negative means use default settings.
+static int FLAGS_cache_size = -1;
+
+namespace leveldb {
+
+// Helper for quickly generating random data.
+namespace {
+class RandomGenerator {
+ private:
+  std::string data_;
+  int pos_;
+
+ public:
+  RandomGenerator() {
+    // We use a limited amount of data over and over again and ensure
+    // that it is larger than the compression window (32KB), and also
+    // large enough to serve all typical value sizes we want to write.
+    Random rnd(301);
+    std::string piece;
+    while (data_.size() < 1048576) {
+      // Add a short fragment that is as compressible as specified
+      // by FLAGS_compression_ratio.
+ test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); + data_.append(piece); + } + pos_ = 0; + } + + Slice Generate(int len) { + if (pos_ + len > data_.size()) { + pos_ = 0; + assert(len < data_.size()); + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } +}; + +static Slice TrimSpace(Slice s) { + int start = 0; + while (start < s.size() && isspace(s[start])) { + start++; + } + int limit = s.size(); + while (limit > start && isspace(s[limit-1])) { + limit--; + } + return Slice(s.data() + start, limit - start); +} + +} + +class Benchmark { + private: + Cache* cache_; + DB* db_; + int num_; + int heap_counter_; + double start_; + double last_op_finish_; + int64_t bytes_; + std::string message_; + std::string post_message_; + Histogram hist_; + RandomGenerator gen_; + Random rand_; + + // State kept for progress messages + int done_; + int next_report_; // When to report next + + void PrintHeader() { + const int kKeySize = 16; + PrintEnvironment(); + fprintf(stdout, "Keys: %d bytes each\n", kKeySize); + fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", + FLAGS_value_size, + static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); + fprintf(stdout, "Entries: %d\n", num_); + fprintf(stdout, "RawSize: %.1f MB (estimated)\n", + ((static_cast(kKeySize + FLAGS_value_size) * num_) + / 1048576.0)); + fprintf(stdout, "FileSize: %.1f MB (estimated)\n", + (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) + / 1048576.0)); + PrintWarnings(); + fprintf(stdout, "------------------------------------------------\n"); + } + + void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" + ); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + + // See if snappy is working by attempting to compress a compressible string + const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; + std::string compressed; + if (!port::Snappy_Compress(text, sizeof(text), &compressed)) { + fprintf(stdout, "WARNING: Snappy compression is not enabled\n"); + } else if (compressed.size() >= sizeof(text)) { + fprintf(stdout, "WARNING: Snappy compression is not effective\n"); + } + } + + void PrintEnvironment() { + fprintf(stderr, "LevelDB: version %d.%d\n", + kMajorVersion, kMinorVersion); + +#if defined(__linux) + time_t now = time(NULL); + fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline + + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != NULL) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != NULL) { + const char* sep = strchr(line, ':'); + if (sep == NULL) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#endif + } + + void Start() { + start_ = Env::Default()->NowMicros() * 1e-6; + bytes_ = 0; + message_.clear(); + last_op_finish_ = start_; + hist_.Clear(); + done_ = 0; + next_report_ = 100; + } + + void FinishedSingleOp() { + if (FLAGS_histogram) { + double now = Env::Default()->NowMicros() * 1e-6; + double 
micros = (now - last_op_finish_) * 1e6;
+      hist_.Add(micros);
+      if (micros > 20000) {
+        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
+        fflush(stderr);
+      }
+      last_op_finish_ = now;
+    }
+
+    done_++;
+    if (done_ >= next_report_) {
+      if      (next_report_ < 1000)   next_report_ += 100;
+      else if (next_report_ < 5000)   next_report_ += 500;
+      else if (next_report_ < 10000)  next_report_ += 1000;
+      else if (next_report_ < 50000)  next_report_ += 5000;
+      else if (next_report_ < 100000) next_report_ += 10000;
+      else if (next_report_ < 500000) next_report_ += 50000;
+      else                            next_report_ += 100000;
+      fprintf(stderr, "... finished %d ops%30s\r", done_, "");
+      fflush(stderr);
+    }
+  }
+
+  void Stop(const Slice& name) {
+    double finish = Env::Default()->NowMicros() * 1e-6;
+
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call FinishedSingleOp().
+    if (done_ < 1) done_ = 1;
+
+    if (bytes_ > 0) {
+      char rate[100];
+      snprintf(rate, sizeof(rate), "%6.1f MB/s",
+               (bytes_ / 1048576.0) / (finish - start_));
+      if (!message_.empty()) {
+        message_ = std::string(rate) + " " + message_;
+      } else {
+        message_ = rate;
+      }
+    }
+
+    fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
+            name.ToString().c_str(),
+            (finish - start_) * 1e6 / done_,
+            (message_.empty() ? "" : " "),
+            message_.c_str());
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+
+    if (!post_message_.empty()) {
+      fprintf(stdout, "\n%s\n", post_message_.c_str());
+      post_message_.clear();
+    }
+  }
+
+ public:
+  enum Order {
+    SEQUENTIAL,
+    RANDOM
+  };
+  enum DBState {
+    FRESH,
+    EXISTING
+  };
+
+  Benchmark()
+  : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
+    db_(NULL),
+    num_(FLAGS_num),
+    heap_counter_(0),
+    bytes_(0),
+    rand_(301) {
+    std::vector<std::string> files;
+    Env::Default()->GetChildren("/tmp/dbbench", &files);
+    for (int i = 0; i < files.size(); i++) {
+      if (Slice(files[i]).starts_with("heap-")) {
+        Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
+      }
+    }
+    DestroyDB("/tmp/dbbench", Options());
+  }
+
+  ~Benchmark() {
+    delete db_;
+    delete cache_;
+  }
+
+  void Run() {
+    PrintHeader();
+    Open();
+
+    const char* benchmarks = FLAGS_benchmarks;
+    while (benchmarks != NULL) {
+      const char* sep = strchr(benchmarks, ',');
+      Slice name;
+      if (sep == NULL) {
+        name = benchmarks;
+        benchmarks = NULL;
+      } else {
+        name = Slice(benchmarks, sep - benchmarks);
+        benchmarks = sep + 1;
+      }
+
+      Start();
+
+      WriteOptions write_options;
+      bool known = true;
+      if (name == Slice("fillseq")) {
+        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
+      } else if (name == Slice("fillbatch")) {
+        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000);
+      } else if (name == Slice("fillrandom")) {
+        Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size, 1);
+      } else if (name == Slice("overwrite")) {
+        Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1);
+      } else if (name == Slice("fillsync")) {
+        write_options.sync = true;
+        Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1);
+      } else if (name == Slice("fill100K")) {
+        Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1);
+      } else if (name == Slice("readseq")) {
+        ReadSequential();
+      } else if (name == Slice("readreverse")) {
+        ReadReverse();
+      } else if (name == Slice("readrandom")) {
+        ReadRandom();
+      } else if (name == Slice("readrandomsmall")) {
+        int n = num_;
+        num_ /= 1000;
+        ReadRandom();
+        num_ = n;
+      }
else if (name == Slice("compact")) { + Compact(); + } else if (name == Slice("crc32c")) { + Crc32c(4096, "(4K per op)"); + } else if (name == Slice("sha1")) { + SHA1(4096, "(4K per op)"); + } else if (name == Slice("snappycomp")) { + SnappyCompress(); + } else if (name == Slice("snappyuncomp")) { + SnappyUncompress(); + } else if (name == Slice("heapprofile")) { + HeapProfile(); + } else if (name == Slice("stats")) { + PrintStats(); + } else { + known = false; + if (name != Slice()) { // No error message for empty name + fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); + } + } + if (known) { + Stop(name); + } + } + } + + private: + void Crc32c(int size, const char* label) { + // Checksum about 500MB of data total + std::string data(size, 'x'); + int64_t bytes = 0; + uint32_t crc = 0; + while (bytes < 500 * 1048576) { + crc = crc32c::Value(data.data(), size); + FinishedSingleOp(); + bytes += size; + } + // Print so result is not dead + fprintf(stderr, "... crc=0x%x\r", static_cast(crc)); + + bytes_ = bytes; + message_ = label; + } + + void SHA1(int size, const char* label) { + // SHA1 about 100MB of data total + std::string data(size, 'x'); + int64_t bytes = 0; + char sha1[20]; + while (bytes < 100 * 1048576) { + port::SHA1_Hash(data.data(), size, sha1); + FinishedSingleOp(); + bytes += size; + } + + // Print so result is not dead + fprintf(stderr, "... sha1=%02x...\r", static_cast(sha1[0])); + + bytes_ = bytes; + message_ = label; + } + + void SnappyCompress() { + Slice input = gen_.Generate(Options().block_size); + int64_t bytes = 0; + int64_t produced = 0; + bool ok = true; + std::string compressed; + while (ok && bytes < 1024 * 1048576) { // Compress 1G + ok = port::Snappy_Compress(input.data(), input.size(), &compressed); + produced += compressed.size(); + bytes += input.size(); + FinishedSingleOp(); + } + + if (!ok) { + message_ = "(snappy failure)"; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "(output: %.1f%%)", + (produced * 100.0) / bytes); + message_ = buf; + bytes_ = bytes; + } + } + + void SnappyUncompress() { + Slice input = gen_.Generate(Options().block_size); + std::string compressed; + bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed); + int64_t bytes = 0; + std::string uncompressed; + while (ok && bytes < 1024 * 1048576) { // Compress 1G + ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), + &uncompressed); + bytes += uncompressed.size(); + FinishedSingleOp(); + } + + if (!ok) { + message_ = "(snappy failure)"; + } else { + bytes_ = bytes; + } + } + + void Open() { + assert(db_ == NULL); + Options options; + options.create_if_missing = true; + options.block_cache = cache_; + options.write_buffer_size = FLAGS_write_buffer_size; + Status s = DB::Open(options, "/tmp/dbbench", &db_); + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + void Write(const WriteOptions& options, Order order, DBState state, + int num_entries, int value_size, int entries_per_batch) { + if (state == FRESH) { + delete db_; + db_ = NULL; + DestroyDB("/tmp/dbbench", Options()); + Open(); + Start(); // Do not count time taken to destroy/open + } + + if (num_entries != num_) { + char msg[100]; + snprintf(msg, sizeof(msg), "(%d ops)", num_entries); + message_ = msg; + } + + WriteBatch batch; + Status s; + std::string val; + for (int i = 0; i < num_entries; i += entries_per_batch) { + batch.Clear(); + for (int j = 0; j < entries_per_batch; j++) { + const int k = (order == SEQUENTIAL) ? 
i+j : (rand_.Next() % FLAGS_num); + char key[100]; + snprintf(key, sizeof(key), "%016d", k); + batch.Put(key, gen_.Generate(value_size)); + bytes_ += value_size + strlen(key); + FinishedSingleOp(); + } + s = db_->Write(options, &batch); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + } + } + + void ReadSequential() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadReverse() { + Iterator* iter = db_->NewIterator(ReadOptions()); + int i = 0; + for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { + bytes_ += iter->key().size() + iter->value().size(); + FinishedSingleOp(); + ++i; + } + delete iter; + } + + void ReadRandom() { + ReadOptions options; + std::string value; + for (int i = 0; i < num_; i++) { + char key[100]; + const int k = rand_.Next() % FLAGS_num; + snprintf(key, sizeof(key), "%016d", k); + db_->Get(options, key, &value); + FinishedSingleOp(); + } + } + + void Compact() { + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_CompactMemTable(); + int max_level_with_files = 1; + for (int level = 1; level < config::kNumLevels; level++) { + std::string property; + char name[100]; + snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); + if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) { + max_level_with_files = level; + } + } + for (int level = 0; level < max_level_with_files; level++) { + dbi->TEST_CompactRange(level, "", "~"); + } + } + + void PrintStats() { + std::string stats; + if (!db_->GetProperty("leveldb.stats", &stats)) { + message_ = "(failed)"; + } else { + post_message_ = stats; + } + } + + static void WriteToFile(void* arg, const char* buf, int n) { + reinterpret_cast(arg)->Append(Slice(buf, n)); + } + + void HeapProfile() { + char fname[100]; + snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_); + WritableFile* file; + Status s = Env::Default()->NewWritableFile(fname, &file); + if (!s.ok()) { + message_ = s.ToString(); + return; + } + bool ok = port::GetHeapProfile(WriteToFile, file); + delete file; + if (!ok) { + message_ = "not supported"; + Env::Default()->DeleteFile(fname); + } + } +}; + +} + +int main(int argc, char** argv) { + FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; + for (int i = 1; i < argc; i++) { + double d; + int n; + char junk; + if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { + FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); + } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { + FLAGS_compression_ratio = d; + } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_histogram = n; + } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { + FLAGS_num = n; + } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { + FLAGS_value_size = n; + } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { + FLAGS_write_buffer_size = n; + } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { + FLAGS_cache_size = n; + } else { + fprintf(stderr, "Invalid flag '%s'\n", argv[i]); + exit(1); + } + } + + leveldb::Benchmark benchmark; + benchmark.Run(); + return 0; +} diff --git a/db/db_impl.cc b/db/db_impl.cc new file mode 100644 index 0000000..d012236 --- /dev/null +++ b/db/db_impl.cc @@ -0,0 +1,1345 @@ +// 
Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/status.h"
+#include "leveldb/table.h"
+#include "leveldb/table_builder.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+struct DBImpl::CompactionState {
+  Compaction* const compaction;
+
+  // Sequence numbers < smallest_snapshot are not significant since we
+  // will never have to service a snapshot below smallest_snapshot.
+  // Therefore if we have seen a sequence number S <= smallest_snapshot,
+  // we can drop all entries for the same key with sequence numbers < S.
+  SequenceNumber smallest_snapshot;
+
+  // Files produced by compaction
+  struct Output {
+    uint64_t number;
+    uint64_t file_size;
+    InternalKey smallest, largest;
+  };
+  std::vector<Output> outputs;
+
+  // State kept for output being generated
+  WritableFile* outfile;
+  TableBuilder* builder;
+
+  uint64_t total_bytes;
+
+  Output* current_output() { return &outputs[outputs.size()-1]; }
+
+  explicit CompactionState(Compaction* c)
+      : compaction(c),
+        outfile(NULL),
+        builder(NULL),
+        total_bytes(0) {
+  }
+};
+
+namespace {
+class NullWritableFile : public WritableFile {
+ public:
+  virtual Status Append(const Slice& data) { return Status::OK(); }
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+};
+}
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (*ptr > maxvalue) *ptr = maxvalue;
+  if (*ptr < minvalue) *ptr = minvalue;
+}
+Options SanitizeOptions(const std::string& dbname,
+                        const InternalKeyComparator* icmp,
+                        const Options& src) {
+  Options result = src;
+  result.comparator = icmp;
+  ClipToRange(&result.max_open_files, 20, 50000);
+  ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
+  ClipToRange(&result.large_value_threshold, 16<<10, 1<<30);
+  ClipToRange(&result.block_size, 1<<10, 4<<20);
+  if (result.info_log == NULL) {
+    // Open a log file in the same directory as the db
+    src.env->CreateDir(dbname);  // In case it does not exist
+    src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
+    Status s = src.env->NewWritableFile(InfoLogFileName(dbname),
+                                        &result.info_log);
+    if (!s.ok()) {
+      // No place suitable for logging
+      result.info_log = new NullWritableFile;
+    }
+  }
+  if (result.block_cache == NULL) {
+    result.block_cache = NewLRUCache(8 << 20);
+  }
+  return result;
+}
+
+DBImpl::DBImpl(const Options& options, const std::string& dbname)
+    : env_(options.env),
+      internal_comparator_(options.comparator),
+      options_(SanitizeOptions(dbname, &internal_comparator_, options)),
+      owns_info_log_(options_.info_log != options.info_log),
+      owns_cache_(options_.block_cache != options.block_cache),
+      dbname_(dbname),
+      db_lock_(NULL),
+      shutting_down_(NULL),
+      bg_cv_(&mutex_),
+      compacting_cv_(&mutex_),
+      mem_(new MemTable(internal_comparator_)),
+      imm_(NULL),
+      logfile_(NULL),
+      log_(NULL),
+      bg_compaction_scheduled_(false),
+      compacting_(false) {
+  has_imm_.Release_Store(NULL);
+
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  const int table_cache_size = options.max_open_files - 10;
+  table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
+
+  versions_ = new VersionSet(dbname_, &options_, table_cache_,
+                             &internal_comparator_);
+}
+
+DBImpl::~DBImpl() {
+  // Wait for background work to finish
+  mutex_.Lock();
+  shutting_down_.Release_Store(this);  // Any non-NULL value is ok
+  if (bg_compaction_scheduled_) {
+    while (bg_compaction_scheduled_) {
+      bg_cv_.Wait();
+    }
+  }
+  mutex_.Unlock();
+
+  if (db_lock_ != NULL) {
+    env_->UnlockFile(db_lock_);
+  }
+
+  delete versions_;
+  delete mem_;
+  delete imm_;
+  delete log_;
+  delete logfile_;
+  delete table_cache_;
+
+  if (owns_info_log_) {
+    delete options_.info_log;
+  }
+  if (owns_cache_) {
+    delete options_.block_cache;
+  }
+}
+
+Status DBImpl::NewDB() {
+  VersionEdit new_db;
+  new_db.SetComparatorName(user_comparator()->Name());
+  new_db.SetLogNumber(0);
+  new_db.SetNextFile(2);
+  new_db.SetLastSequence(0);
+
+  const std::string manifest = DescriptorFileName(dbname_, 1);
+  WritableFile* file;
+  Status s = env_->NewWritableFile(manifest, &file);
+  if (!s.ok()) {
+    return s;
+  }
+  {
+    log::Writer log(file);
+    std::string record;
+    new_db.EncodeTo(&record);
+    s = log.AddRecord(record);
+    if (s.ok()) {
+      s = file->Close();
+    }
+  }
+  delete file;
+  if (s.ok()) {
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1);
+  } else {
+    env_->DeleteFile(manifest);
+  }
+  return s;
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  if (s->ok() || options_.paranoid_checks) {
+    // No change needed
+  } else {
+    Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str());
+    *s = Status::OK();
+  }
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+  // Make a set of all of the live files
+  std::set<uint64_t> live = pending_outputs_;
+  versions_->AddLiveFiles(&live);
+
+  versions_->CleanupLargeValueRefs(live);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(dbname_, &filenames);  // Ignoring errors on purpose
+  uint64_t number;
+  LargeValueRef large_ref;
+  FileType type;
+  for (int i = 0; i < filenames.size(); i++) {
+    if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
+      bool keep = true;
+      switch (type) {
+        case kLogFile:
+          keep = ((number == versions_->LogNumber()) ||
+                  (number == versions_->PrevLogNumber()));
+          break;
+        case kDescriptorFile:
+          // Keep my manifest file, and any newer incarnations'
+          // (in case there is a race that allows other incarnations)
+          keep = (number >= versions_->ManifestFileNumber());
+          break;
+        case kTableFile:
+          keep = (live.find(number) != live.end());
+          break;
+        case kTempFile:
+          // Any temp files that are currently being written to must
+          // be recorded in pending_outputs_, which is inserted into "live"
+          keep = (live.find(number) != live.end());
+          break;
+        case kLargeValueFile:
+          keep = versions_->LargeValueIsLive(large_ref);
+          break;
+        case kCurrentFile:
+        case kDBLockFile:
+        case kInfoLogFile:
+          keep = true;
+          break;
+      }
+
+      if (!keep) {
+        if (type == kTableFile) {
+          table_cache_->Evict(number);
+        }
+        Log(env_, options_.info_log, "Delete type=%d #%lld\n",
+            int(type),
+            static_cast<unsigned long long>(number));
+        env_->DeleteFile(dbname_ + "/" + filenames[i]);
+ } + } + } +} + +Status DBImpl::Recover(VersionEdit* edit) { + mutex_.AssertHeld(); + + // Ignore error from CreateDir since the creation of the DB is + // committed only when the descriptor is created, and this directory + // may already exist from a previous failed creation attempt. + env_->CreateDir(dbname_); + assert(db_lock_ == NULL); + Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!s.ok()) { + return s; + } + + if (!env_->FileExists(CurrentFileName(dbname_))) { + if (options_.create_if_missing) { + s = NewDB(); + if (!s.ok()) { + return s; + } + } else { + return Status::InvalidArgument( + dbname_, "does not exist (create_if_missing is false)"); + } + } else { + if (options_.error_if_exists) { + return Status::InvalidArgument( + dbname_, "exists (error_if_exists is true)"); + } + } + + s = versions_->Recover(); + if (s.ok()) { + // Recover from the log files named in the descriptor + SequenceNumber max_sequence(0); + if (versions_->PrevLogNumber() != 0) { // log#==0 means no prev log + s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence); + } + if (s.ok() && versions_->LogNumber() != 0) { // log#==0 for initial state + s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence); + } + if (s.ok()) { + if (versions_->LastSequence() < max_sequence) { + versions_->SetLastSequence(max_sequence); + } + } + } + + return s; +} + +Status DBImpl::RecoverLogFile(uint64_t log_number, + VersionEdit* edit, + SequenceNumber* max_sequence) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + WritableFile* info_log; + const char* fname; + Status* status; // NULL if options_.paranoid_checks==false + virtual void Corruption(size_t bytes, const Status& s) { + Log(env, info_log, "%s%s: dropping %d bytes; %s", + (this->status == NULL ? "(ignoring error) " : ""), + fname, static_cast(bytes), s.ToString().c_str()); + if (this->status != NULL && this->status->ok()) *this->status = s; + } + }; + + mutex_.AssertHeld(); + + // Open the log file + std::string fname = LogFileName(dbname_, log_number); + SequentialFile* file; + Status status = env_->NewSequentialFile(fname, &file); + if (!status.ok()) { + MaybeIgnoreError(&status); + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log; + reporter.fname = fname.c_str(); + reporter.status = (options_.paranoid_checks ? &status : NULL); + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). 
+ log::Reader reader(file, &reporter, true/*checksum*/); + Log(env_, options_.info_log, "Recovering log #%llu", + (unsigned long long) log_number); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + MemTable* mem = NULL; + while (reader.ReadRecord(&record, &scratch) && + status.ok()) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + + if (mem == NULL) { + mem = new MemTable(internal_comparator_); + } + status = WriteBatchInternal::InsertInto(&batch, mem); + MaybeIgnoreError(&status); + if (!status.ok()) { + break; + } + const SequenceNumber last_seq = + WriteBatchInternal::Sequence(&batch) + + WriteBatchInternal::Count(&batch) - 1; + if (last_seq > *max_sequence) { + *max_sequence = last_seq; + } + + if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { + status = WriteLevel0Table(mem, edit); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + break; + } + delete mem; + mem = NULL; + } + } + + if (status.ok() && mem != NULL) { + status = WriteLevel0Table(mem, edit); + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + } + + delete mem; + delete file; + return status; +} + +Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { + mutex_.AssertHeld(); + const uint64_t start_micros = env_->NowMicros(); + FileMetaData meta; + meta.number = versions_->NewFileNumber(); + pending_outputs_.insert(meta.number); + Iterator* iter = mem->NewIterator(); + Log(env_, options_.info_log, "Level-0 table #%llu: started", + (unsigned long long) meta.number); + + Status s; + { + mutex_.Unlock(); + s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit); + mutex_.Lock(); + } + + Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s", + (unsigned long long) meta.number, + (unsigned long long) meta.file_size, + s.ToString().c_str()); + delete iter; + pending_outputs_.erase(meta.number); + + CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros; + stats.bytes_written = meta.file_size; + stats_[0].Add(stats); + return s; +} + +Status DBImpl::CompactMemTable() { + mutex_.AssertHeld(); + assert(imm_ != NULL); + assert(compacting_); + + // Save the contents of the memtable as a new Table + VersionEdit edit; + Status s = WriteLevel0Table(imm_, &edit); + + // Replace immutable memtable with the generated Table + if (s.ok()) { + edit.SetPrevLogNumber(0); + s = versions_->LogAndApply(&edit, imm_); + } + + if (s.ok()) { + // Commit to the new state + imm_ = NULL; + has_imm_.Release_Store(NULL); + DeleteObsoleteFiles(); + } + + compacting_cv_.SignalAll(); // Wake up waiter even if there was an error + return s; +} + +void DBImpl::TEST_CompactRange( + int level, + const std::string& begin, + const std::string& end) { + MutexLock l(&mutex_); + while (compacting_) { + compacting_cv_.Wait(); + } + Compaction* c = versions_->CompactRange( + level, + InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek), + InternalKey(end, 0, static_cast(0))); + + if (c != NULL) { + CompactionState* compact = new CompactionState(c); + DoCompactionWork(compact); // Ignore error in test compaction + CleanupCompaction(compact); + } + + // Start any background compaction that may have been delayed by this thread + MaybeScheduleCompaction(); +} + 
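// Aside: a minimal, self-contained sketch (not part of this patch) of the
// sequence-number bookkeeping in the recovery loop above. Each batch replayed
// from the log covers the consecutive range
// [Sequence(batch), Sequence(batch) + Count(batch) - 1], and recovery tracks
// the maximum such value as *max_sequence. All names here are hypothetical.
#include <cassert>
#include <cstdint>

typedef uint64_t SequenceNumber;

// Highest sequence number consumed by a batch of `count` ops starting at `base`.
static SequenceNumber LastSequenceOfBatch(SequenceNumber base, uint64_t count) {
  return base + count - 1;
}

int main() {
  SequenceNumber max_sequence = 0;
  const SequenceNumber batches[][2] = { {10, 3}, {13, 2} };  // {base, count}
  for (int i = 0; i < 2; i++) {
    SequenceNumber last = LastSequenceOfBatch(batches[i][0], batches[i][1]);
    if (last > max_sequence) max_sequence = last;
  }
  assert(max_sequence == 14);  // 13 + 2 - 1
  return 0;
}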
+Status DBImpl::TEST_CompactMemTable() {
+  MutexLock l(&mutex_);
+  Status s = MakeRoomForWrite(true /* force compaction */);
+  if (s.ok()) {
+    // Wait until the compaction completes
+    while (imm_ != NULL && bg_error_.ok()) {
+      compacting_cv_.Wait();
+    }
+    if (imm_ != NULL) {
+      s = bg_error_;
+    }
+  }
+  return s;
+}
+
+void DBImpl::MaybeScheduleCompaction() {
+  mutex_.AssertHeld();
+  if (bg_compaction_scheduled_) {
+    // Already scheduled
+  } else if (compacting_) {
+    // Some other thread is running a compaction.  Do not conflict with it.
+  } else if (shutting_down_.Acquire_Load()) {
+    // DB is being deleted; no more background compactions
+  } else if (imm_ == NULL && !versions_->NeedsCompaction()) {
+    // No work to be done
+  } else {
+    bg_compaction_scheduled_ = true;
+    env_->Schedule(&DBImpl::BGWork, this);
+  }
+}
+
+void DBImpl::BGWork(void* db) {
+  reinterpret_cast<DBImpl*>(db)->BackgroundCall();
+}
+
+void DBImpl::BackgroundCall() {
+  MutexLock l(&mutex_);
+  assert(bg_compaction_scheduled_);
+  if (!shutting_down_.Acquire_Load() &&
+      !compacting_) {
+    BackgroundCompaction();
+  }
+  bg_compaction_scheduled_ = false;
+  bg_cv_.SignalAll();
+
+  // Previous compaction may have produced too many files in a level,
+  // so reschedule another compaction if needed.
+  MaybeScheduleCompaction();
+}
+
+void DBImpl::BackgroundCompaction() {
+  mutex_.AssertHeld();
+  assert(!compacting_);
+
+  if (imm_ != NULL) {
+    compacting_ = true;
+    CompactMemTable();
+    compacting_ = false;
+    compacting_cv_.SignalAll();
+    return;
+  }
+
+  Compaction* c = versions_->PickCompaction();
+  if (c == NULL) {
+    // Nothing to do
+    return;
+  }
+
+  Status status;
+  if (c->IsTrivialMove()) {
+    // Move file to next level
+    assert(c->num_input_files(0) == 1);
+    FileMetaData* f = c->input(0, 0);
+    c->edit()->DeleteFile(c->level(), f->number);
+    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
+                       f->smallest, f->largest);
+    status = versions_->LogAndApply(c->edit(), NULL);
+    Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n",
+        static_cast<unsigned long long>(f->number),
+        c->level() + 1,
+        static_cast<unsigned long long>(f->file_size),
+        status.ToString().c_str());
+  } else {
+    CompactionState* compact = new CompactionState(c);
+    status = DoCompactionWork(compact);
+    CleanupCompaction(compact);
+  }
+  delete c;
+
+  if (status.ok()) {
+    // Done
+  } else if (shutting_down_.Acquire_Load()) {
+    // Ignore compaction errors found during shutting down
+  } else {
+    Log(env_, options_.info_log,
+        "Compaction error: %s", status.ToString().c_str());
+    if (options_.paranoid_checks && bg_error_.ok()) {
+      bg_error_ = status;
+    }
+  }
+}
+
+void DBImpl::CleanupCompaction(CompactionState* compact) {
+  mutex_.AssertHeld();
+  if (compact->builder != NULL) {
+    // May happen if we get a shutdown call in the middle of compaction
+    compact->builder->Abandon();
+    delete compact->builder;
+  } else {
+    assert(compact->outfile == NULL);
+  }
+  delete compact->outfile;
+  for (int i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    pending_outputs_.erase(out.number);
+  }
+  delete compact;
+}
+
+Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+  assert(compact != NULL);
+  assert(compact->builder == NULL);
+  uint64_t file_number;
+  {
+    mutex_.Lock();
+    file_number = versions_->NewFileNumber();
+    pending_outputs_.insert(file_number);
+    CompactionState::Output out;
+    out.number = file_number;
+    out.smallest.Clear();
+    out.largest.Clear();
+    compact->outputs.push_back(out);
+    mutex_.Unlock();
+  }
+
+  // Make the output
file + std::string fname = TableFileName(dbname_, file_number); + Status s = env_->NewWritableFile(fname, &compact->outfile); + if (s.ok()) { + compact->builder = new TableBuilder(options_, compact->outfile); + } + return s; +} + +Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, + Iterator* input) { + assert(compact != NULL); + assert(compact->outfile != NULL); + assert(compact->builder != NULL); + + const uint64_t output_number = compact->current_output()->number; + assert(output_number != 0); + + // Check for iterator errors + Status s = input->status(); + const uint64_t current_entries = compact->builder->NumEntries(); + if (s.ok()) { + s = compact->builder->Finish(); + } else { + compact->builder->Abandon(); + } + const uint64_t current_bytes = compact->builder->FileSize(); + compact->current_output()->file_size = current_bytes; + compact->total_bytes += current_bytes; + delete compact->builder; + compact->builder = NULL; + + // Finish and check for file errors + if (s.ok()) { + s = compact->outfile->Sync(); + } + if (s.ok()) { + s = compact->outfile->Close(); + } + delete compact->outfile; + compact->outfile = NULL; + + if (s.ok() && current_entries > 0) { + // Verify that the table is usable + Iterator* iter = table_cache_->NewIterator(ReadOptions(), + output_number, + current_bytes); + s = iter->status(); + delete iter; + if (s.ok()) { + Log(env_, options_.info_log, + "Generated table #%llu: %lld keys, %lld bytes", + (unsigned long long) output_number, + (unsigned long long) current_entries, + (unsigned long long) current_bytes); + } + } + return s; +} + + +Status DBImpl::InstallCompactionResults(CompactionState* compact) { + mutex_.AssertHeld(); + Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->level() + 1, + static_cast(compact->total_bytes)); + + // Add compaction outputs + compact->compaction->AddInputDeletions(compact->compaction->edit()); + const int level = compact->compaction->level(); + for (int i = 0; i < compact->outputs.size(); i++) { + const CompactionState::Output& out = compact->outputs[i]; + compact->compaction->edit()->AddFile( + level + 1, + out.number, out.file_size, out.smallest, out.largest); + pending_outputs_.erase(out.number); + } + compact->outputs.clear(); + + Status s = versions_->LogAndApply(compact->compaction->edit(), NULL); + if (s.ok()) { + compact->compaction->ReleaseInputs(); + DeleteObsoleteFiles(); + } else { + // Discard any files we may have created during this failed compaction + for (int i = 0; i < compact->outputs.size(); i++) { + env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number)); + } + } + return s; +} + +Status DBImpl::DoCompactionWork(CompactionState* compact) { + const uint64_t start_micros = env_->NowMicros(); + int64_t imm_micros = 0; // Micros spent doing imm_ compactions + + Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->level() + 1); + + assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); + assert(compact->builder == NULL); + assert(compact->outfile == NULL); + if (snapshots_.empty()) { + compact->smallest_snapshot = versions_->LastSequence(); + } else { + compact->smallest_snapshot = snapshots_.oldest()->number_; + } + + // Release mutex while we're actually 
doing the compaction work + compacting_ = true; + mutex_.Unlock(); + + Iterator* input = versions_->MakeInputIterator(compact->compaction); + input->SeekToFirst(); + Status status; + ParsedInternalKey ikey; + std::string current_user_key; + bool has_current_user_key = false; + SequenceNumber last_sequence_for_key = kMaxSequenceNumber; + for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { + // Prioritize immutable compaction work + if (has_imm_.NoBarrier_Load() != NULL) { + const uint64_t imm_start = env_->NowMicros(); + mutex_.Lock(); + if (imm_ != NULL) { + CompactMemTable(); + compacting_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary + } + mutex_.Unlock(); + imm_micros += (env_->NowMicros() - imm_start); + } + + Slice key = input->key(); + InternalKey tmp_internal_key; + tmp_internal_key.DecodeFrom(key); + if (compact->compaction->ShouldStopBefore(tmp_internal_key) && + compact->builder != NULL) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + + // Handle key/value, add to state, etc. + bool drop = false; + if (!ParseInternalKey(key, &ikey)) { + // Do not hide error keys + current_user_key.clear(); + has_current_user_key = false; + last_sequence_for_key = kMaxSequenceNumber; + } else { + if (!has_current_user_key || + user_comparator()->Compare(ikey.user_key, + Slice(current_user_key)) != 0) { + // First occurrence of this user key + current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); + has_current_user_key = true; + last_sequence_for_key = kMaxSequenceNumber; + } + + if (last_sequence_for_key <= compact->smallest_snapshot) { + // Hidden by an newer entry for same user key + drop = true; // (A) + } else if (ikey.type == kTypeDeletion && + ikey.sequence <= compact->smallest_snapshot && + compact->compaction->IsBaseLevelForKey(ikey.user_key)) { + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. 
+ drop = true; + } + + last_sequence_for_key = ikey.sequence; + } +#if 0 + Log(env_, options_.info_log, + " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " + "%d smallest_snapshot: %d", + ikey.user_key.ToString().c_str(), + (int)ikey.sequence, ikey.type, kTypeLargeValueRef, drop, + compact->compaction->IsBaseLevelForKey(ikey.user_key), + (int)last_sequence_for_key, (int)compact->smallest_snapshot); +#endif + + if (!drop) { + // Open output file if necessary + if (compact->builder == NULL) { + status = OpenCompactionOutputFile(compact); + if (!status.ok()) { + break; + } + } + if (compact->builder->NumEntries() == 0) { + compact->current_output()->smallest.DecodeFrom(key); + } + compact->current_output()->largest.DecodeFrom(key); + + if (ikey.type == kTypeLargeValueRef) { + if (input->value().size() != LargeValueRef::ByteSize()) { + if (options_.paranoid_checks) { + status = Status::Corruption("invalid large value ref"); + break; + } else { + Log(env_, options_.info_log, + "compaction found invalid large value ref"); + } + } else { + compact->compaction->edit()->AddLargeValueRef( + LargeValueRef::FromRef(input->value()), + compact->current_output()->number, + input->key()); + compact->builder->Add(key, input->value()); + } + } else { + compact->builder->Add(key, input->value()); + } + + // Close output file if it is big enough + if (compact->builder->FileSize() >= + compact->compaction->MaxOutputFileSize()) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + } + + input->Next(); + } + + if (status.ok() && shutting_down_.Acquire_Load()) { + status = Status::IOError("Deleting DB during compaction"); + } + if (status.ok() && compact->builder != NULL) { + status = FinishCompactionOutputFile(compact, input); + } + if (status.ok()) { + status = input->status(); + } + delete input; + input = NULL; + + CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros - imm_micros; + for (int which = 0; which < 2; which++) { + for (int i = 0; i < compact->compaction->num_input_files(which); i++) { + stats.bytes_read += compact->compaction->input(which, i)->file_size; + } + } + for (int i = 0; i < compact->outputs.size(); i++) { + stats.bytes_written += compact->outputs[i].file_size; + } + + mutex_.Lock(); + stats_[compact->compaction->level() + 1].Add(stats); + + if (status.ok()) { + status = InstallCompactionResults(compact); + } + compacting_ = false; + compacting_cv_.SignalAll(); + return status; +} + +Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, + SequenceNumber* latest_snapshot) { + mutex_.Lock(); + *latest_snapshot = versions_->LastSequence(); + + // Collect together all needed child iterators + std::vector list; + list.push_back(mem_->NewIterator()); + if (imm_ != NULL) { + list.push_back(imm_->NewIterator()); + } + versions_->current()->AddIterators(options, &list); + Iterator* internal_iter = + NewMergingIterator(&internal_comparator_, &list[0], list.size()); + versions_->current()->Ref(); + internal_iter->RegisterCleanup(&DBImpl::Unref, this, versions_->current()); + + mutex_.Unlock(); + return internal_iter; +} + +Iterator* DBImpl::TEST_NewInternalIterator() { + SequenceNumber ignored; + return NewInternalIterator(ReadOptions(), &ignored); +} + +int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { + MutexLock l(&mutex_); + return versions_->MaxNextLevelOverlappingBytes(); +} + +Status DBImpl::Get(const ReadOptions& options, + const Slice& key, + std::string* value) { + // TODO(opt): faster implementation 
+  Iterator* iter = NewIterator(options);
+  iter->Seek(key);
+  bool found = false;
+  if (iter->Valid() && user_comparator()->Compare(key, iter->key()) == 0) {
+    Slice v = iter->value();
+    value->assign(v.data(), v.size());
+    found = true;
+  }
+  // Non-OK iterator status trumps everything else
+  Status result = iter->status();
+  if (result.ok() && !found) {
+    result = Status::NotFound(Slice());  // Use an empty error message for speed
+  }
+  delete iter;
+  return result;
+}
+
+Iterator* DBImpl::NewIterator(const ReadOptions& options) {
+  SequenceNumber latest_snapshot;
+  Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
+  SequenceNumber sequence =
+      (options.snapshot ? options.snapshot->number_ : latest_snapshot);
+  return NewDBIterator(&dbname_, env_,
+                       user_comparator(), internal_iter, sequence);
+}
+
+void DBImpl::Unref(void* arg1, void* arg2) {
+  DBImpl* impl = reinterpret_cast<DBImpl*>(arg1);
+  Version* v = reinterpret_cast<Version*>(arg2);
+  MutexLock l(&impl->mutex_);
+  v->Unref();
+}
+
+const Snapshot* DBImpl::GetSnapshot() {
+  MutexLock l(&mutex_);
+  return snapshots_.New(versions_->LastSequence());
+}
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  MutexLock l(&mutex_);
+  snapshots_.Delete(s);
+}
+
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+  return DB::Put(o, key, val);
+}
+
+Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+  return DB::Delete(options, key);
+}
+
+Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
+  Status status;
+
+  WriteBatch* final = NULL;
+  {
+    MutexLock l(&mutex_);
+    status = MakeRoomForWrite(false);  // May temporarily release lock and wait
+
+    uint64_t last_sequence = versions_->LastSequence();
+    if (status.ok()) {
+      status = HandleLargeValues(last_sequence + 1, updates, &final);
+    }
+    if (status.ok()) {
+      WriteBatchInternal::SetSequence(final, last_sequence + 1);
+      last_sequence += WriteBatchInternal::Count(final);
+      versions_->SetLastSequence(last_sequence);
+
+      // Add to log and apply to memtable
+      status = log_->AddRecord(WriteBatchInternal::Contents(final));
+      if (status.ok() && options.sync) {
+        status = logfile_->Sync();
+      }
+      if (status.ok()) {
+        status = WriteBatchInternal::InsertInto(final, mem_);
+      }
+    }
+
+    if (options.post_write_snapshot != NULL) {
+      *options.post_write_snapshot =
+          status.ok() ? snapshots_.New(last_sequence) : NULL;
+    }
+  }
+  if (final != updates) {
+    delete final;
+  }
+
+  return status;
+}
+
+Status DBImpl::MakeRoomForWrite(bool force) {
+  mutex_.AssertHeld();
+  Status s;
+  while (true) {
+    if (!bg_error_.ok()) {
+      // Yield previous error
+      s = bg_error_;
+      break;
+    } else if (!force &&
+               (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
+      // There is room in current memtable
+      break;
+    } else if (imm_ != NULL) {
+      // We have filled up the current memtable, but the previous
+      // one is still being compacted, so we wait.
+ compacting_cv_.Wait(); + } else { + // Attempt to switch to a new memtable and trigger compaction of old + assert(versions_->PrevLogNumber() == 0); + uint64_t new_log_number = versions_->NewFileNumber(); + WritableFile* lfile = NULL; + s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); + if (!s.ok()) { + break; + } + VersionEdit edit; + edit.SetPrevLogNumber(versions_->LogNumber()); + edit.SetLogNumber(new_log_number); + s = versions_->LogAndApply(&edit, NULL); + if (!s.ok()) { + delete lfile; + env_->DeleteFile(LogFileName(dbname_, new_log_number)); + break; + } + delete log_; + delete logfile_; + logfile_ = lfile; + log_ = new log::Writer(lfile); + imm_ = mem_; + has_imm_.Release_Store(imm_); + mem_ = new MemTable(internal_comparator_); + force = false; // Do not force another compaction if have room + MaybeScheduleCompaction(); + } + } + return s; +} + +bool DBImpl::HasLargeValues(const WriteBatch& batch) const { + if (WriteBatchInternal::ByteSize(&batch) >= options_.large_value_threshold) { + for (WriteBatchInternal::Iterator it(batch); !it.Done(); it.Next()) { + if (it.op() == kTypeValue && + it.value().size() >= options_.large_value_threshold) { + return true; + } + } + } + return false; +} + +// Given "raw_value", determines the appropriate compression format to use +// and stores the data that should be written to the large value file in +// "*file_bytes", and sets "*ref" to the appropriate large value reference. +// May use "*scratch" as backing store for "*file_bytes". +void DBImpl::MaybeCompressLargeValue( + const Slice& raw_value, + Slice* file_bytes, + std::string* scratch, + LargeValueRef* ref) { + switch (options_.compression) { + case kSnappyCompression: { + if (port::Snappy_Compress(raw_value.data(), raw_value.size(), scratch) && + (scratch->size() < (raw_value.size() / 8) * 7)) { + *file_bytes = *scratch; + *ref = LargeValueRef::Make(raw_value, kSnappyCompression); + return; + } + + // Less than 12.5% compression: just leave as uncompressed data + break; + } + case kNoCompression: + // Use default code outside of switch + break; + } + // Store as uncompressed data + *file_bytes = raw_value; + *ref = LargeValueRef::Make(raw_value, kNoCompression); +} + +Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq, + WriteBatch* updates, + WriteBatch** final) { + if (!HasLargeValues(*updates)) { + // Fast path: no large values found + *final = updates; + } else { + // Copy *updates to a new WriteBatch, replacing the references to + *final = new WriteBatch; + SequenceNumber seq = assigned_seq; + for (WriteBatchInternal::Iterator it(*updates); !it.Done(); it.Next()) { + switch (it.op()) { + case kTypeValue: + if (it.value().size() < options_.large_value_threshold) { + (*final)->Put(it.key(), it.value()); + } else { + std::string scratch; + Slice file_bytes; + LargeValueRef large_ref; + MaybeCompressLargeValue( + it.value(), &file_bytes, &scratch, &large_ref); + InternalKey ikey(it.key(), seq, kTypeLargeValueRef); + if (versions_->RegisterLargeValueRef( + large_ref, versions_->LogNumber(), ikey)) { + // TODO(opt): avoid holding the lock here (but be careful about + // another thread doing a Write and switching logs or + // having us get a different "assigned_seq" value). 
+ + uint64_t tmp_number = versions_->NewFileNumber(); + pending_outputs_.insert(tmp_number); + std::string tmp = TempFileName(dbname_, tmp_number); + WritableFile* file; + Status s = env_->NewWritableFile(tmp, &file); + if (!s.ok()) { + return s; // Caller will delete *final + } + + file->Append(file_bytes); + + s = file->Close(); + delete file; + + if (s.ok()) { + const std::string fname = + LargeValueFileName(dbname_, large_ref); + s = env_->RenameFile(tmp, fname); + } else { + Log(env_, options_.info_log, "Write large value: %s", + s.ToString().c_str()); + } + pending_outputs_.erase(tmp_number); + + if (!s.ok()) { + env_->DeleteFile(tmp); // Cleanup; intentionally ignoring error + return s; // Caller will delete *final + } + } + + // Put an indirect reference in the write batch in place + // of large value + WriteBatchInternal::PutLargeValueRef(*final, it.key(), large_ref); + } + break; + case kTypeLargeValueRef: + return Status::Corruption("Corrupted write batch"); + break; + case kTypeDeletion: + (*final)->Delete(it.key()); + break; + } + seq = seq + 1; + } + } + return Status::OK(); +} + +bool DBImpl::GetProperty(const Slice& property, std::string* value) { + value->clear(); + + MutexLock l(&mutex_); + Slice in = property; + Slice prefix("leveldb."); + if (!in.starts_with(prefix)) return false; + in.remove_prefix(prefix.size()); + + if (in.starts_with("num-files-at-level")) { + in.remove_prefix(strlen("num-files-at-level")); + uint64_t level; + bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); + if (!ok || level < 0 || level >= config::kNumLevels) { + return false; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", versions_->NumLevelFiles(level)); + *value = buf; + return true; + } + } else if (in == "stats") { + char buf[200]; + snprintf(buf, sizeof(buf), + " Compactions\n" + "Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n" + "--------------------------------------------------\n" + ); + value->append(buf); + for (int level = 0; level < config::kNumLevels; level++) { + int files = versions_->NumLevelFiles(level); + if (stats_[level].micros > 0 || files > 0) { + snprintf( + buf, sizeof(buf), + "%3d %8d %8.0f %9.0f %8.0f %9.0f\n", + level, + files, + versions_->NumLevelBytes(level) / 1048576.0, + stats_[level].micros / 1e6, + stats_[level].bytes_read / 1048576.0, + stats_[level].bytes_written / 1048576.0); + value->append(buf); + } + } + return true; + } + + return false; +} + +void DBImpl::GetApproximateSizes( + const Range* range, int n, + uint64_t* sizes) { + // TODO(opt): better implementation + Version* v; + { + MutexLock l(&mutex_); + versions_->current()->Ref(); + v = versions_->current(); + } + + for (int i = 0; i < n; i++) { + // Convert user_key into a corresponding internal key. + InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + uint64_t start = versions_->ApproximateOffsetOf(v, k1); + uint64_t limit = versions_->ApproximateOffsetOf(v, k2); + sizes[i] = (limit >= start ? 
limit - start : 0); + } + + { + MutexLock l(&mutex_); + v->Unref(); + } +} + +// Default implementations of convenience methods that subclasses of DB +// can call if they wish +Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { + WriteBatch batch; + batch.Put(key, value); + return Write(opt, &batch); +} + +Status DB::Delete(const WriteOptions& opt, const Slice& key) { + WriteBatch batch; + batch.Delete(key); + return Write(opt, &batch); +} + +DB::~DB() { } + +Status DB::Open(const Options& options, const std::string& dbname, + DB** dbptr) { + *dbptr = NULL; + + DBImpl* impl = new DBImpl(options, dbname); + impl->mutex_.Lock(); + VersionEdit edit; + Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists + if (s.ok()) { + uint64_t new_log_number = impl->versions_->NewFileNumber(); + WritableFile* lfile; + s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), + &lfile); + if (s.ok()) { + edit.SetLogNumber(new_log_number); + impl->logfile_ = lfile; + impl->log_ = new log::Writer(lfile); + s = impl->versions_->LogAndApply(&edit, NULL); + } + if (s.ok()) { + impl->DeleteObsoleteFiles(); + } + } + impl->mutex_.Unlock(); + if (s.ok()) { + *dbptr = impl; + } else { + delete impl; + } + return s; +} + +Status DestroyDB(const std::string& dbname, const Options& options) { + Env* env = options.env; + std::vector filenames; + // Ignore error in case directory does not exist + env->GetChildren(dbname, &filenames); + if (filenames.empty()) { + return Status::OK(); + } + + FileLock* lock; + Status result = env->LockFile(LockFileName(dbname), &lock); + if (result.ok()) { + uint64_t number; + LargeValueRef large_ref; + FileType type; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type)) { + Status del = env->DeleteFile(dbname + "/" + filenames[i]); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + env->UnlockFile(lock); // Ignore error since state is already gone + env->DeleteFile(LockFileName(dbname)); + env->DeleteDir(dbname); // Ignore error in case dir contains other files + } + return result; +} + +} diff --git a/db/db_impl.h b/db/db_impl.h new file mode 100644 index 0000000..1f685f0 --- /dev/null +++ b/db/db_impl.h @@ -0,0 +1,207 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
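// Aside: an illustrative client-side sketch (not part of this patch) of the
// DB::Open()/Put()/Get() path implemented in db_impl.cc above. Error handling
// is abbreviated, and the path "/tmp/testdb" is an arbitrary example.
#include <cassert>
#include <string>
#include "leveldb/db.h"

int main() {
  leveldb::Options options;
  options.create_if_missing = true;  // Lets Open() call NewDB() for a fresh db.
  leveldb::DB* db = NULL;
  leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
  assert(s.ok());
  s = db->Put(leveldb::WriteOptions(), "key1", "value1");
  assert(s.ok());
  std::string value;
  s = db->Get(leveldb::ReadOptions(), "key1", &value);
  assert(s.ok() && value == "value1");
  delete db;  // ~DBImpl() waits for background work and releases the file lock.
  return 0;
}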
+
+#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
+#define STORAGE_LEVELDB_DB_DB_IMPL_H_
+
+#include <set>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class DBImpl : public DB {
+ public:
+  DBImpl(const Options& options, const std::string& dbname);
+  virtual ~DBImpl();
+
+  // Implementations of the DB interface
+  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+  virtual Status Delete(const WriteOptions&, const Slice& key);
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value);
+  virtual Iterator* NewIterator(const ReadOptions&);
+  virtual const Snapshot* GetSnapshot();
+  virtual void ReleaseSnapshot(const Snapshot* snapshot);
+  virtual bool GetProperty(const Slice& property, std::string* value);
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
+
+  // Extra methods (for testing) that are not in the public DB interface
+
+  // Compact any files in the named level that overlap [begin,end]
+  void TEST_CompactRange(
+      int level,
+      const std::string& begin,
+      const std::string& end);
+
+  // Force current memtable contents to be compacted.
+  Status TEST_CompactMemTable();
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  Iterator* TEST_NewInternalIterator();
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t TEST_MaxNextLevelOverlappingBytes();
+
+ private:
+  friend class DB;
+
+  Iterator* NewInternalIterator(const ReadOptions&,
+                                SequenceNumber* latest_snapshot);
+
+  Status NewDB();
+
+  // Recover the descriptor from persistent storage.  May do a significant
+  // amount of work to recover recently logged updates.  Any changes to
+  // be made to the descriptor are added to *edit.
+  Status Recover(VersionEdit* edit);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+
+  // Called when an iterator over a particular version of the
+  // descriptor goes away.
+  static void Unref(void* arg1, void* arg2);
+
+  // Compact the in-memory write buffer to disk.  Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful.
+  Status CompactMemTable();
+
+  Status RecoverLogFile(uint64_t log_number,
+                        VersionEdit* edit,
+                        SequenceNumber* max_sequence);
+
+  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);
+
+  Status MakeRoomForWrite(bool force /* compact even if there is room? */);
+  bool HasLargeValues(const WriteBatch& batch) const;
+
+  // Process data in "*updates" and return a status.  "assigned_seq"
+  // is the sequence number assigned to the first mod in "*updates".
+  // If no large values are encountered, "*final" is set to "updates".
+  // If large values were encountered, registers the references of the
+  // large values with the VersionSet, writes the large values to
+  // files (if appropriate), and allocates a new WriteBatch with the
+  // large values replaced with indirect references and stores a
+  // pointer to the new WriteBatch in *final.  If *final != updates on
+  // return, then the client should delete *final when no longer
+  // needed.  Returns OK on success, and an appropriate error
+  // otherwise.
+  Status HandleLargeValues(SequenceNumber assigned_seq,
+                           WriteBatch* updates,
+                           WriteBatch** final);
+
+  // Helper routine for HandleLargeValues
+  void MaybeCompressLargeValue(
+      const Slice& raw_value,
+      Slice* file_bytes,
+      std::string* scratch,
+      LargeValueRef* ref);
+
+  struct CompactionState;
+
+  void MaybeScheduleCompaction();
+  static void BGWork(void* db);
+  void BackgroundCall();
+  void BackgroundCompaction();
+  void CleanupCompaction(CompactionState* compact);
+  Status DoCompactionWork(CompactionState* compact);
+
+  Status OpenCompactionOutputFile(CompactionState* compact);
+  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+  Status InstallCompactionResults(CompactionState* compact);
+
+  // Constant after construction
+  Env* const env_;
+  const InternalKeyComparator internal_comparator_;
+  const Options options_;  // options_.comparator == &internal_comparator_
+  bool owns_info_log_;
+  bool owns_cache_;
+  const std::string dbname_;
+
+  // table_cache_ provides its own synchronization
+  TableCache* table_cache_;
+
+  // Lock over the persistent DB state.  Non-NULL iff successfully acquired.
+  FileLock* db_lock_;
+
+  // State below is protected by mutex_
+  port::Mutex mutex_;
+  port::AtomicPointer shutting_down_;
+  port::CondVar bg_cv_;          // Signalled when !bg_compaction_scheduled_
+  port::CondVar compacting_cv_;  // Signalled when !compacting_
+  MemTable* mem_;
+  MemTable* imm_;                // Memtable being compacted
+  port::AtomicPointer has_imm_;  // So bg thread can detect non-NULL imm_
+  WritableFile* logfile_;
+  log::Writer* log_;
+  SnapshotList snapshots_;
+
+  // Set of table files to protect from deletion because they are
+  // part of ongoing compactions.
+  std::set<uint64_t> pending_outputs_;
+
+  // Has a background compaction been scheduled or is running?
+  bool bg_compaction_scheduled_;
+
+  // Is there a compaction running?
+  bool compacting_;
+
+  VersionSet* versions_;
+
+  // Have we encountered a background error in paranoid mode?
+  Status bg_error_;
+
+  // Per level compaction stats.  stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    int64_t micros;
+    int64_t bytes_read;
+    int64_t bytes_written;
+
+    CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { }
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->bytes_read += c.bytes_read;
+      this->bytes_written += c.bytes_written;
+    }
+  };
+  CompactionStats stats_[config::kNumLevels];
+
+  // No copying allowed
+  DBImpl(const DBImpl&);
+  void operator=(const DBImpl&);
+
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+};
+
+// Sanitize db options.  The caller should delete result.info_log if
+// it is not equal to src.info_log.
+extern Options SanitizeOptions(const std::string& db,
+                               const InternalKeyComparator* icmp,
+                               const Options& src);
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_DB_IMPL_H_
diff --git a/db/db_iter.cc b/db/db_iter.cc
new file mode 100644
index 0000000..31c2a38
--- /dev/null
+++ b/db/db_iter.cc
@@ -0,0 +1,397 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
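// Aside: a standalone sketch (not part of this patch) of the internal-key
// layout that DBIter below consumes: a user key followed by an 8-byte
// little-endian trailer packing (sequence << 8) | type, as defined in
// db/dbformat.h. The enum and helper names here are simplified stand-ins.
#include <cassert>
#include <cstdint>
#include <string>

enum SketchValueType { kSketchDeletion = 0, kSketchValue = 1 };

static std::string MakeInternalKey(const std::string& user_key,
                                   uint64_t sequence, SketchValueType type) {
  uint64_t packed = (sequence << 8) | static_cast<uint64_t>(type);
  std::string result = user_key;
  for (int i = 0; i < 8; i++) {
    result.push_back(static_cast<char>((packed >> (8 * i)) & 0xff));  // Fixed64.
  }
  return result;
}

int main() {
  std::string k = MakeInternalKey("foo", 7, kSketchValue);
  assert(k.size() == 3 + 8);
  assert(static_cast<unsigned char>(k[3]) == kSketchValue);  // Low byte = type.
  return 0;
}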
+ +#include "db/db_iter.h" + +#include "db/filename.h" +#include "db/dbformat.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/mutexlock.h" + +namespace leveldb { + +#if 0 +static void DumpInternalIter(Iterator* iter) { + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey k; + if (!ParseInternalKey(iter->key(), &k)) { + fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); + } else { + fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); + } + } +} +#endif + +namespace { + +// Memtables and sstables that make the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter: public Iterator { + public: + // Which direction is the iterator currently moving? + // (1) When moving forward, the internal iterator is positioned at + // the exact entry that yields this->key(), this->value() + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). + enum Direction { + kForward, + kReverse + }; + + DBIter(const std::string* dbname, Env* env, + const Comparator* cmp, Iterator* iter, SequenceNumber s) + : dbname_(dbname), + env_(env), + user_comparator_(cmp), + iter_(iter), + sequence_(s), + large_(NULL), + direction_(kForward), + valid_(false) { + } + virtual ~DBIter() { + delete iter_; + delete large_; + } + virtual bool Valid() const { return valid_; } + virtual Slice key() const { + assert(valid_); + return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_; + } + virtual Slice value() const { + assert(valid_); + Slice raw_value = (direction_ == kForward) ? 
iter_->value() : saved_value_; + if (large_ == NULL) { + return raw_value; + } else { + MutexLock l(&large_->mutex); + if (!large_->produced) { + ReadIndirectValue(raw_value); + } + return large_->value; + } + } + virtual Status status() const { + if (status_.ok()) { + if (large_ != NULL && !large_->status.ok()) return large_->status; + return iter_->status(); + } else { + return status_; + } + } + + virtual void Next(); + virtual void Prev(); + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + + private: + struct Large { + port::Mutex mutex; + std::string value; + bool produced; + Status status; + }; + + void FindNextUserEntry(bool skipping, std::string* skip); + void FindPrevUserEntry(); + bool ParseKey(ParsedInternalKey* key); + void ReadIndirectValue(Slice ref) const; + + inline void SaveKey(const Slice& k, std::string* dst) { + dst->assign(k.data(), k.size()); + } + + inline void ForgetLargeValue() { + if (large_ != NULL) { + delete large_; + large_ = NULL; + } + } + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + + const std::string* const dbname_; + Env* const env_; + const Comparator* const user_comparator_; + Iterator* const iter_; + SequenceNumber const sequence_; + + Status status_; + std::string saved_key_; // == current key when direction_==kReverse + std::string saved_value_; // == current raw value when direction_==kReverse + Large* large_; // Non-NULL if value is an indirect reference + Direction direction_; + bool valid_; + + // No copying allowed + DBIter(const DBIter&); + void operator=(const DBIter&); +}; + +inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { + if (!ParseInternalKey(iter_->key(), ikey)) { + status_ = Status::Corruption("corrupted internal key in DBIter"); + return false; + } else { + return true; + } +} + +void DBIter::Next() { + assert(valid_); + ForgetLargeValue(); + + if (direction_ == kReverse) { // Switch directions? + direction_ = kForward; + // iter_ is pointing just before the entries for this->key(), + // so advance into the range of entries for this->key() and then + // use the normal skipping code below. + if (!iter_->Valid()) { + iter_->SeekToFirst(); + } else { + iter_->Next(); + } + if (!iter_->Valid()) { + valid_ = false; + saved_key_.clear(); + return; + } + } + + // Temporarily use saved_key_ as storage for key to skip. + std::string* skip = &saved_key_; + SaveKey(ExtractUserKey(iter_->key()), skip); + FindNextUserEntry(true, skip); +} + +void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { + // Loop until we hit an acceptable entry to yield + assert(iter_->Valid()); + assert(direction_ == kForward); + assert(large_ == NULL); + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. 
+ SaveKey(ikey.user_key, skip); + skipping = true; + break; + case kTypeValue: + case kTypeLargeValueRef: + if (skipping && + user_comparator_->Compare(ikey.user_key, *skip) <= 0) { + // Entry hidden + } else { + valid_ = true; + saved_key_.clear(); + if (ikey.type == kTypeLargeValueRef) { + large_ = new Large; + large_->produced = false; + } + return; + } + break; + } + } + iter_->Next(); + } while (iter_->Valid()); + saved_key_.clear(); + valid_ = false; +} + +void DBIter::Prev() { + assert(valid_); + ForgetLargeValue(); + + if (direction_ == kForward) { // Switch directions? + // iter_ is pointing at the current entry. Scan backwards until + // the key changes so we can use the normal reverse scanning code. + assert(iter_->Valid()); // Otherwise valid_ would have been false + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); + while (true) { + iter_->Prev(); + if (!iter_->Valid()) { + valid_ = false; + saved_key_.clear(); + ClearSavedValue(); + return; + } + if (user_comparator_->Compare(ExtractUserKey(iter_->key()), + saved_key_) < 0) { + break; + } + } + direction_ = kReverse; + } + + FindPrevUserEntry(); +} + +void DBIter::FindPrevUserEntry() { + assert(direction_ == kReverse); + assert(large_ == NULL); + + ValueType value_type = kTypeDeletion; + if (iter_->Valid()) { + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if ((value_type != kTypeDeletion) && + user_comparator_->Compare(ikey.user_key, saved_key_) < 0) { + // We encountered a non-deleted value in entries for previous keys, + break; + } + value_type = ikey.type; + if (value_type == kTypeDeletion) { + ClearSavedValue(); + } else { + Slice raw_value = iter_->value(); + if (saved_value_.capacity() > raw_value.size() + 1048576) { + std::string empty; + swap(empty, saved_value_); + } + saved_value_.assign(raw_value.data(), raw_value.size()); + } + } + iter_->Prev(); + } while (iter_->Valid()); + } + + if (value_type == kTypeDeletion) { + // End + valid_ = false; + saved_key_.clear(); + ClearSavedValue(); + direction_ = kForward; + } else { + valid_ = true; + if (value_type == kTypeLargeValueRef) { + large_ = new Large; + large_->produced = false; + } + } +} + +void DBIter::Seek(const Slice& target) { + direction_ = kForward; + ForgetLargeValue(); + ClearSavedValue(); + saved_key_.clear(); + AppendInternalKey( + &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); + iter_->Seek(saved_key_); + if (iter_->Valid()) { + FindNextUserEntry(false, &saved_key_ /* temporary storage */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToFirst() { + direction_ = kForward; + ForgetLargeValue(); + ClearSavedValue(); + iter_->SeekToFirst(); + if (iter_->Valid()) { + FindNextUserEntry(false, &saved_key_ /* temporary storage */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToLast() { + direction_ = kReverse; + ForgetLargeValue(); + ClearSavedValue(); + iter_->SeekToLast(); + FindPrevUserEntry(); +} + +void DBIter::ReadIndirectValue(Slice ref) const { + assert(!large_->produced); + large_->produced = true; + LargeValueRef large_ref; + if (ref.size() != LargeValueRef::ByteSize()) { + large_->status = Status::Corruption("malformed large value reference"); + return; + } + memcpy(large_ref.data, ref.data(), LargeValueRef::ByteSize()); + std::string fname = LargeValueFileName(*dbname_, large_ref); + RandomAccessFile* file; + Status s = env_->NewRandomAccessFile(fname, &file); + uint64_t file_size = 0; + if (s.ok()) { + s 
= env_->GetFileSize(fname, &file_size);
+  }
+  if (s.ok()) {
+    uint64_t value_size = large_ref.ValueSize();
+    large_->value.resize(value_size);
+    Slice result;
+    s = file->Read(0, file_size, &result,
+                   const_cast<char*>(large_->value.data()));
+    if (s.ok()) {
+      if (result.size() == file_size) {
+        switch (large_ref.compression_type()) {
+          case kNoCompression: {
+            if (result.data() != large_->value.data()) {
+              large_->value.assign(result.data(), result.size());
+            }
+            break;
+          }
+          case kSnappyCompression: {
+            std::string uncompressed;
+            if (port::Snappy_Uncompress(result.data(), result.size(),
+                                        &uncompressed) &&
+                uncompressed.size() == large_ref.ValueSize()) {
+              swap(uncompressed, large_->value);
+            } else {
+              s = Status::Corruption(
+                  "Unable to read entire compressed large value file");
+            }
+          }
+        }
+      } else {
+        s = Status::Corruption("Unable to read entire large value file");
+      }
+    }
+    delete file;  // Ignore errors on closing
+  }
+  if (!s.ok()) {
+    large_->value.clear();
+    large_->status = s;
+  }
+}
+
+}  // anonymous namespace
+
+Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Comparator* user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence) {
+  return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
+}
+
+}
diff --git a/db/db_iter.h b/db/db_iter.h
new file mode 100644
index 0000000..195f3d3
--- /dev/null
+++ b/db/db_iter.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_
+#define STORAGE_LEVELDB_DB_DB_ITER_H_
+
+#include <stdint.h>
+#include "leveldb/db.h"
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified "sequence" number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Comparator* user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence);
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_DB_ITER_H_
diff --git a/db/db_test.cc b/db/db_test.cc
new file mode 100644
index 0000000..04de331
--- /dev/null
+++ b/db/db_test.cc
@@ -0,0 +1,1211 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
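// Aside: a small sketch (not part of this patch) of the compression
// acceptance rule used for large values: MaybeCompressLargeValue() in
// db_impl.cc keeps the Snappy output only when it is smaller than 7/8 of the
// raw size (i.e. saves at least 12.5%); otherwise the value is stored raw and
// ReadIndirectValue() above skips the decompression step.
#include <cassert>
#include <cstddef>

static bool AcceptCompressed(size_t raw_size, size_t compressed_size) {
  return compressed_size < (raw_size / 8) * 7;  // Same cutoff as db_impl.cc.
}

int main() {
  assert(AcceptCompressed(1024, 800));   // Cutoff is 896: 800 qualifies.
  assert(!AcceptCompressed(1024, 900));  // 900 >= 896: keep the raw bytes.
  return 0;
}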
+ +#include "leveldb/db.h" + +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "leveldb/env.h" +#include "leveldb/table.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +class DBTest { + public: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + + DBTest() : env_(Env::Default()) { + dbname_ = test::TmpDir() + "/db_test"; + DestroyDB(dbname_, Options()); + db_ = NULL; + Reopen(); + } + + ~DBTest() { + delete db_; + DestroyDB(dbname_, Options()); + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = NULL) { + ASSERT_OK(TryReopen(options)); + } + + void DestroyAndReopen(Options* options = NULL) { + delete db_; + db_ = NULL; + DestroyDB(dbname_, Options()); + ASSERT_OK(TryReopen(options)); + } + + Status TryReopen(Options* options) { + delete db_; + db_ = NULL; + Options opts; + if (options != NULL) { + opts = *options; + } else { + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const std::string& k, const std::string& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + std::string AllEntriesFor(const Slice& user_key) { + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); + iter->Seek(target.Encode()); + std::string result; + if (!iter->status().ok()) { + result = iter->status().ToString(); + } else { + result = "[ "; + bool first = true; + while (iter->Valid()) { + ParsedInternalKey ikey; + if (!ParseInternalKey(iter->key(), &ikey)) { + result += "CORRUPTED"; + } else { + if (last_options_.comparator->Compare( + ikey.user_key, user_key) != 0) { + break; + } + if (!first) { + result += ", "; + } + first = false; + switch (ikey.type) { + case kTypeValue: + result += iter->value().ToString(); + break; + case kTypeLargeValueRef: + result += "LARGEVALUE(" + EscapeString(iter->value()) + ")"; + break; + case kTypeDeletion: + result += "DEL"; + break; + } + } + iter->Next(); + } + if (!first) { + result += " "; + } + result += "]"; + } + delete iter; + return result; + } + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + uint64_t Size(const Slice& start, const Slice& limit) { + Range r(start, limit); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + std::set LargeValueFiles() const { + // Return the set of large value files that exist in the database + std::vector filenames; + env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose + uint64_t number; + LargeValueRef large_ref; + FileType type; + std::set live; + for (int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &large_ref, &type) && + type 
== kLargeValueFile) { + fprintf(stderr, " live: %s\n", + LargeValueRefToFilenameString(large_ref).c_str()); + live.insert(large_ref); + } + } + fprintf(stderr, "Found %d live large value files\n", (int)live.size()); + return live; + } + + void Compact(const Slice& start, const Slice& limit) { + dbfull()->TEST_CompactMemTable(); + int max_level_with_files = 1; + for (int level = 1; level < config::kNumLevels; level++) { + if (NumTableFilesAtLevel(level) > 0) { + max_level_with_files = level; + } + } + for (int level = 0; level < max_level_with_files; level++) { + dbfull()->TEST_CompactRange(level, "", "~"); + } + } + + void DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %lld\n", + static_cast( + dbfull()->TEST_MaxNextLevelOverlappingBytes())); + for (int level = 0; level < config::kNumLevels; level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST(DBTest, Empty) { + ASSERT_TRUE(db_ != NULL); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST(DBTest, ReadWrite) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); +} + +TEST(DBTest, PutDeleteGet) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + ASSERT_OK(db_->Delete(WriteOptions(), "foo")); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST(DBTest, IterEmpty) { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterSingle) { + ASSERT_OK(Put("a", "va")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterMulti) { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", "vb")); + ASSERT_OK(Put("c", "vc")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + 
ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + iter->Next(); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Make sure iter stays at snapshot + ASSERT_OK(Put("a", "va2")); + ASSERT_OK(Put("a2", "va3")); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Put("c", "vc2")); + ASSERT_OK(Delete("b")); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, IterSmallAndLargeMix) { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", std::string(100000, 'b'))); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Put("d", std::string(100000, 'd'))); + ASSERT_OK(Put("e", std::string(100000, 'e'))); + + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; +} + +TEST(DBTest, Recover) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("baz", "v5")); + + Reopen(); + ASSERT_EQ("v1", Get("foo")); + + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v5", Get("baz")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + + Reopen(); + ASSERT_EQ("v3", Get("foo")); + ASSERT_OK(Put("foo", "v4")); + ASSERT_EQ("v4", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ("v5", Get("baz")); +} + +TEST(DBTest, RecoveryWithEmptyLog) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("foo", "v2")); + Reopen(); + Reopen(); + ASSERT_OK(Put("foo", "v3")); + Reopen(); + ASSERT_EQ("v3", Get("foo")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, 
sizeof(buf), "key%06d", i); + return std::string(buf); +} + +TEST(DBTest, MinorCompactionsHappen) { + Options options; + options.write_buffer_size = 10000; + Reopen(&options); + + const int N = 500; + + int starting_num_tables = NumTableFilesAtLevel(0); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); + } + int ending_num_tables = NumTableFilesAtLevel(0); + ASSERT_GT(ending_num_tables, starting_num_tables); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } + + Reopen(); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } +} + +TEST(DBTest, RecoverWithLargeLog) { + { + Options options; + options.large_value_threshold = 1048576; + Reopen(&options); + ASSERT_OK(Put("big1", std::string(200000, '1'))); + ASSERT_OK(Put("big2", std::string(200000, '2'))); + ASSERT_OK(Put("small3", std::string(10, '3'))); + ASSERT_OK(Put("small4", std::string(10, '4'))); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + } + + // Make sure that if we re-open with a small write buffer size that + // we flush table files in the middle of a large log file. + Options options; + options.write_buffer_size = 100000; + options.large_value_threshold = 1048576; + Reopen(&options); + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + ASSERT_EQ(std::string(200000, '1'), Get("big1")); + ASSERT_EQ(std::string(200000, '2'), Get("big2")); + ASSERT_EQ(std::string(10, '3'), Get("small3")); + ASSERT_EQ(std::string(10, '4'), Get("small4")); + ASSERT_GT(NumTableFilesAtLevel(0), 1); +} + +TEST(DBTest, CompactionsGenerateMultipleFiles) { + Options options; + options.write_buffer_size = 100000000; // Large write buffer + options.large_value_threshold = 1048576; + Reopen(&options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(Key(i), values[i])); + } + + // Reopening moves updates to level-0 + Reopen(&options); + dbfull()->TEST_CompactRange(0, "", Key(100000)); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 1); + for (int i = 0; i < 80; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST(DBTest, SparseMerge) { + Options options; + options.compression = kNoCompression; + Reopen(&options); + + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. + const std::string value(1000, 'x'); + Put("A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + Put("C", "vc"); + Compact("", "z"); + + // Make sparse update + Put("A", "va2"); + Put("B100", "bvalue2"); + Put("C", "vc2"); + dbfull()->TEST_CompactMemTable(); + + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. 
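+  // (Editor's note) TEST_MaxNextLevelOverlappingBytes() reports, for every
+  // file f at a level L, the total size of the level-(L+1) files whose key
+  // ranges intersect f's range, maximized over all f. A minimal sketch of
+  // that quantity, assuming a hypothetical FileRange {smallest, largest,
+  // bytes} in place of the FileMetaData bookkeeping in version_set.cc:
+  //
+  //   uint64_t MaxNextLevelOverlap(const std::vector<FileRange>& level,
+  //                                const std::vector<FileRange>& next) {
+  //     uint64_t max_overlap = 0;
+  //     for (size_t i = 0; i < level.size(); i++) {
+  //       uint64_t sum = 0;
+  //       for (size_t j = 0; j < next.size(); j++) {
+  //         // Ranges overlap unless one ends before the other begins.
+  //         if (!(next[j].largest < level[i].smallest ||
+  //               level[i].largest < next[j].smallest)) {
+  //           sum += next[j].bytes;
+  //         }
+  //       }
+  //       if (sum > max_overlap) max_overlap = sum;
+  //     }
+  //     return max_overlap;
+  //   }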
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); +} + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +TEST(DBTest, ApproximateSizes) { + for (int test = 0; test < 2; test++) { + // test==0: default large_value_threshold + // test==1: 1 MB large_value_threshold + Options options; + options.large_value_threshold = (test == 0) ? 65536 : 1048576; + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + DestroyAndReopen(); + + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + Reopen(&options); + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + const int N = 80; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); + } + if (test == 1) { + // 0 because GetApproximateSizes() does not account for memtable space for + // non-large values + ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); + } else { + ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000)); + ASSERT_TRUE(Between(Size(Key(20), Key(30)), + 100000*10, 100000*10 + 10000)); + } + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); + ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), + 100000 * (i+1), 100000 * (i+1) + 10000)); + ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), + 100000 * 10, 100000 * 10 + 10000)); + } + ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); + ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); + + dbfull()->TEST_CompactRange(0, + Key(compact_start), + Key(compact_start + 9)); + } + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + } + } +} + +TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { + Options options; + options.large_value_threshold = 65536; + options.compression = kNoCompression; + Reopen(); + + Random rnd(301); + std::string big1 = RandomString(&rnd, 100000); + ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(2), big1)); + ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(4), big1)); + ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); + ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); + ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6)), 
240000, 241000)); + ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000)); + + ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); + + dbfull()->TEST_CompactRange(0, Key(0), Key(100)); + } +} + +TEST(DBTest, IteratorPinsRef) { + Put("foo", "hello"); + + // Get iterator that will yield the current contents of the DB. + Iterator* iter = db_->NewIterator(ReadOptions()); + + // Write to force compactions + Put("foo", "newvalue1"); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values + } + Put("foo", "newvalue2"); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("hello", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + delete iter; +} + +TEST(DBTest, Snapshot) { + Put("foo", "v1"); + const Snapshot* s1 = db_->GetSnapshot(); + Put("foo", "v2"); + const Snapshot* s2 = db_->GetSnapshot(); + Put("foo", "v3"); + const Snapshot* s3 = db_->GetSnapshot(); + + Put("foo", "v4"); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v3", Get("foo", s3)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s3); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s1); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s2); + ASSERT_EQ("v4", Get("foo")); +} + +TEST(DBTest, HiddenValuesAreRemoved) { + Random rnd(301); + std::string big = RandomString(&rnd, 50000); + Put("foo", big); + Put("pastfoo", "v"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put("foo", "tiny"); + Put("pastfoo2", "v2"); // Advance sequence number one more + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_GT(NumTableFilesAtLevel(0), 0); + + ASSERT_EQ(big, Get("foo", snapshot)); + ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); + dbfull()->TEST_CompactRange(0, "", "x"); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GE(NumTableFilesAtLevel(1), 1); + dbfull()->TEST_CompactRange(1, "", "x"); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + + ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); +} + +TEST(DBTest, DeletionMarkers1) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + dbfull()->TEST_CompactRange(0, "", "z"); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file + Delete("foo"); + Put("foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + dbfull()->TEST_CompactRange(0, "", "z"); + // DEL eliminated, but v1 remains because we aren't compacting that level + // (DEL can be eliminated because v2 hides v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + dbfull()->TEST_CompactRange(1, "", "z"); + // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. + // (as is v1). 
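+  // (Editor's note) A deletion marker can only be dropped once the
+  // compaction output is at the base level for its key, i.e. no deeper
+  // level can still hold an older value that the marker must continue
+  // to shadow. The state trace this test expects is:
+  //   [ v2, DEL, v1 ] -> compact L0 -> [ v2, v1 ]
+  //                   -> compact L1 (base level) -> [ v2 ]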
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); +} + +TEST(DBTest, DeletionMarkers2) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + dbfull()->TEST_CompactRange(0, "", "z"); + dbfull()->TEST_CompactRange(1, "", "z"); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file + Delete("foo"); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(0, "", "z"); + // DEL kept: L2 file overlaps + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(1, "", "z"); + // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. + // (as is v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); +} + +TEST(DBTest, ComparatorCheck) { + class NewComparator : public Comparator { + public: + virtual const char* Name() const { return "leveldb.NewComparator"; } + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* s, const Slice& l) const { + BytewiseComparator()->FindShortestSeparator(s, l); + } + virtual void FindShortSuccessor(std::string* key) const { + BytewiseComparator()->FindShortSuccessor(key); + } + }; + NewComparator cmp; + Options new_options; + new_options.comparator = &cmp; + Status s = TryReopen(&new_options); + ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) + << s.ToString(); +} + +static bool LargeValuesOK(DBTest* db, + const std::set& expected) { + std::set actual = db->LargeValueFiles(); + if (actual.size() != expected.size()) { + fprintf(stderr, "Sets differ in size: %d vs %d\n", + (int)actual.size(), (int)expected.size()); + return false; + } + for (std::set::const_iterator it = expected.begin(); + it != expected.end(); + ++it) { + if (actual.count(*it) != 1) { + fprintf(stderr, " key '%s' not found in actual set\n", + LargeValueRefToFilenameString(*it).c_str()); + return false; + } + } + return true; +} + +TEST(DBTest, LargeValues1) { + Options options; + options.large_value_threshold = 10000; + Reopen(&options); + + Random rnd(301); + + std::string big1; + test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible + std::set expected; + + ASSERT_OK(Put("big1", big1)); + expected.insert(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Delete("big1")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + // No handling of deletion markers on memtable compactions, so big1 remains + ASSERT_TRUE(LargeValuesOK(this, expected)); + + dbfull()->TEST_CompactRange(0, "", "z"); + expected.erase(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); +} + +static bool SnappyCompressionSupported() { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(in.data(), in.size(), &out); +} + +TEST(DBTest, LargeValues2) { + Options options; + options.large_value_threshold = 10000; + Reopen(&options); + + Random rnd(301); + + std::string big1, big2; + test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible + test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible + std::set expected; + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Put("big1", big1)); + expected.insert(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_EQ(big1, Get("big1")); + 
ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Put("big2", big2)); + ASSERT_EQ(big2, Get("big2")); + if (SnappyCompressionSupported()) { + expected.insert(LargeValueRef::Make(big2, kSnappyCompression)); + } else { + expected.insert(LargeValueRef::Make(big2, kNoCompression)); + } + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Put("big2", big2)); + ASSERT_OK(Put("big2_b", big2)); + ASSERT_EQ(big1, Get("big1")); + ASSERT_EQ(big2, Get("big2")); + ASSERT_EQ(big2, Get("big2_b")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(Delete("big1")); + ASSERT_EQ("NOT_FOUND", Get("big1")); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_TRUE(LargeValuesOK(this, expected)); + dbfull()->TEST_CompactRange(0, "", "z"); + expected.erase(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); + dbfull()->TEST_CompactRange(1, "", "z"); + + ASSERT_OK(Delete("big2")); + ASSERT_EQ("NOT_FOUND", Get("big2")); + ASSERT_EQ(big2, Get("big2_b")); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_TRUE(LargeValuesOK(this, expected)); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_TRUE(LargeValuesOK(this, expected)); + + // Make sure the large value refs survive a reload and compactions after + // the reload. + Reopen(); + ASSERT_TRUE(LargeValuesOK(this, expected)); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + dbfull()->TEST_CompactRange(0, "", "z"); + ASSERT_TRUE(LargeValuesOK(this, expected)); +} + +TEST(DBTest, LargeValues3) { + // Make sure we don't compress values if + Options options; + options.large_value_threshold = 10000; + options.compression = kNoCompression; + Reopen(&options); + + Random rnd(301); + + std::string big1 = std::string(100000, 'x'); // Very compressible + std::set expected; + + ASSERT_OK(Put("big1", big1)); + ASSERT_EQ(big1, Get("big1")); + expected.insert(LargeValueRef::Make(big1, kNoCompression)); + ASSERT_TRUE(LargeValuesOK(this, expected)); +} + + +TEST(DBTest, DBOpen_Options) { + std::string dbname = test::TmpDir() + "/db_options_test"; + DestroyDB(dbname, Options()); + + // Does not exist, and create_if_missing == false: error + DB* db = NULL; + Options opts; + opts.create_if_missing = false; + Status s = DB::Open(opts, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL); + ASSERT_TRUE(db == NULL); + + // Does not exist, and create_if_missing == true: OK + opts.create_if_missing = true; + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != NULL); + + delete db; + db = NULL; + + // Does exist, and error_if_exists == true: error + opts.create_if_missing = false; + opts.error_if_exists = true; + s = DB::Open(opts, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL); + ASSERT_TRUE(db == NULL); + + // Does exist, and error_if_exists == false: OK + opts.create_if_missing = true; + opts.error_if_exists = false; + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != NULL); + + delete db; + db = NULL; +} + +class ModelDB: public DB { + public: + explicit ModelDB(const Options& options): options_(options) { } + ~ModelDB() { } + virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { + return DB::Put(o, k, v); + } + virtual Status 
Delete(const WriteOptions& o, const Slice& key) { + return DB::Delete(o, key); + } + virtual Status Get(const ReadOptions& options, + const Slice& key, std::string* value) { + assert(false); // Not implemented + return Status::NotFound(key); + } + virtual Iterator* NewIterator(const ReadOptions& options) { + if (options.snapshot == NULL) { + KVMap* saved = new KVMap; + *saved = map_; + return new ModelIter(saved, true); + } else { + const KVMap* snapshot_state = + reinterpret_cast(options.snapshot->number_); + return new ModelIter(snapshot_state, false); + } + } + virtual const Snapshot* GetSnapshot() { + KVMap* saved = new KVMap; + *saved = map_; + return snapshots_.New( + reinterpret_cast(saved)); + } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) { + const KVMap* saved = reinterpret_cast(snapshot->number_); + delete saved; + snapshots_.Delete(snapshot); + } + virtual Status Write(const WriteOptions& options, WriteBatch* batch) { + assert(options.post_write_snapshot == NULL); // Not supported + for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) { + switch (it.op()) { + case kTypeValue: + map_[it.key().ToString()] = it.value().ToString(); + break; + case kTypeLargeValueRef: + assert(false); // Should not occur + break; + case kTypeDeletion: + map_.erase(it.key().ToString()); + break; + } + } + return Status::OK(); + } + + virtual bool GetProperty(const Slice& property, std::string* value) { + return false; + } + virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { + for (int i = 0; i < n; i++) { + sizes[i] = 0; + } + } + private: + typedef std::map KVMap; + class ModelIter: public Iterator { + public: + ModelIter(const KVMap* map, bool owned) + : map_(map), owned_(owned), iter_(map_->end()) { + } + ~ModelIter() { + if (owned_) delete map_; + } + virtual bool Valid() const { return iter_ != map_->end(); } + virtual void SeekToFirst() { iter_ = map_->begin(); } + virtual void SeekToLast() { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) { + iter_ = map_->lower_bound(k.ToString()); + } + virtual void Next() { ++iter_; } + virtual void Prev() { --iter_; } + virtual Slice key() const { return iter_->first; } + virtual Slice value() const { return iter_->second; } + virtual Status status() const { return Status::OK(); } + private: + const KVMap* const map_; + const bool owned_; // Do we own map_ + KVMap::const_iterator iter_; + }; + const Options options_; + KVMap map_; + SnapshotList snapshots_; +}; + +static std::string RandomKey(Random* rnd) { + int len = (rnd->OneIn(3) + ? 1 // Short sometimes to encourage collisions + : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); + return test::RandomKey(rnd, len); +} + +static bool CompareIterators(int step, + DB* model, + DB* db, + const Snapshot* model_snap, + const Snapshot* db_snap) { + ReadOptions options; + options.snapshot = model_snap; + Iterator* miter = model->NewIterator(options); + options.snapshot = db_snap; + Iterator* dbiter = db->NewIterator(options); + bool ok = true; + int count = 0; + for (miter->SeekToFirst(), dbiter->SeekToFirst(); + ok && miter->Valid() && dbiter->Valid(); + miter->Next(), dbiter->Next()) { + count++; + if (miter->key().compare(dbiter->key()) != 0) { + fprintf(stderr, "step %d: Key mismatch: '%s' vs. 
'%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(dbiter->key()).c_str()); + ok = false; + break; + } + + if (miter->value().compare(dbiter->value()) != 0) { + fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(miter->value()).c_str(), + EscapeString(miter->value()).c_str()); + ok = false; + } + } + + if (ok) { + if (miter->Valid() != dbiter->Valid()) { + fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", + step, miter->Valid(), dbiter->Valid()); + ok = false; + } + } + fprintf(stderr, "%d entries compared: ok=%d\n", count, ok); + delete miter; + delete dbiter; + return ok; +} + +TEST(DBTest, Randomized) { + Random rnd(test::RandomSeed()); + ModelDB model(last_options_); + const int N = 10000; + const Snapshot* model_snap = NULL; + const Snapshot* db_snap = NULL; + std::string k, v; + for (int step = 0; step < N; step++) { + if (step % 100 == 0) { + fprintf(stderr, "Step %d of %d\n", step, N); + } + int p = rnd.Uniform(100); + if (p < 45) { // Put + k = RandomKey(&rnd); + v = RandomString(&rnd, + rnd.OneIn(20) + ? 100 + rnd.Uniform(100) + : rnd.Uniform(8)); + ASSERT_OK(model.Put(WriteOptions(), k, v)); + ASSERT_OK(db_->Put(WriteOptions(), k, v)); + + } else if (p < 90) { // Delete + k = RandomKey(&rnd); + ASSERT_OK(model.Delete(WriteOptions(), k)); + ASSERT_OK(db_->Delete(WriteOptions(), k)); + + + } else { // Multi-element batch + WriteBatch b; + const int num = rnd.Uniform(8); + for (int i = 0; i < num; i++) { + if (i == 0 || !rnd.OneIn(10)) { + k = RandomKey(&rnd); + } else { + // Periodically re-use the same key from the previous iter, so + // we have multiple entries in the write batch for the same key + } + if (rnd.OneIn(2)) { + v = RandomString(&rnd, rnd.Uniform(10)); + b.Put(k, v); + } else { + b.Delete(k); + } + } + ASSERT_OK(model.Write(WriteOptions(), &b)); + ASSERT_OK(db_->Write(WriteOptions(), &b)); + } + + if ((step % 100) == 0) { + ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + // Save a snapshot from each DB this time that we'll use next + // time we compare things, to make sure the current state is + // preserved with the snapshot + if (model_snap != NULL) model.ReleaseSnapshot(model_snap); + if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); + + Reopen(); + ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); + + model_snap = model.GetSnapshot(); + db_snap = db_->GetSnapshot(); + } + } + if (model_snap != NULL) model.ReleaseSnapshot(model_snap); + if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/dbformat.cc b/db/dbformat.cc new file mode 100644 index 0000000..2664eb4 --- /dev/null +++ b/db/dbformat.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include "db/dbformat.h" +#include "port/port.h" +#include "util/coding.h" + +namespace leveldb { + +static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(t <= kValueTypeForSeek); + return (seq << 8) | t; +} + +void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { + result->append(key.user_key.data(), key.user_key.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +std::string ParsedInternalKey::DebugString() const { + char buf[50]; + snprintf(buf, sizeof(buf), "' @ %llu : %d", + (unsigned long long) sequence, + int(type)); + std::string result = "'"; + result += user_key.ToString(); + result += buf; + return result; +} + +const char* InternalKeyComparator::Name() const { + return "leveldb.InternalKeyComparator"; +} + +int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + if (r == 0) { + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +void InternalKeyComparator::FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Attempt to shorten the user portion of the key + Slice user_start = ExtractUserKey(*start); + Slice user_limit = ExtractUserKey(limit); + std::string tmp(user_start.data(), user_start.size()); + user_comparator_->FindShortestSeparator(&tmp, user_limit); + if (user_comparator_->Compare(*start, tmp) < 0) { + // User key has become larger. Tack on the earliest possible + // number to the shortened user key. + PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*start, tmp) < 0); + assert(this->Compare(tmp, limit) < 0); + start->swap(tmp); + } +} + +void InternalKeyComparator::FindShortSuccessor(std::string* key) const { + Slice user_key = ExtractUserKey(*key); + std::string tmp(user_key.data(), user_key.size()); + user_comparator_->FindShortSuccessor(&tmp); + if (user_comparator_->Compare(user_key, tmp) < 0) { + // User key has become larger. Tack on the earliest possible + // number to the shortened user key. 
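+    // (Editor's note) "Earliest possible number" means smallest in
+    // internal-key order: ties on the user key are broken by decreasing
+    // sequence number, so the (kMaxSequenceNumber, kValueTypeForSeek)
+    // trailer sorts before every real entry for the new user key.
+    // dbformat_test.cc checks exactly this:
+    //   ShortSuccessor(IKey("foo", 100, kTypeValue)) ==
+    //   IKey("g", kMaxSequenceNumber, kValueTypeForSeek)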
+ PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*key, tmp) < 0); + key->swap(tmp); + } +} + +LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) { + LargeValueRef result; + port::SHA1_Hash(value.data(), value.size(), &result.data[0]); + EncodeFixed64(&result.data[20], value.size()); + result.data[28] = static_cast(ctype); + return result; +} + +std::string LargeValueRefToFilenameString(const LargeValueRef& h) { + assert(sizeof(h.data) == LargeValueRef::ByteSize()); + assert(sizeof(h.data) == 29); // So we can hardcode the array size of buf + static const char tohex[] = "0123456789abcdef"; + char buf[20*2]; + for (int i = 0; i < 20; i++) { + buf[2*i] = tohex[(h.data[i] >> 4) & 0xf]; + buf[2*i+1] = tohex[h.data[i] & 0xf]; + } + std::string result = std::string(buf, sizeof(buf)); + result += "-"; + result += NumberToString(h.ValueSize()); + result += "-"; + result += NumberToString(static_cast(h.compression_type())); + return result; +} + +static uint32_t hexvalue(char c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } else if (c >= 'A' && c <= 'F') { + return 10 + c - 'A'; + } else { + assert(c >= 'a' && c <= 'f'); + return 10 + c - 'a'; + } +} + +bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) { + Slice in = s; + if (in.size() < 40) { + return false; + } + for (int i = 0; i < 20; i++) { + if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) { + return false; + } + unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]); + h->data[i] = c; + } + in.remove_prefix(40); + uint64_t value_size, ctype; + + if (ConsumeChar(&in, '-') && + ConsumeDecimalNumber(&in, &value_size) && + ConsumeChar(&in, '-') && + ConsumeDecimalNumber(&in, &ctype) && + in.empty() && + (ctype <= kSnappyCompression)) { + EncodeFixed64(&h->data[20], value_size); + h->data[28] = static_cast(ctype); + return true; + } else { + return false; + } +} + +} diff --git a/db/dbformat.h b/db/dbformat.h new file mode 100644 index 0000000..5f117f9 --- /dev/null +++ b/db/dbformat.h @@ -0,0 +1,204 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ +#define STORAGE_LEVELDB_DB_FORMAT_H_ + +#include +#include "leveldb/comparator.h" +#include "leveldb/db.h" +#include "leveldb/slice.h" +#include "leveldb/table_builder.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +// Grouping of constants. We may want to make some of these +// parameters set via options. +namespace config { +static const int kNumLevels = 7; +} + +class InternalKey; + +// Value types encoded as the last component of internal keys. +// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk +// data structures. +enum ValueType { + kTypeDeletion = 0x0, + kTypeValue = 0x1, + kTypeLargeValueRef = 0x2, +}; +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). 
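+// (Editor's note) Concretely: InternalKeyComparator::Compare() treats a
+// larger packed (sequence, type) trailer as sorting earlier, so for a fixed
+// user key and sequence number the highest type byte yields the smallest
+// internal key. kTypeLargeValueRef (0x2) is currently that highest value.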
+static const ValueType kValueTypeForSeek = kTypeLargeValueRef; + +typedef uint64_t SequenceNumber; + +// We leave eight bits empty at the bottom so a type and sequence# +// can be packed together into 64-bits. +static const SequenceNumber kMaxSequenceNumber = + ((0x1ull << 56) - 1); + +struct ParsedInternalKey { + Slice user_key; + SequenceNumber sequence; + ValueType type; + + ParsedInternalKey() { } // Intentionally left uninitialized (for speed) + ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) + : user_key(u), sequence(seq), type(t) { } + std::string DebugString() const; +}; + +// Return the length of the encoding of "key". +inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { + return key.user_key.size() + 8; +} + +// Append the serialization of "key" to *result. +extern void AppendInternalKey(std::string* result, + const ParsedInternalKey& key); + +// Attempt to parse an internal key from "internal_key". On success, +// stores the parsed data in "*result", and returns true. +// +// On error, returns false, leaves "*result" in an undefined state. +extern bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result); + +// Returns the user key portion of an internal key. +inline Slice ExtractUserKey(const Slice& internal_key) { + assert(internal_key.size() >= 8); + return Slice(internal_key.data(), internal_key.size() - 8); +} + +inline ValueType ExtractValueType(const Slice& internal_key) { + assert(internal_key.size() >= 8); + const size_t n = internal_key.size(); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + return static_cast(c); +} + +// A comparator for internal keys that uses a specified comparator for +// the user key portion and breaks ties by decreasing sequence number. +class InternalKeyComparator : public Comparator { + private: + const Comparator* user_comparator_; + public: + explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { } + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const InternalKey& a, const InternalKey& b) const; +}; + +// Modules in this directory should keep internal keys wrapped inside +// the following class instead of plain strings so that we do not +// incorrectly use string comparisons instead of an InternalKeyComparator. +class InternalKey { + private: + std::string rep_; + public: + InternalKey() { } // Leave rep_ as empty to indicate it is invalid + InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + } + + void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } + Slice Encode() const { + assert(!rep_.empty()); + return rep_; + } + + Slice user_key() const { return ExtractUserKey(rep_); } + + void SetFrom(const ParsedInternalKey& p) { + rep_.clear(); + AppendInternalKey(&rep_, p); + } + + void Clear() { rep_.clear(); } +}; + +inline int InternalKeyComparator::Compare( + const InternalKey& a, const InternalKey& b) const { + return Compare(a.Encode(), b.Encode()); +} + +// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte +// uncompressed size, and a 1 byte CompressionType code. 
An +// encoded form of it is embedded in the filenames of large value +// files stored in the database, and the raw binary form is stored as +// the iter->value() result for values of type kTypeLargeValueRef in +// the table and log files that make up the database. +struct LargeValueRef { + char data[29]; + + // Initialize a large value ref for the given data + static LargeValueRef Make(const Slice& data, + CompressionType compression_type); + + // Initialize a large value ref from a serialized, 29-byte reference value + static LargeValueRef FromRef(const Slice& ref) { + LargeValueRef result; + assert(ref.size() == sizeof(result.data)); + memcpy(result.data, ref.data(), sizeof(result.data)); + return result; + } + + // Return the number of bytes in a LargeValueRef (not the + // number of bytes in the value referenced). + static size_t ByteSize() { return sizeof(LargeValueRef().data); } + + // Return the number of bytes in the value referenced by "*this". + uint64_t ValueSize() const { return DecodeFixed64(&data[20]); } + + CompressionType compression_type() const { + return static_cast(data[28]); + } + + bool operator==(const LargeValueRef& b) const { + return memcmp(data, b.data, sizeof(data)) == 0; + } + bool operator<(const LargeValueRef& b) const { + return memcmp(data, b.data, sizeof(data)) < 0; + } +}; + +// Convert the large value ref to a human-readable string suitable +// for embedding in a large value filename. +extern std::string LargeValueRefToFilenameString(const LargeValueRef& h); + +// Parse the large value filename string in "input" and store it in +// "*h". If successful, returns true. Otherwise returns false. +extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref); + +inline bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result) { + const size_t n = internal_key.size(); + if (n < 8) return false; + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + result->sequence = num >> 8; + result->type = static_cast(c); + result->user_key = Slice(internal_key.data(), n - 8); + return (c <= static_cast(kTypeLargeValueRef)); +} + +} + +#endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc new file mode 100644 index 0000000..702cbb4 --- /dev/null +++ b/db/dbformat_test.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
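A note on LargeValueRef, defined just above: it is a fixed 29-byte record whose field boundaries follow from Make(), ValueSize(), and compression_type(). An illustrative layout (the real type is a bare char data[29]):

struct LargeValueRefLayout {
  unsigned char sha1[20];        // bytes 0..19: SHA1 of the value contents
  unsigned char value_size[8];   // bytes 20..27: uncompressed size, fixed64 little-endian
  unsigned char compression;     // byte 28: CompressionType (0 = none, 1 = snappy)
};

Its filename form, exercised by the SHA1 test below, is the 40-hex-character SHA1, then the value size, then the compression type, joined by '-'.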
+ +#include "db/dbformat.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +static std::string IKey(const std::string& user_key, + uint64_t seq, + ValueType vt) { + std::string encoded; + AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); + return encoded; +} + +static std::string Shorten(const std::string& s, const std::string& l) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); + return result; +} + +static std::string ShortSuccessor(const std::string& s) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); + return result; +} + +static void TestKey(const std::string& key, + uint64_t seq, + ValueType vt) { + std::string encoded = IKey(key, seq, vt); + + Slice in(encoded); + ParsedInternalKey decoded("", 0, kTypeValue); + + ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_EQ(key, decoded.user_key.ToString()); + ASSERT_EQ(seq, decoded.sequence); + ASSERT_EQ(vt, decoded.type); + + ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); +} + +class FormatTest { }; + +TEST(FormatTest, InternalKey_EncodeDecode) { + const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; + const uint64_t seq[] = { + 1, 2, 3, + (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, + (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, + (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 + }; + for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { + for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { + TestKey(keys[k], seq[s], kTypeValue); + TestKey("hello", 1, kTypeDeletion); + } + } +} + +TEST(FormatTest, InternalKeyShortSeparator) { + // When user keys are same + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 99, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 101, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeDeletion))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeLargeValueRef))); + + // When user keys are misordered + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("bar", 99, kTypeValue))); + + // When user keys are different, but correctly ordered + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("foo", 100, kTypeValue), + IKey("hello", 200, kTypeValue))); + + // When start user key is prefix of limit user key + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foobar", 200, kTypeValue))); + + // When limit user key is prefix of start user key + ASSERT_EQ(IKey("foobar", 100, kTypeValue), + Shorten(IKey("foobar", 100, kTypeValue), + IKey("foo", 200, kTypeValue))); +} + +TEST(FormatTest, InternalKeyShortestSuccessor) { + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + ShortSuccessor(IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), + ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); +} + +TEST(FormatTest, SHA1) { + // Check that we are computing the same value as sha1. + // Note that the last two numbers are the length of the input and the + // compression type. 
+ ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0", // SHA1, uncompr + LargeValueRefToFilenameString( + LargeValueRef::Make("hello", kNoCompression))); + ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, lwcompr + LargeValueRefToFilenameString( + LargeValueRef::Make("hello", kSnappyCompression))); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/filename.cc b/db/filename.cc new file mode 100644 index 0000000..d21918c --- /dev/null +++ b/db/filename.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include "db/filename.h" +#include "db/dbformat.h" +#include "leveldb/env.h" +#include "util/logging.h" + +namespace leveldb { + +static std::string MakeFileName(const std::string& name, uint64_t number, + const char* suffix) { + char buf[100]; + snprintf(buf, sizeof(buf), "/%06llu.%s", + static_cast(number), + suffix); + return name + buf; +} + +std::string LogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "log"); +} + +std::string TableFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "sst"); +} + +std::string LargeValueFileName(const std::string& name, + const LargeValueRef& large_ref) { + std::string result = name + "/"; + result += LargeValueRefToFilenameString(large_ref); + result += ".val"; + return result; +} + +std::string DescriptorFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + char buf[100]; + snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", + static_cast(number)); + return dbname + buf; +} + +std::string CurrentFileName(const std::string& dbname) { + return dbname + "/CURRENT"; +} + +std::string LockFileName(const std::string& dbname) { + return dbname + "/LOCK"; +} + +std::string TempFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + return MakeFileName(dbname, number, "dbtmp"); +} + +std::string InfoLogFileName(const std::string& dbname) { + return dbname + "/LOG"; +} + +// Return the name of the old info log file for "dbname". 
+std::string OldInfoLogFileName(const std::string& dbname) { + return dbname + "/LOG.old"; +} + + +// Owned filenames have the form: +// dbname/CURRENT +// dbname/LOCK +// dbname/LOG +// dbname/LOG.old +// dbname/MANIFEST-[0-9]+ +// dbname/[0-9a-f]{20}-[0-9]+-[0-9]+.val +// dbname/[0-9]+.(log|sst) +bool ParseFileName(const std::string& fname, + uint64_t* number, + LargeValueRef* large_ref, + FileType* type) { + Slice rest(fname); + if (rest == "CURRENT") { + *number = 0; + *type = kCurrentFile; + } else if (rest == "LOCK") { + *number = 0; + *type = kDBLockFile; + } else if (rest == "LOG" || rest == "LOG.old") { + *number = 0; + *type = kInfoLogFile; + } else if (rest.size() >= 4 && + Slice(rest.data() + rest.size() - 4, 4) == ".val") { + LargeValueRef h; + if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4), + &h)) { + return false; + } + *large_ref = h; + *type = kLargeValueFile; + } else if (rest.starts_with("MANIFEST-")) { + rest.remove_prefix(strlen("MANIFEST-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kDescriptorFile; + *number = num; + } else { + // Avoid strtoull() to keep filename format independent of the + // current locale + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + Slice suffix = rest; + if (suffix == Slice(".log")) { + *type = kLogFile; + } else if (suffix == Slice(".sst")) { + *type = kTableFile; + } else if (suffix == Slice(".dbtmp")) { + *type = kTempFile; + } else { + return false; + } + *number = num; + } + return true; +} + +Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number) { + // Remove leading "dbname/" and add newline to manifest file name + std::string manifest = DescriptorFileName(dbname, descriptor_number); + Slice contents = manifest; + assert(contents.starts_with(dbname + "/")); + contents.remove_prefix(dbname.size() + 1); + std::string tmp = TempFileName(dbname, descriptor_number); + Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp); + if (s.ok()) { + s = env->RenameFile(tmp, CurrentFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +} diff --git a/db/filename.h b/db/filename.h new file mode 100644 index 0000000..81ab2fc --- /dev/null +++ b/db/filename.h @@ -0,0 +1,92 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// File names used by DB code + +#ifndef STORAGE_LEVELDB_DB_FILENAME_H_ +#define STORAGE_LEVELDB_DB_FILENAME_H_ + +#include +#include +#include "leveldb/slice.h" +#include "leveldb/status.h" +#include "port/port.h" + +namespace leveldb { + +class Env; +struct LargeValueRef; + +enum FileType { + kLogFile, + kDBLockFile, + kTableFile, + kLargeValueFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one +}; + +// Return the name of the log file with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string LogFileName(const std::string& dbname, uint64_t number); + +// Return the name of the sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". 
+extern std::string TableFileName(const std::string& dbname, uint64_t number); + +// Return the name of the large value file with the specified large +// value reference in the db named by "dbname". The result will be +// prefixed with "dbname". +extern std::string LargeValueFileName(const std::string& dbname, + const LargeValueRef& large_ref); + +// Return the name of the descriptor file for the db named by +// "dbname" and the specified incarnation number. The result will be +// prefixed with "dbname". +extern std::string DescriptorFileName(const std::string& dbname, + uint64_t number); + +// Return the name of the current file. This file contains the name +// of the current manifest file. The result will be prefixed with +// "dbname". +extern std::string CurrentFileName(const std::string& dbname); + +// Return the name of the lock file for the db named by +// "dbname". The result will be prefixed with "dbname". +extern std::string LockFileName(const std::string& dbname); + +// Return the name of a temporary file owned by the db named "dbname". +// The result will be prefixed with "dbname". +extern std::string TempFileName(const std::string& dbname, uint64_t number); + +// Return the name of the info log file for "dbname". +extern std::string InfoLogFileName(const std::string& dbname); + +// Return the name of the old info log file for "dbname". +extern std::string OldInfoLogFileName(const std::string& dbname); + +// If filename is a leveldb file, store the type of the file in *type. +// If *type is kLargeValueFile, then the large value reference data +// from the filename is stored in "*large_ref. For all other types of +// files, the number encoded in the filename is stored in *number. If +// the filename was successfully parsed, returns true. Else return +// false. +extern bool ParseFileName(const std::string& filename, + uint64_t* number, + LargeValueRef* large_ref, + FileType* type); + +// Make the CURRENT file point to the descriptor file with the +// specified number. +extern Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number); + + +} + +#endif // STORAGE_LEVELDB_DB_FILENAME_H_ diff --git a/db/filename_test.cc b/db/filename_test.cc new file mode 100644 index 0000000..4d2a91e --- /dev/null +++ b/db/filename_test.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
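A minimal usage sketch of the naming API declared above, ahead of the tests that exercise it. ParseFileName() expects a bare filename (the tests below strip the prefix the same way, via fname.c_str() + 4), so the "dbname/" prefix is removed first:

#include <assert.h>
#include "db/filename.h"

static void FilenameRoundTrip() {
  std::string f = leveldb::LogFileName("/tmp/testdb", 7);  // "/tmp/testdb/000007.log"
  uint64_t number;
  leveldb::LargeValueRef large_ref;
  leveldb::FileType type;
  bool ok = leveldb::ParseFileName(f.substr(f.rfind('/') + 1),
                                   &number, &large_ref, &type);
  assert(ok && number == 7 && type == leveldb::kLogFile);
}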
+ +#include "db/filename.h" + +#include "db/dbformat.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +class FileNameTest { }; + +TEST(FileNameTest, Parse) { + Slice db; + FileType type; + uint64_t number; + LargeValueRef large_ref; + + // Successful parses + static struct { + const char* fname; + uint64_t number; + const char* large_ref; + FileType type; + } cases[] = { + { "100.log", 100, "", kLogFile }, + { "0.log", 0, "", kLogFile }, + { "0.sst", 0, "", kTableFile }, + { "CURRENT", 0, "", kCurrentFile }, + { "LOCK", 0, "", kDBLockFile }, + { "MANIFEST-2", 2, "", kDescriptorFile }, + { "MANIFEST-7", 7, "", kDescriptorFile }, + { "LOG", 0, "", kInfoLogFile }, + { "LOG.old", 0, "", kInfoLogFile }, + { "18446744073709551615.log", 18446744073709551615ull, "", + kLogFile }, + { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0, + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile }, + { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0, + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0", + kLargeValueFile }, + }; + for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { + std::string f = cases[i].fname; + ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f; + ASSERT_EQ(cases[i].type, type) << f; + if (type == kLargeValueFile) { + ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref)) + << f; + } else { + ASSERT_EQ(cases[i].number, number) << f; + } + } + + // Errors + static const char* errors[] = { + "", + "foo", + "foo-dx-100.log", + ".log", + "", + "manifest", + "CURREN", + "CURRENTX", + "MANIFES", + "MANIFEST", + "MANIFEST-", + "XMANIFEST-3", + "MANIFEST-3x", + "LOC", + "LOCKx", + "LO", + "LOGx", + "18446744073709551616.log", + "184467440737095516150.log", + "100", + "100.", + "100.lop", + "100.val", + ".val", + "123456789012345678901234567890123456789-12340.val", + "1234567890123456789012345678901234567-123-0.val", + "12345678901234567890123456789012345678902-100-1-.val", + // Overflow on value size + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val", + // '03.val' is a bad compression type + "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" }; + for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { + std::string f = errors[i]; + ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f; + }; +} + +TEST(FileNameTest, Construction) { + uint64_t number; + FileType type; + LargeValueRef large_ref; + std::string fname; + + fname = CurrentFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(0, number); + ASSERT_EQ(kCurrentFile, type); + + fname = LockFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(0, number); + ASSERT_EQ(kDBLockFile, type); + + fname = LogFileName("foo", 192); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(192, number); + ASSERT_EQ(kLogFile, type); + + fname = TableFileName("bar", 200); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_EQ(200, number); + ASSERT_EQ(kTableFile, type); + + fname = DescriptorFileName("bar", 100); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + 
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+  ASSERT_EQ(100, number);
+  ASSERT_EQ(kDescriptorFile, type);
+
+  fname = TempFileName("tmp", 999);
+  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+  ASSERT_EQ(999, number);
+  ASSERT_EQ(kTempFile, type);
+
+  for (int i = 0; i <= kSnappyCompression; i++) {
+    CompressionType ctype = static_cast<CompressionType>(i);
+    std::string value = "abcdef";
+    LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype);
+    fname = LargeValueFileName("tmp", real_large_ref);
+    ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+    ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+    ASSERT_TRUE(real_large_ref == large_ref);
+    ASSERT_EQ(kLargeValueFile, type);
+    ASSERT_EQ(large_ref.compression_type(), ctype);
+  }
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/db/log_format.h b/db/log_format.h
new file mode 100644
index 0000000..137cd4a
--- /dev/null
+++ b/db/log_format.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+
+namespace leveldb {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+
+  kFullType = 1,
+
+  // For fragments
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4,
+};
+static const int kMaxRecordType = kLastType;
+
+static const int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
+static const int kHeaderSize = 4 + 2 + 1;
+
+}
+}
+
+#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_
diff --git a/db/log_reader.cc b/db/log_reader.cc
new file mode 100644
index 0000000..75e1d28
--- /dev/null
+++ b/db/log_reader.cc
@@ -0,0 +1,176 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
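Before the reader implementation, a sketch of the physical layout that kHeaderSize describes: bytes 0-3 hold the masked crc32c, bytes 4-5 the little-endian payload length, and byte 6 the RecordType. The helper names here are illustrative, not part of the checkin; they mirror the byte manipulation done by the reader and writer below:

    #include <assert.h>
    #include <stdint.h>

    static void EncodeLength(char* header, uint16_t n) {
      header[4] = static_cast<char>(n & 0xff);  // low byte
      header[5] = static_cast<char>(n >> 8);    // high byte
    }

    static uint16_t DecodeLength(const char* header) {
      const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
      const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
      return static_cast<uint16_t>(a | (b << 8));
    }

    int main() {
      char header[7] = { 0 };  // crc(4) | length(2) | type(1)
      EncodeLength(header, 0x1234);
      assert(DecodeLength(header) == 0x1234);
      return 0;
    }

The two-byte length field is why EmitPhysicalRecord() asserts n <= 0xffff, and the little-endian order matches the fixed-width encodings in util/coding.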
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+#include "leveldb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+Reader::Reporter::~Reporter() {
+}
+
+Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum)
+    : file_(file),
+      reporter_(reporter),
+      checksum_(checksum),
+      backing_store_(new char[kBlockSize]),
+      buffer_(),
+      eof_(false) {
+}
+
+Reader::~Reader() {
+  delete[] backing_store_;
+}
+
+bool Reader::ReadRecord(Slice* record, std::string* scratch) {
+  scratch->clear();
+  record->clear();
+  bool in_fragmented_record = false;
+
+  Slice fragment;
+  while (true) {
+    switch (ReadPhysicalRecord(&fragment)) {
+      case kFullType:
+        if (in_fragmented_record) {
+          ReportDrop(scratch->size(), "partial record without end");
+        }
+        scratch->clear();
+        *record = fragment;
+        return true;
+
+      case kFirstType:
+        if (in_fragmented_record) {
+          ReportDrop(scratch->size(), "partial record without end");
+        }
+        scratch->assign(fragment.data(), fragment.size());
+        in_fragmented_record = true;
+        break;
+
+      case kMiddleType:
+        if (!in_fragmented_record) {
+          ReportDrop(fragment.size(), "missing start of fragmented record");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+        if (!in_fragmented_record) {
+          ReportDrop(fragment.size(), "missing start of fragmented record");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+          *record = Slice(*scratch);
+          return true;
+        }
+        break;
+
+      case kEof:
+        if (in_fragmented_record) {
+          ReportDrop(scratch->size(), "partial record without end");
+          scratch->clear();
+        }
+        return false;
+
+      case kBadRecord:
+        if (in_fragmented_record) {
+          ReportDrop(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      default:
+        ReportDrop(
+            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+            "unknown record type");
+        in_fragmented_record = false;
+        scratch->clear();
+        break;
+    }
+  }
+  return false;
+}
+
+void Reader::ReportDrop(size_t bytes, const char* reason) {
+  if (reporter_ != NULL) {
+    reporter_->Corruption(bytes, Status::Corruption(reason));
+  }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result) {
+  while (true) {
+    if (buffer_.size() < kHeaderSize) {
+      if (!eof_) {
+        // Last read was a full read, so this is a trailer to skip
+        buffer_.clear();
+        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+        if (!status.ok()) {
+          if (reporter_ != NULL) {
+            reporter_->Corruption(kBlockSize, status);
+          }
+          buffer_.clear();
+          eof_ = true;
+          return kEof;
+        } else if (buffer_.size() < kBlockSize) {
+          eof_ = true;
+        }
+        continue;
+      } else if (buffer_.size() == 0) {
+        // End of file
+        return kEof;
+      } else {
+        ReportDrop(buffer_.size(), "truncated record at end of file");
+        buffer_.clear();
+        return kEof;
+      }
+    }
+
+    // Parse the header
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    if (kHeaderSize + length > buffer_.size()) {
+      ReportDrop(buffer_.size(), "bad record length");
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Check crc
+    if (checksum_) {
+      if (type == kZeroType && length == 0) {
+        // Skip zero length record without reporting any drops since
+        // such records are produced by the mmap based writing code in
+        // env_posix.cc that preallocates file regions.
+ buffer_.clear(); + return kBadRecord; + } + + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); + if (actual_crc != expected_crc) { + // Drop the rest of the buffer since "length" itself may have + // been corrupted and if we trust it, we could find some + // fragment of a real log record that just happens to look + // like a valid log record. + ReportDrop(buffer_.size(), "checksum mismatch"); + buffer_.clear(); + return kBadRecord; + } + } + + buffer_.remove_prefix(kHeaderSize + length); + *result = Slice(header + kHeaderSize, length); + return type; + } +} + +} +} diff --git a/db/log_reader.h b/db/log_reader.h new file mode 100644 index 0000000..baf1475 --- /dev/null +++ b/db/log_reader.h @@ -0,0 +1,75 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ +#define STORAGE_LEVELDB_DB_LOG_READER_H_ + +#include "db/log_format.h" +#include "leveldb/slice.h" +#include "leveldb/status.h" + +namespace leveldb { + +class SequentialFile; + +namespace log { + +class Reader { + public: + // Interface for reporting errors. + class Reporter { + public: + virtual ~Reporter(); + + // Some corruption was detected. "size" is the approximate number + // of bytes dropped due to the corruption. + virtual void Corruption(size_t bytes, const Status& status) = 0; + }; + + // Create a reader that will return log records from "*file". + // "*file" must remain live while this Reader is in use. + // + // If "reporter" is non-NULL, it is notified whenever some data is + // dropped due to a detected corruption. "*reporter" must remain + // live while this Reader is in use. + // + // If "checksum" is true, verify checksums if available. + Reader(SequentialFile* file, Reporter* reporter, bool checksum); + + ~Reader(); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. May use + // "*scratch" as temporary storage. The contents filled in *record + // will only be valid until the next mutating operation on this + // reader or the next mutation to *scratch. + bool ReadRecord(Slice* record, std::string* scratch); + + private: + SequentialFile* const file_; + Reporter* const reporter_; + bool const checksum_; + char* const backing_store_; + Slice buffer_; + bool eof_; // Last Read() indicated EOF by returning < kBlockSize + + // Extend record types with the following special values + enum { + kEof = kMaxRecordType + 1, + kBadRecord = kMaxRecordType + 2 + }; + + // Return type, or one of the preceding special values + unsigned int ReadPhysicalRecord(Slice* result); + void ReportDrop(size_t bytes, const char* reason); + + // No copying allowed + Reader(const Reader&); + void operator=(const Reader&); +}; + +} +} + +#endif // STORAGE_LEVELDB_DB_LOG_READER_H_ diff --git a/db/log_test.cc b/db/log_test.cc new file mode 100644 index 0000000..025a5ff --- /dev/null +++ b/db/log_test.cc @@ -0,0 +1,361 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
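Before the tests, a sketch of a typical consumer of the Reader interface above. The no-op reporter and the record handling (just counting) are illustrative; a real caller would log or tally the dropped bytes:

    #include <stddef.h>
    #include <string>
    #include "db/log_reader.h"
    #include "leveldb/env.h"

    namespace {
    // Reporter that deliberately ignores corruption notifications.
    class NullReporter : public leveldb::log::Reader::Reporter {
     public:
      virtual void Corruption(size_t bytes, const leveldb::Status& status) { }
    };
    }

    // Reads every record from "file" and returns how many there were.
    static int CountRecords(leveldb::SequentialFile* file) {
      NullReporter reporter;
      leveldb::log::Reader reader(file, &reporter, true/*checksum*/);
      std::string scratch;
      leveldb::Slice record;
      int n = 0;
      while (reader.ReadRecord(&record, &scratch)) {
        n++;  // "record" stays valid only until the next ReadRecord() call
      }
      return n;
    }

Note the lifetime rule from the header: both *file and the reporter must outlive the Reader, which is why they are passed as raw pointers and never owned.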
+ +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "leveldb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace leveldb { +namespace log { + +// Construct a string of the specified length made out of the supplied +// partial string. +static std::string BigString(const std::string& partial_string, size_t n) { + std::string result; + while (result.size() < n) { + result.append(partial_string); + } + result.resize(n); + return result; +} + +// Construct a string from a number +static std::string NumberString(int n) { + char buf[50]; + snprintf(buf, sizeof(buf), "%d.", n); + return std::string(buf); +} + +// Return a skewed potentially long string +static std::string RandomSkewedString(int i, Random* rnd) { + return BigString(NumberString(i), rnd->Skewed(17)); +} + +class LogTest { + private: + class StringDest : public WritableFile { + public: + std::string contents_; + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + virtual Status Append(const Slice& slice) { + contents_.append(slice.data(), slice.size()); + return Status::OK(); + } + }; + + class StringSource : public SequentialFile { + public: + Slice contents_; + bool force_error_; + bool returned_partial_; + StringSource() : force_error_(false), returned_partial_(false) { } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; + ASSERT_EQ(kBlockSize, n); + + if (force_error_) { + force_error_ = false; + returned_partial_ = true; + return Status::Corruption("read error"); + } + + if (contents_.size() < n) { + n = contents_.size(); + returned_partial_ = true; + } + *result = Slice(contents_.data(), n); + contents_.remove_prefix(n); + return Status::OK(); + } + }; + + class ReportCollector : public Reader::Reporter { + public: + size_t dropped_bytes_; + std::string message_; + + ReportCollector() : dropped_bytes_(0) { } + virtual void Corruption(size_t bytes, const Status& status) { + dropped_bytes_ += bytes; + message_.append(status.ToString()); + } + }; + + StringDest dest_; + StringSource source_; + ReportCollector report_; + bool reading_; + Writer writer_; + Reader reader_; + + public: + LogTest() : reading_(false), + writer_(&dest_), + reader_(&source_, &report_, true/*checksum*/) { + } + + void Write(const std::string& msg) { + ASSERT_TRUE(!reading_) << "Write() after starting to read"; + writer_.AddRecord(Slice(msg)); + } + + size_t WrittenBytes() const { + return dest_.contents_.size(); + } + + std::string Read() { + if (!reading_) { + reading_ = true; + source_.contents_ = Slice(dest_.contents_); + } + std::string scratch; + Slice record; + if (reader_.ReadRecord(&record, &scratch)) { + return record.ToString(); + } else { + return "EOF"; + } + } + + void IncrementByte(int offset, int delta) { + dest_.contents_[offset] += delta; + } + + void SetByte(int offset, char new_byte) { + dest_.contents_[offset] = new_byte; + } + + void ShrinkSize(int bytes) { + dest_.contents_.resize(dest_.contents_.size() - bytes); + } + + void FixChecksum(int header_offset, int len) { + // Compute crc of type/len/data + uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len); + crc = crc32c::Mask(crc); + EncodeFixed32(&dest_.contents_[header_offset], crc); + } + + void ForceError() { + source_.force_error_ = true; + } + + size_t DroppedBytes() const { + return 
report_.dropped_bytes_; + } + + // Returns OK iff recorded error message contains "msg" + std::string MatchError(const std::string& msg) const { + if (report_.message_.find(msg) == std::string::npos) { + return report_.message_; + } else { + return "OK"; + } + } +}; + +TEST(LogTest, Empty) { + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ReadWrite) { + Write("foo"); + Write("bar"); + Write(""); + Write("xxxx"); + ASSERT_EQ("foo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("xxxx", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + +TEST(LogTest, ManyBlocks) { + for (int i = 0; i < 100000; i++) { + Write(NumberString(i)); + } + for (int i = 0; i < 100000; i++) { + ASSERT_EQ(NumberString(i), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, Fragmentation) { + Write("small"); + Write(BigString("medium", 50000)); + Write(BigString("large", 100000)); + ASSERT_EQ("small", Read()); + ASSERT_EQ(BigString("medium", 50000), Read()); + ASSERT_EQ(BigString("large", 100000), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, MarginalTrailer) { + // Make a trailer that is exactly the same length as an empty record. + const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ShortTrailer) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, AlignedEof) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, RandomRead) { + const int N = 500; + Random write_rnd(301); + for (int i = 0; i < N; i++) { + Write(RandomSkewedString(i, &write_rnd)); + } + Random read_rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +// Tests of all the error paths in log_reader.cc follow: + +TEST(LogTest, ReadError) { + Write("foo"); + ForceError(); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +TEST(LogTest, BadRecordType) { + Write("foo"); + // Type is stored in header[6] + IncrementByte(6, 100); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("unknown record type")); +} + +TEST(LogTest, TruncatedTrailingRecord) { + Write("foo"); + ShrinkSize(4); // Drop all payload as well as a header byte + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(kHeaderSize - 1, DroppedBytes()); + ASSERT_EQ("OK", MatchError("truncated record at end of file")); +} + +TEST(LogTest, BadLength) { + Write("foo"); + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(kHeaderSize + 2, DroppedBytes()); + ASSERT_EQ("OK", MatchError("bad record length")); +} + +TEST(LogTest, ChecksumMismatch) { + Write("foo"); + IncrementByte(0, 10); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(10, DroppedBytes()); + ASSERT_EQ("OK", MatchError("checksum mismatch")); +} + +TEST(LogTest, UnexpectedMiddleType) { + 
Write("foo"); + SetByte(6, kMiddleType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedLastType) { + Write("foo"); + SetByte(6, kLastType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedFullType) { + Write("foo"); + Write("bar"); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, UnexpectedFirstType) { + Write("foo"); + Write(BigString("bar", 100000)); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ(BigString("bar", 100000), Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, ErrorJoinsRecords) { + // Consider two fragmented records: + // first(R1) last(R1) first(R2) last(R2) + // where the middle two fragments disappear. We do not want + // first(R1),last(R2) to get joined and returned as a valid record. + + // Write records that span two blocks + Write(BigString("foo", kBlockSize)); + Write(BigString("bar", kBlockSize)); + Write("correct"); + + // Wipe the middle block + for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) { + SetByte(offset, 'x'); + } + + ASSERT_EQ("correct", Read()); + ASSERT_EQ("EOF", Read()); + const int dropped = DroppedBytes(); + ASSERT_LE(dropped, 2*kBlockSize + 100); + ASSERT_GE(dropped, 2*kBlockSize); +} + +} +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/log_writer.cc b/db/log_writer.cc new file mode 100644 index 0000000..18ca37a --- /dev/null +++ b/db/log_writer.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_writer.h" + +#include +#include "leveldb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace leveldb { +namespace log { + +Writer::Writer(WritableFile* dest) + : dest_(dest), + block_offset_(0) { + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast(i); + type_crc_[i] = crc32c::Value(&t, 1); + } +} + +Writer::~Writer() { +} + +Status Writer::AddRecord(const Slice& slice) { + const char* ptr = slice.data(); + size_t left = slice.size(); + + // Fragment the record if necessary and emit it. Note that if slice + // is empty, we still want to iterate once to emit a single + // zero-length record + Status s; + do { + const int leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < kHeaderSize) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize being 7) + assert(kHeaderSize == 7); + dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); + } + block_offset_ = 0; + } + + // Invariant: we never leave < kHeaderSize bytes in a block. + const int avail = kBlockSize - block_offset_ - kHeaderSize; + assert(avail >= 0); + + const size_t fragment_length = (left < avail) ? 
+
+    RecordType type;
+    const bool begin = (ptr == slice.data());
+    const bool end = (left == fragment_length);
+    if (begin && end) {
+      type = kFullType;
+    } else if (begin) {
+      type = kFirstType;
+    } else if (end) {
+      type = kLastType;
+    } else {
+      type = kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length);
+    ptr += fragment_length;
+    left -= fragment_length;
+  } while (s.ok() && left > 0);
+  return s;
+}
+
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+  assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+
+  // Format the header
+  char buf[kHeaderSize];
+  buf[4] = static_cast<char>(n & 0xff);
+  buf[5] = static_cast<char>(n >> 8);
+  buf[6] = static_cast<char>(t);
+
+  // Compute the crc of the record type and the payload.
+  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
+  crc = crc32c::Mask(crc);  // Adjust for storage
+  EncodeFixed32(buf, crc);
+
+  // Write the header and the payload
+  Status s = dest_->Append(Slice(buf, kHeaderSize));
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n));
+    if (s.ok()) {
+      s = dest_->Flush();
+    }
+  }
+  block_offset_ += kHeaderSize + n;
+  return s;
+}
+
+}
+}
diff --git a/db/log_writer.h b/db/log_writer.h
new file mode 100644
index 0000000..d3cf27d
--- /dev/null
+++ b/db/log_writer.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
+#define STORAGE_LEVELDB_DB_LOG_WRITER_H_
+
+#include <stdint.h>
+#include "db/log_format.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class WritableFile;
+
+namespace log {
+
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(WritableFile* dest);
+  ~Writer();
+
+  Status AddRecord(const Slice& slice);
+
+ private:
+  WritableFile* dest_;
+  int block_offset_;  // Current offset in block
+
+  // crc32c values for all supported record types.  These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+  // No copying allowed
+  Writer(const Writer&);
+  void operator=(const Writer&);
+};
+
+}
+}
+
+#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_
diff --git a/db/memtable.cc b/db/memtable.cc
new file mode 100644
index 0000000..a3b618a
--- /dev/null
+++ b/db/memtable.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
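A quick arithmetic companion to AddRecord() above before moving on: the same fragmentation loop, reduced to counting how many physical records a payload produces. The helper is illustrative only, but the constants and the block/trailer handling mirror the writer exactly:

    #include <assert.h>
    #include <stddef.h>

    static const int kBlockSize = 32768;
    static const int kHeaderSize = 7;

    // Counts the physical records AddRecord() would emit for a payload of
    // "left" bytes starting at "block_offset" within the current block.
    static int CountFragments(size_t left, int block_offset) {
      int fragments = 0;
      do {
        if (kBlockSize - block_offset < kHeaderSize) {
          block_offset = 0;  // trailer filled, switch to a new block
        }
        const size_t avail = kBlockSize - block_offset - kHeaderSize;
        const size_t n = (left < avail) ? left : avail;
        fragments++;
        left -= n;
        block_offset += kHeaderSize + n;
      } while (left > 0);
      return fragments;
    }

    int main() {
      assert(CountFragments(0, 0) == 1);       // empty record: one kFullType
      assert(CountFragments(50000, 0) == 2);   // kFirstType + kLastType
      assert(CountFragments(100000, 0) == 4);  // first + 2 middles + last
      return 0;
    }

These counts line up with the Fragmentation and MarginalTrailer tests above: each block yields at most kBlockSize - kHeaderSize payload bytes, and even an empty record costs a full 7-byte header.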
+ +#include "db/memtable.h" +#include "db/dbformat.h" +#include "leveldb/comparator.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "util/coding.h" + +namespace leveldb { + +static Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len; + const char* p = data; + p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted + return Slice(p, len); +} + +MemTable::MemTable(const InternalKeyComparator& cmp) + : comparator_(cmp), + table_(comparator_, &arena_) { +} + +MemTable::~MemTable() { +} + +size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); } + +int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) + const { + // Internal keys are encoded as length-prefixed strings. + Slice a = GetLengthPrefixedSlice(aptr); + Slice b = GetLengthPrefixedSlice(bptr); + return comparator.Compare(a, b); +} + +// Encode a suitable internal key target for "target" and return it. +// Uses *scratch as scratch space, and the returned pointer will point +// into this scratch space. +static const char* EncodeKey(std::string* scratch, const Slice& target) { + scratch->clear(); + PutVarint32(scratch, target.size()); + scratch->append(target.data(), target.size()); + return scratch->data(); +} + +class MemTableIterator: public Iterator { + public: + explicit MemTableIterator(MemTable::Table* table) { + iter_ = new MemTable::Table::Iterator(table); + } + virtual ~MemTableIterator() { delete iter_; } + + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); } + virtual Slice value() const { + Slice key_slice = GetLengthPrefixedSlice(iter_->key()); + return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + } + + virtual Status status() const { return Status::OK(); } + + private: + MemTable::Table::Iterator* iter_; + std::string tmp_; // For passing to EncodeKey + + // No copying allowed + MemTableIterator(const MemTableIterator&); + void operator=(const MemTableIterator&); +}; + +Iterator* MemTable::NewIterator() { + return new MemTableIterator(&table_); +} + +void MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, + const Slice& value) { + // Format of an entry is concatenation of: + // key_size : varint32 of internal_key.size() + // key bytes : char[internal_key.size()] + // value_size : varint32 of value.size() + // value bytes : char[value.size()] + size_t key_size = key.size(); + size_t val_size = value.size(); + size_t internal_key_size = key_size + 8; + const size_t encoded_len = + VarintLength(internal_key_size) + internal_key_size + + VarintLength(val_size) + val_size; + char* buf = arena_.Allocate(encoded_len); + char* p = EncodeVarint32(buf, internal_key_size); + memcpy(p, key.data(), key_size); + p += key_size; + EncodeFixed64(p, (s << 8) | type); + p += 8; + p = EncodeVarint32(p, val_size); + memcpy(p, value.data(), val_size); + assert((p + val_size) - buf == encoded_len); + table_.Insert(buf); +} + +} diff --git a/db/memtable.h b/db/memtable.h new file mode 100644 index 0000000..45b3342 --- /dev/null +++ b/db/memtable.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
+#define STORAGE_LEVELDB_DB_MEMTABLE_H_
+
+#include <string>
+#include "leveldb/db.h"
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "util/arena.h"
+
+namespace leveldb {
+
+class InternalKeyComparator;
+class Mutex;
+class MemTableIterator;
+
+class MemTable {
+ public:
+  explicit MemTable(const InternalKeyComparator& comparator);
+  ~MemTable();
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  size_t ApproximateMemoryUsage();
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.  The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/dbformat.{h,cc} module.
+  Iterator* NewIterator();
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  void Add(SequenceNumber seq, ValueType type,
+           const Slice& key,
+           const Slice& value);
+
+ private:
+  struct KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+    int operator()(const char* a, const char* b) const;
+  };
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+
+  typedef SkipList<const char*, KeyComparator> Table;
+
+  KeyComparator comparator_;
+  Arena arena_;
+  Table table_;
+
+  // No copying allowed
+  MemTable(const MemTable&);
+  void operator=(const MemTable&);
+};
+
+}
+
+#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_
diff --git a/db/repair.cc b/db/repair.cc
new file mode 100644
index 0000000..014e00e
--- /dev/null
+++ b/db/repair.cc
@@ -0,0 +1,396 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+//     (a) smallest/largest for the table
+//     (b) large value refs from the table
+//     (c) largest sequence number in the table
+// (3) We generate descriptor contents:
+//      - log number is set to zero
+//      - next-file-number is set to 1 + largest file number we found
+//      - last-sequence-number is set to largest sequence# found across
+//        all tables (see 2c)
+//      - compaction pointers are cleared
+//      - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use to pick appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps earlier table, place in level-0,
+//       else place in level-M.
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#,
+//   large-value-refs, ...) in the table's meta section to speed up
+//   ScanTable.
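Before the Repairer code, one encoding detail worth making concrete, referring back to MemTable::Add() in memtable.cc above: each entry is laid out as varint32(key_size + 8) | key bytes | fixed64((sequence << 8) | type) | varint32(value_size) | value bytes. A sketch of the resulting size computation; EncodedEntryLength is illustrative, and VarintLength mirrors the helper of the same name in util/coding:

    #include <stddef.h>
    #include <stdint.h>

    // Bytes needed to encode "v" as a varint (7 payload bits per byte).
    static size_t VarintLength(uint64_t v) {
      size_t len = 1;
      while (v >= 128) {
        v >>= 7;
        len++;
      }
      return len;
    }

    // Total bytes MemTable::Add() allocates from the arena for one entry.
    static size_t EncodedEntryLength(size_t key_size, size_t value_size) {
      const size_t internal_key_size = key_size + 8;  // user key + seq/type tag
      return VarintLength(internal_key_size) + internal_key_size +
             VarintLength(value_size) + value_size;
    }

For example, a 3-byte key with a 6-byte value needs 1 + 11 + 1 + 6 = 19 bytes; the 8 extra key bytes are the packed sequence number and ValueType that make the key an internal key.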
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+
+namespace leveldb {
+
+namespace {
+
+class Repairer {
+ public:
+  Repairer(const std::string& dbname, const Options& options)
+      : dbname_(dbname),
+        env_(options.env),
+        icmp_(options.comparator),
+        options_(SanitizeOptions(dbname, &icmp_, options)),
+        owns_info_log_(options_.info_log != options.info_log),
+        next_file_number_(1) {
+    // TableCache can be small since we expect each table to be opened once.
+    table_cache_ = new TableCache(dbname_, &options_, 10);
+  }
+
+  ~Repairer() {
+    delete table_cache_;
+    if (owns_info_log_) {
+      delete options_.info_log;
+    }
+  }
+
+  Status Run() {
+    Status status = FindFiles();
+    if (status.ok()) {
+      ConvertLogFilesToTables();
+      ExtractMetaData();
+      status = WriteDescriptor();
+    }
+    if (status.ok()) {
+      unsigned long long bytes = 0;
+      for (int i = 0; i < tables_.size(); i++) {
+        bytes += tables_[i].meta.file_size;
+      }
+      Log(env_, options_.info_log,
+          "**** Repaired leveldb %s; "
+          "recovered %d files; %llu bytes. "
+          "Some data may have been lost. "
+          "****",
+          dbname_.c_str(),
+          static_cast<int>(tables_.size()),
+          bytes);
+    }
+    return status;
+  }
+
+ private:
+  struct TableInfo {
+    FileMetaData meta;
+    SequenceNumber max_sequence;
+  };
+
+  std::string const dbname_;
+  Env* const env_;
+  InternalKeyComparator const icmp_;
+  Options const options_;
+  bool owns_info_log_;
+  TableCache* table_cache_;
+  VersionEdit edit_;
+
+  std::vector<std::string> manifests_;
+  std::vector<uint64_t> table_numbers_;
+  std::vector<uint64_t> logs_;
+  std::vector<TableInfo> tables_;
+  uint64_t next_file_number_;
+
+  Status FindFiles() {
+    std::vector<std::string> filenames;
+    Status status = env_->GetChildren(dbname_, &filenames);
+    if (!status.ok()) {
+      return status;
+    }
+    if (filenames.empty()) {
+      return Status::IOError(dbname_, "repair found no files");
+    }
+
+    uint64_t number;
+    LargeValueRef large_ref;
+    FileType type;
+    for (int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
+        if (type == kLargeValueFile) {
+          // Will be picked up when we process a Table that points to it
+        } else if (type == kDescriptorFile) {
+          manifests_.push_back(filenames[i]);
+        } else {
+          if (number + 1 > next_file_number_) {
+            next_file_number_ = number + 1;
+          }
+          if (type == kLogFile) {
+            logs_.push_back(number);
+          } else if (type == kTableFile) {
+            table_numbers_.push_back(number);
+          } else {
+            // Ignore other files
+          }
+        }
+      }
+    }
+    return status;
+  }
+
+  void ConvertLogFilesToTables() {
+    for (int i = 0; i < logs_.size(); i++) {
+      std::string logname = LogFileName(dbname_, logs_[i]);
+      Status status = ConvertLogToTable(logs_[i]);
+      if (!status.ok()) {
+        Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s",
+            (unsigned long long) logs_[i],
+            status.ToString().c_str());
+      }
+      ArchiveFile(logname);
+    }
+  }
+
+  Status ConvertLogToTable(uint64_t log) {
+    struct LogReporter : public log::Reader::Reporter {
+      Env* env;
+      WritableFile* info_log;
+      uint64_t lognum;
+      virtual void Corruption(size_t bytes, const Status& s) {
+        // We print error messages for corruption, but continue repairing.
+        Log(env, info_log, "Log #%llu: dropping %d bytes; %s",
+            (unsigned long long) lognum,
+            static_cast<int>(bytes),
+            s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(dbname_, log);
+    SequentialFile* lfile;
+    Status status = env_->NewSequentialFile(logname, &lfile);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = options_.info_log;
+    reporter.lognum = log;
+    // We intentionally make log::Reader do checksumming so that
+    // corruptions cause entire commits to be skipped instead of
+    // propagating bad information (like overly large sequence
+    // numbers).
+    log::Reader reader(lfile, &reporter, false/*do not checksum*/);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    MemTable mem(icmp_);
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      status = WriteBatchInternal::InsertInto(&batch, &mem);
+      if (status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        Log(env_, options_.info_log, "Log #%llu: ignoring %s",
+            (unsigned long long) log,
+            status.ToString().c_str());
+        status = Status::OK();  // Keep going with rest of file
+      }
+    }
+    delete lfile;
+
+    // We ignore any version edits generated by the conversion to a Table
+    // since ExtractMetaData() will also generate edits.
+    VersionEdit skipped;
+    FileMetaData meta;
+    meta.number = next_file_number_++;
+    Iterator* iter = mem.NewIterator();
+    status = BuildTable(dbname_, env_, options_, table_cache_, iter,
+                        &meta, &skipped);
+    delete iter;
+    if (status.ok()) {
+      if (meta.file_size > 0) {
+        table_numbers_.push_back(meta.number);
+      }
+    }
+    Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+        (unsigned long long) log,
+        counter,
+        (unsigned long long) meta.number,
+        status.ToString().c_str());
+    return status;
+  }
+
+  void ExtractMetaData() {
+    std::vector<TableInfo> kept;
+    for (int i = 0; i < table_numbers_.size(); i++) {
+      TableInfo t;
+      t.meta.number = table_numbers_[i];
+      Status status = ScanTable(&t);
+      if (!status.ok()) {
+        std::string fname = TableFileName(dbname_, table_numbers_[i]);
+        Log(env_, options_.info_log, "Table #%llu: ignoring %s",
+            (unsigned long long) table_numbers_[i],
+            status.ToString().c_str());
+        ArchiveFile(fname);
+      } else {
+        tables_.push_back(t);
+      }
+    }
+  }
+
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(dbname_, t->meta.number);
+    int counter = 0;
+    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (status.ok()) {
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(), t->meta.number, t->meta.file_size);
+      bool empty = true;
+      ParsedInternalKey parsed;
+      t->max_sequence = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          Log(env_, options_.info_log, "Table #%llu: unparsable key %s",
+              (unsigned long long) t->meta.number,
+              EscapeString(key).c_str());
+          continue;
+        }
+
+        counter++;
+        if (empty) {
+          empty = false;
+          t->meta.smallest.DecodeFrom(key);
+        }
+        t->meta.largest.DecodeFrom(key);
+        if (parsed.sequence > t->max_sequence) {
+          t->max_sequence = parsed.sequence;
+        }
+
+        if (ExtractValueType(key) == kTypeLargeValueRef) {
+          if (iter->value().size() != 
LargeValueRef::ByteSize()) { + Log(env_, options_.info_log, "Table #%llu: bad large value ref", + (unsigned long long) t->meta.number); + } else { + edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()), + t->meta.number, + key); + } + } + } + if (!iter->status().ok()) { + status = iter->status(); + } + delete iter; + } + Log(env_, options_.info_log, "Table #%llu: %d entries %s", + (unsigned long long) t->meta.number, + counter, + status.ToString().c_str()); + return status; + } + + Status WriteDescriptor() { + std::string tmp = TempFileName(dbname_, 1); + WritableFile* file; + Status status = env_->NewWritableFile(tmp, &file); + if (!status.ok()) { + return status; + } + + SequenceNumber max_sequence = 0; + for (int i = 0; i < tables_.size(); i++) { + if (max_sequence < tables_[i].max_sequence) { + max_sequence = tables_[i].max_sequence; + } + } + + edit_.SetComparatorName(icmp_.user_comparator()->Name()); + edit_.SetLogNumber(0); + edit_.SetNextFile(next_file_number_); + edit_.SetLastSequence(max_sequence); + + for (int i = 0; i < tables_.size(); i++) { + // TODO(opt): separate out into multiple levels + const TableInfo& t = tables_[i]; + edit_.AddFile(0, t.meta.number, t.meta.file_size, + t.meta.smallest, t.meta.largest); + } + + //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); + { + log::Writer log(file); + std::string record; + edit_.EncodeTo(&record); + status = log.AddRecord(record); + } + if (status.ok()) { + status = file->Close(); + } + delete file; + file = NULL; + + if (!status.ok()) { + env_->DeleteFile(tmp); + } else { + // Discard older manifests + for (int i = 0; i < manifests_.size(); i++) { + ArchiveFile(dbname_ + "/" + manifests_[i]); + } + + // Install new manifest + status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); + if (status.ok()) { + status = SetCurrentFile(env_, dbname_, 1); + } else { + env_->DeleteFile(tmp); + } + } + return status; + } + + void ArchiveFile(const std::string& fname) { + // Move into another directory. E.g., for + // dir/foo + // rename to + // dir/lost/foo + const char* slash = strrchr(fname.c_str(), '/'); + std::string new_dir; + if (slash != NULL) { + new_dir.assign(fname.data(), slash - fname.data()); + } + new_dir.append("/lost"); + env_->CreateDir(new_dir); // Ignore error + std::string new_file = new_dir; + new_file.append("/"); + new_file.append((slash == NULL) ? fname.c_str() : slash + 1); + Status s = env_->RenameFile(fname, new_file); + Log(env_, options_.info_log, "Archiving %s: %s\n", + fname.c_str(), s.ToString().c_str()); + } +}; +} + +Status RepairDB(const std::string& dbname, const Options& options) { + Repairer repairer(dbname, options); + return repairer.Run(); +} + +} diff --git a/db/skiplist.h b/db/skiplist.h new file mode 100644 index 0000000..be39354 --- /dev/null +++ b/db/skiplist.h @@ -0,0 +1,378 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread safety +// ------------- +// +// Writes require external synchronization, most likely a mutex. +// Reads require a guarantee that the SkipList will not be destroyed +// while the read is in progress. Apart from that, reads progress +// without any internal locking or synchronization. +// +// Invariants: +// +// (1) Allocated nodes are never deleted until the SkipList is +// destroyed. 
This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+
+#include <assert.h>
+#include <stdlib.h>
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+class Arena;
+
+template <typename Key, class Comparator>
+class SkipList {
+ private:
+  struct Node;
+
+ public:
+  // Create a new SkipList object that will use "cmp" for comparing keys,
+  // and will allocate memory using "*arena".  Objects allocated in the arena
+  // must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Arena* arena);
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  void Insert(const Key& key);
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const Key& key) const;
+
+  // Iteration over the contents of a skip list
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(const SkipList* list);
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const Key& key() const;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next();
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev();
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Key& target);
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst();
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast();
+
+   private:
+    const SkipList* list_;
+    Node* node_;
+    // Intentionally copyable
+  };
+
+ private:
+  enum { kMaxHeight = 12 };
+
+  // Immutable after construction
+  Comparator const compare_;
+  Arena* const arena_;  // Arena used for allocations of nodes
+
+  Node* const head_;
+
+  // Modified only by Insert().  Read racily by readers, but stale
+  // values are ok.
+  port::AtomicPointer max_height_;  // Height of the entire list
+
+  inline int GetMaxHeight() const {
+    return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load());
+  }
+
+  // Read/written only by Insert().
+  Random rnd_;
+
+  Node* NewNode(const Key& key, int height);
+  int RandomHeight();
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  // Return true if key is greater than the data stored in "n"
+  bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+  // Return the earliest node that comes at or after key.
+  // Return NULL if there is no such node.
+  //
+  // If prev is non-NULL, fills prev[level] with pointer to previous
+  // node at "level" for every level in [0..max_height_-1].
+  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+  // Return the latest node with a key < key.
+  // Return head_ if there is no such node.
+  Node* FindLessThan(const Key& key) const;
+
+  // Return the last node in the list.
+  // Return head_ if list is empty.
+  Node* FindLast() const;
+
+  // No copying allowed
+  SkipList(const SkipList&);
+  void operator=(const SkipList&);
+};
+
+// Implementation details follow
+template <typename Key, class Comparator>
+struct SkipList<Key, Comparator>::Node {
+  explicit Node(const Key& k) : key(k) { }
+
+  Key const key;
+
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+  }
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_[n].Release_Store(x);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+  }
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    next_[n].NoBarrier_Store(x);
+  }
+
+ private:
+  // Array of length equal to the node height.  next_[0] is lowest level link.
+  port::AtomicPointer next_[1];
+};
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::NewNode(const Key& key, int height) {
+  char* mem = arena_->AllocateAligned(
+      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  return new (mem) Node(key);
+}
+
+template <typename Key, class Comparator>
+inline SkipList<Key, Comparator>::Iterator::Iterator(const SkipList* list) {
+  list_ = list;
+  node_ = NULL;
+}
+
+template <typename Key, class Comparator>
+inline bool SkipList<Key, Comparator>::Iterator::Valid() const {
+  return node_ != NULL;
+}
+
+template <typename Key, class Comparator>
+inline const Key& SkipList<Key, Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->key;
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->key);
+  if (node_ == list_->head_) {
+    node_ = NULL;
+  }
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Seek(const Key& target) {
+  node_ = list_->FindGreaterOrEqual(target, NULL);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToFirst() {
+  node_ = list_->head_->Next(0);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = NULL;
+  }
+}
+
+template <typename Key, class Comparator>
+int SkipList<Key, Comparator>::RandomHeight() {
+  // Increase height with probability 1 in kBranching
+  static const unsigned int kBranching = 4;
+  int height = 1;
+  while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight);
+  return height;
+}
+
+template <typename Key, class Comparator>
+bool SkipList<Key, Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // NULL n is considered infinite
+  return (n != NULL) && (compare_(n->key, key) < 0);
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (KeyIsAfterNode(key, next)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      if (prev != NULL) prev[level] = x;
+      if (level == 0) {
+        return next;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindLessThan(const Key& key) const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == NULL || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == NULL) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+SkipList<Key, Comparator>::SkipList(Comparator cmp, Arena* arena)
+    : compare_(cmp),
+      arena_(arena),
+      head_(NewNode(0 /* any key will do */, kMaxHeight)),
+      max_height_(reinterpret_cast<void*>(1)),
+      rnd_(0xdeadbeef) {
+  for (int i = 0; i < kMaxHeight; i++) {
+    head_->SetNext(i, NULL);
+  }
+}
+
+template <typename Key, class Comparator>
+void SkipList<Key, Comparator>::Insert(const Key& key) {
+  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+  // here since Insert() is externally synchronized.
+  Node* prev[kMaxHeight];
+  Node* x = FindGreaterOrEqual(key, prev);
+
+  // Our data structure does not allow duplicate insertion
+  assert(x == NULL || !Equal(key, x->key));
+
+  int height = RandomHeight();
+  if (height > GetMaxHeight()) {
+    for (int i = GetMaxHeight(); i < height; i++) {
+      prev[i] = head_;
+    }
+    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+    // It is ok to mutate max_height_ without any synchronization
+    // with concurrent readers.  A concurrent reader that observes
+    // the new value of max_height_ will see either the old value of
+    // new level pointers from head_ (NULL), or a new value set in
+    // the loop below.  In the former case the reader will
+    // immediately drop to the next level since NULL sorts after all
+    // keys.  In the latter case the reader will use the new node.
+    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+  }
+
+  x = NewNode(key, height);
+  for (int i = 0; i < height; i++) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
+    prev[i]->SetNext(i, x);
+  }
+}
+
+template <typename Key, class Comparator>
+bool SkipList<Key, Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key, NULL);
+  if (x != NULL && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}
diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc
new file mode 100644
index 0000000..5f9ec0d
--- /dev/null
+++ b/db/skiplist_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "leveldb/env.h"
+#include "util/arena.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+typedef uint64_t Key;
+
+struct Comparator {
+  int operator()(const Key& a, const Key& b) const {
+    if (a < b) {
+      return -1;
+    } else if (a > b) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+};
+
+class SkipTest { };
+
+TEST(SkipTest, Empty) {
+  Arena arena;
+  Comparator cmp;
+  SkipList<Key, Comparator> list(cmp, &arena);
+  ASSERT_TRUE(!list.Contains(10));
+
+  SkipList<Key, Comparator>::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  iter.Seek(100);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToLast();
+  ASSERT_TRUE(!iter.Valid());
+}
+
+TEST(SkipTest, InsertAndLookup) {
+  const int N = 2000;
+  const int R = 5000;
+  Random rnd(1000);
+  std::set<Key> keys;
+  Arena arena;
+  Comparator cmp;
+  SkipList<Key, Comparator> list(cmp, &arena);
+  for (int i = 0; i < N; i++) {
+    Key key = rnd.Next() % R;
+    if (keys.insert(key).second) {
+      list.Insert(key);
+    }
+  }
+
+  for (int i = 0; i < R; i++) {
+    if (list.Contains(i)) {
+      ASSERT_EQ(keys.count(i), 1);
+    } else {
+      ASSERT_EQ(keys.count(i), 0);
+    }
+  }
+
+  // Simple iterator tests
+  {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    ASSERT_TRUE(!iter.Valid());
+
+    iter.Seek(0);
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToFirst();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToLast();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), iter.key());
+  }
+
+  // Forward iteration test
+  for (int i = 0; i < R; i++) {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    iter.Seek(i);
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.lower_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.end()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*model_iter, iter.key());
+        ++model_iter;
+        iter.Next();
+      }
+    }
+  }
+
+  // Backward iteration test
+  {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    iter.SeekToLast();
+
+    // Compare against model iterator
+    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
+         model_iter != keys.rend();
+         ++model_iter) {
+      ASSERT_TRUE(iter.Valid());
+      ASSERT_EQ(*model_iter, iter.key());
+      iter.Prev();
+    }
+    ASSERT_TRUE(!iter.Valid());
+  }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructed.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ private:
+  static const uint32_t K = 4;
+
+  static uint64_t key(Key key) { return (key >> 40); }
+  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+  static uint64_t hash(Key key) { return key & 0xff; }
+
+  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+    uint64_t data[2] = { k, g };
+    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+  }
+
+  static Key MakeKey(uint64_t k, uint64_t g) {
+    assert(sizeof(Key) == sizeof(uint64_t));
+    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
+    assert(g <= 0xffffffffu);
+    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+  }
+
+  static bool IsValidKey(Key k) {
+    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+  }
+
+  static Key RandomTarget(Random* rnd) {
+    switch (rnd->Next() % 10) {
+      case 0:
+        // Seek to beginning
+        return MakeKey(0, 0);
+      case 1:
+        // Seek to end
+        return MakeKey(K, 0);
+      default:
+        // Seek to middle
+        return MakeKey(rnd->Next() % K, 0);
+    }
+  }
+
+  // Per-key generation
+  struct State {
+    port::AtomicPointer generation[K];
+    void Set(int k, intptr_t v) {
+      generation[k].Release_Store(reinterpret_cast<void*>(v));
+    }
+    intptr_t Get(int k) {
+      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+    }
+
+    State() {
+      for (int k = 0; k < K; k++) {
+        Set(k, 0);
+      }
+    }
+  };
+
+  // Current state of the test
+  State current_;
+
+  Arena arena_;
+
+  // SkipList is not protected by mu_.  We just use a single writer
+  // thread to modify it.
+  SkipList<Key, Comparator> list_;
+
+ public:
+  ConcurrentTest() : list_(Comparator(), &arena_) { }
+
+  // REQUIRES: External synchronization
+  void WriteStep(Random* rnd) {
+    const uint32_t k = rnd->Next() % K;
+    const intptr_t g = current_.Get(k) + 1;
+    const Key key = MakeKey(k, g);
+    list_.Insert(key);
+    current_.Set(k, g);
+  }
+
+  void ReadStep(Random* rnd) {
+    // Remember the initial committed state of the skiplist.
+    State initial_state;
+    for (int k = 0; k < K; k++) {
+      initial_state.Set(k, current_.Get(k));
+    }
+
+    Key pos = RandomTarget(rnd);
+    SkipList<Key, Comparator>::Iterator iter(&list_);
+    iter.Seek(pos);
+    while (true) {
+      Key current;
+      if (!iter.Valid()) {
+        current = MakeKey(K, 0);
+      } else {
+        current = iter.key();
+        ASSERT_TRUE(IsValidKey(current)) << std::hex << current;
+      }
+      ASSERT_LE(pos, current) << "should not go backwards";
+
+      // Verify that everything in [pos,current) was not present in
+      // initial_state.
+      while (pos < current) {
+        ASSERT_LT(key(pos), K) << std::hex << pos;
+
+        // Note that generation 0 is never inserted, so it is ok if
+        // <*,0,*> is missing.
+        ASSERT_TRUE((gen(pos) == 0) ||
+                    (gen(pos) > initial_state.Get(key(pos)))
+                    ) << "key: " << key(pos)
+                      << "; gen: " << gen(pos)
+                      << "; initgen: "
+                      << initial_state.Get(key(pos));
+
+        // Advance to next key in the valid key space
+        if (key(pos) < key(current)) {
+          pos = MakeKey(key(pos) + 1, 0);
+        } else {
+          pos = MakeKey(key(pos), gen(pos) + 1);
+        }
+      }
+
+      if (!iter.Valid()) {
+        break;
+      }
+
+      if (rnd->Next() % 2) {
+        iter.Next();
+        pos = MakeKey(key(pos), gen(pos) + 1);
+      } else {
+        Key new_target = RandomTarget(rnd);
+        if (new_target > pos) {
+          pos = new_target;
+          iter.Seek(new_target);
+        }
+      }
+    }
+  }
+};
+const uint32_t ConcurrentTest::K;
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST(SkipTest, ConcurrentWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 10000; i++) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+class TestState {
+ public:
+  ConcurrentTest t_;
+  int seed_;
+  port::AtomicPointer quit_flag_;
+
+  enum ReaderState {
+    STARTING,
+    RUNNING,
+    DONE
+  };
+
+  explicit TestState(int s)
+      : seed_(s),
+        quit_flag_(NULL),
+        state_(STARTING),
+        state_cv_(&mu_) {}
+
+  void Wait(ReaderState s) {
+    mu_.Lock();
+    while (state_ != s) {
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+  void Change(ReaderState s) {
+    mu_.Lock();
+    state_ = s;
+    state_cv_.Signal();
+    mu_.Unlock();
+  }
+
+ private:
+  port::Mutex mu_;
+  ReaderState state_;
+  port::CondVar state_cv_;
+};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int i = 0; i < kSize; i++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-NULL arg will do
+    state.Wait(TestState::DONE);
+  }
+}
+
+TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
+TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/db/snapshot.h b/db/snapshot.h
new file mode 100644
index 0000000..9a90756
--- /dev/null
+++ b/db/snapshot.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
+#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
+
+#include "leveldb/db.h"
+
+namespace leveldb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each Snapshot corresponds to a particular sequence number.
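A usage sketch for the two classes below before their definitions; the sequence numbers are illustrative. New() links at the tail of the circular list, so the list stays ordered oldest-to-newest, while Delete() accepts snapshots in any order:

    #include <assert.h>
    #include "db/snapshot.h"

    void SnapshotListExample() {
      leveldb::SnapshotList list;
      const leveldb::Snapshot* s1 = list.New(100);
      const leveldb::Snapshot* s2 = list.New(200);
      assert(list.oldest()->number_ == 100);
      assert(list.newest()->number_ == 200);
      list.Delete(s1);  // snapshots may be released in any order
      assert(list.oldest() == s2 && list.newest() == s2);
      list.Delete(s2);
      assert(list.empty());
    }

The dummy head node (list_) is what makes empty(), insertion, and removal branch-free: prev_/next_ always point at real nodes or at the head itself, so no NULL checks are needed.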
+class Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+
+ private:
+  friend class SnapshotList;
+
+  // Snapshot is kept in a doubly-linked circular list
+  Snapshot* prev_;
+  Snapshot* next_;
+
+  SnapshotList* list_;                 // just for sanity checks
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  Snapshot* oldest() const { assert(!empty()); return list_.next_; }
+  Snapshot* newest() const { assert(!empty()); return list_.prev_; }
+
+  const Snapshot* New(SequenceNumber seq) {
+    Snapshot* s = new Snapshot;
+    s->number_ = seq;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    return s;
+  }
+
+  void Delete(const Snapshot* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    delete s;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  Snapshot list_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_SNAPSHOT_H_
diff --git a/db/table_cache.cc b/db/table_cache.cc
new file mode 100644
index 0000000..325d707
--- /dev/null
+++ b/db/table_cache.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+struct TableAndFile {
+  RandomAccessFile* file;
+  Table* table;
+};
+
+static void DeleteEntry(const Slice& key, void* value) {
+  TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
+  delete tf->table;
+  delete tf->file;
+  delete tf;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+TableCache::TableCache(const std::string& dbname,
+                       const Options* options,
+                       int entries)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      cache_(NewLRUCache(entries)) {
+}
+
+TableCache::~TableCache() {
+  delete cache_;
+}
+
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  uint64_t file_number,
+                                  uint64_t file_size,
+                                  Table** tableptr) {
+  if (tableptr != NULL) {
+    *tableptr = NULL;
+  }
+
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  Slice key(buf, sizeof(buf));
+  Cache::Handle* handle = cache_->Lookup(key);
+  if (handle == NULL) {
+    std::string fname = TableFileName(dbname_, file_number);
+    RandomAccessFile* file = NULL;
+    Table* table = NULL;
+    Status s = env_->NewRandomAccessFile(fname, &file);
+    if (s.ok()) {
+      s = Table::Open(*options_, file, file_size, &table);
+    }
+
+    if (!s.ok()) {
+      assert(table == NULL);
+      delete file;
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+ return NewErrorIterator(s); + } + + TableAndFile* tf = new TableAndFile; + tf->file = file; + tf->table = table; + handle = cache_->Insert(key, tf, 1, &DeleteEntry); + } + + Table* table = reinterpret_cast(cache_->Value(handle))->table; + Iterator* result = table->NewIterator(options); + result->RegisterCleanup(&UnrefEntry, cache_, handle); + if (tableptr != NULL) { + *tableptr = table; + } + return result; +} + +void TableCache::Evict(uint64_t file_number) { + char buf[sizeof(file_number)]; + EncodeFixed64(buf, file_number); + cache_->Erase(Slice(buf, sizeof(buf))); +} + +} diff --git a/db/table_cache.h b/db/table_cache.h new file mode 100644 index 0000000..5376194 --- /dev/null +++ b/db/table_cache.h @@ -0,0 +1,50 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread-safe (provides internal synchronization) + +#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_ +#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_ + +#include +#include +#include "db/dbformat.h" +#include "leveldb/cache.h" +#include "leveldb/table.h" +#include "port/port.h" + +namespace leveldb { + +class Env; + +class TableCache { + public: + TableCache(const std::string& dbname, const Options* options, int entries); + ~TableCache(); + + // Return an iterator for the specified file number (the corresponding + // file length must be exactly "file_size" bytes). If "tableptr" is + // non-NULL, also sets "*tableptr" to point to the Table object + // underlying the returned iterator, or NULL if no Table object underlies + // the returned iterator. The returned "*tableptr" object is owned by + // the cache and should not be deleted, and is valid for as long as the + // returned iterator is live. + Iterator* NewIterator(const ReadOptions& options, + uint64_t file_number, + uint64_t file_size, + Table** tableptr = NULL); + + // Evict any entry for the specified file number + void Evict(uint64_t file_number); + + private: + Env* const env_; + const std::string dbname_; + const Options* options_; + Cache* cache_; +}; + +} + +#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_ diff --git a/db/version_edit.cc b/db/version_edit.cc new file mode 100644 index 0000000..689dbe0 --- /dev/null +++ b/db/version_edit.cc @@ -0,0 +1,301 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit.h" + +#include "db/version_set.h" +#include "util/coding.h" + +namespace leveldb { + +// Tag numbers for serialized VersionEdit. These numbers are written to +// disk and should not be changed. 
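+// Worked example of the encoding defined by EncodeTo() below: an edit
+// that only records SetLogNumber(9) serializes as the two varint bytes
+//   0x02 0x09      (tag kLogNumber, then the value 9)
+// and each new file is tag kNewFile followed by the level, the file
+// number and size, and the two length-prefixed internal keys.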
+enum Tag { + kComparator = 1, + kLogNumber = 2, + kNextFileNumber = 3, + kLastSequence = 4, + kCompactPointer = 5, + kDeletedFile = 6, + kNewFile = 7, + kLargeValueRef = 8, + kPrevLogNumber = 9, +}; + +void VersionEdit::Clear() { + comparator_.clear(); + log_number_ = 0; + prev_log_number_ = 0; + last_sequence_ = 0; + next_file_number_ = 0; + has_comparator_ = false; + has_log_number_ = false; + has_prev_log_number_ = false; + has_next_file_number_ = false; + has_last_sequence_ = false; + deleted_files_.clear(); + new_files_.clear(); + large_refs_added_.clear(); +} + +void VersionEdit::EncodeTo(std::string* dst) const { + if (has_comparator_) { + PutVarint32(dst, kComparator); + PutLengthPrefixedSlice(dst, comparator_); + } + if (has_log_number_) { + PutVarint32(dst, kLogNumber); + PutVarint64(dst, log_number_); + } + if (has_prev_log_number_) { + PutVarint32(dst, kPrevLogNumber); + PutVarint64(dst, prev_log_number_); + } + if (has_next_file_number_) { + PutVarint32(dst, kNextFileNumber); + PutVarint64(dst, next_file_number_); + } + if (has_last_sequence_) { + PutVarint32(dst, kLastSequence); + PutVarint64(dst, last_sequence_); + } + + for (int i = 0; i < compact_pointers_.size(); i++) { + PutVarint32(dst, kCompactPointer); + PutVarint32(dst, compact_pointers_[i].first); // level + PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); + } + + for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); + iter != deleted_files_.end(); + ++iter) { + PutVarint32(dst, kDeletedFile); + PutVarint32(dst, iter->first); // level + PutVarint64(dst, iter->second); // file number + } + + for (int i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + PutVarint32(dst, kNewFile); + PutVarint32(dst, new_files_[i].first); // level + PutVarint64(dst, f.number); + PutVarint64(dst, f.file_size); + PutLengthPrefixedSlice(dst, f.smallest.Encode()); + PutLengthPrefixedSlice(dst, f.largest.Encode()); + } + + for (int i = 0; i < large_refs_added_.size(); i++) { + const VersionEdit::Large& l = large_refs_added_[i]; + PutVarint32(dst, kLargeValueRef); + PutLengthPrefixedSlice(dst, + Slice(l.large_ref.data, LargeValueRef::ByteSize())); + PutVarint64(dst, l.fnum); + PutLengthPrefixedSlice(dst, l.internal_key.Encode()); + } +} + +static bool GetInternalKey(Slice* input, InternalKey* dst) { + Slice str; + if (GetLengthPrefixedSlice(input, &str)) { + dst->DecodeFrom(str); + return true; + } else { + return false; + } +} + +static bool GetLevel(Slice* input, int* level) { + uint32_t v; + if (GetVarint32(input, &v) && + v < config::kNumLevels) { + *level = v; + return true; + } else { + return false; + } +} + +Status VersionEdit::DecodeFrom(const Slice& src) { + Clear(); + Slice input = src; + const char* msg = NULL; + uint32_t tag; + + // Temporary storage for parsing + int level; + uint64_t number; + FileMetaData f; + Slice str; + Large large; + InternalKey key; + + while (msg == NULL && GetVarint32(&input, &tag)) { + switch (tag) { + case kComparator: + if (GetLengthPrefixedSlice(&input, &str)) { + comparator_ = str.ToString(); + has_comparator_ = true; + } else { + msg = "comparator name"; + } + break; + + case kLogNumber: + if (GetVarint64(&input, &log_number_)) { + has_log_number_ = true; + } else { + msg = "log number"; + } + break; + + case kPrevLogNumber: + if (GetVarint64(&input, &prev_log_number_)) { + has_prev_log_number_ = true; + } else { + msg = "previous log number"; + } + break; + + case kNextFileNumber: + if (GetVarint64(&input, &next_file_number_)) { + 
has_next_file_number_ = true; + } else { + msg = "next file number"; + } + break; + + case kLastSequence: + if (GetVarint64(&input, &last_sequence_)) { + has_last_sequence_ = true; + } else { + msg = "last sequence number"; + } + break; + + case kCompactPointer: + if (GetLevel(&input, &level) && + GetInternalKey(&input, &key)) { + compact_pointers_.push_back(std::make_pair(level, key)); + } else { + msg = "compaction pointer"; + } + break; + + case kDeletedFile: + if (GetLevel(&input, &level) && + GetVarint64(&input, &number)) { + deleted_files_.insert(std::make_pair(level, number)); + } else { + msg = "deleted file"; + } + break; + + case kNewFile: + if (GetLevel(&input, &level) && + GetVarint64(&input, &f.number) && + GetVarint64(&input, &f.file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest)) { + new_files_.push_back(std::make_pair(level, f)); + } else { + msg = "new-file entry"; + } + break; + + case kLargeValueRef: + if (GetLengthPrefixedSlice(&input, &str) && + (str.size() == LargeValueRef::ByteSize()) && + GetVarint64(&input, &large.fnum) && + GetInternalKey(&input, &large.internal_key)) { + large.large_ref = LargeValueRef::FromRef(str); + large_refs_added_.push_back(large); + } else { + msg = "large ref"; + } + break; + + default: + msg = "unknown tag"; + break; + } + } + + if (msg == NULL && !input.empty()) { + msg = "invalid tag"; + } + + Status result; + if (msg != NULL) { + result = Status::Corruption("VersionEdit", msg); + } + return result; +} + +std::string VersionEdit::DebugString() const { + std::string r; + r.append("VersionEdit {"); + if (has_comparator_) { + r.append("\n Comparator: "); + r.append(comparator_); + } + if (has_log_number_) { + r.append("\n LogNumber: "); + AppendNumberTo(&r, log_number_); + } + if (has_prev_log_number_) { + r.append("\n PrevLogNumber: "); + AppendNumberTo(&r, prev_log_number_); + } + if (has_next_file_number_) { + r.append("\n NextFile: "); + AppendNumberTo(&r, next_file_number_); + } + if (has_last_sequence_) { + r.append("\n LastSeq: "); + AppendNumberTo(&r, last_sequence_); + } + for (int i = 0; i < compact_pointers_.size(); i++) { + r.append("\n CompactPointer: "); + AppendNumberTo(&r, compact_pointers_[i].first); + r.append(" '"); + AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode()); + r.append("'"); + } + for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); + iter != deleted_files_.end(); + ++iter) { + r.append("\n DeleteFile: "); + AppendNumberTo(&r, iter->first); + r.append(" "); + AppendNumberTo(&r, iter->second); + } + for (int i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + r.append("\n AddFile: "); + AppendNumberTo(&r, new_files_[i].first); + r.append(" "); + AppendNumberTo(&r, f.number); + r.append(" "); + AppendNumberTo(&r, f.file_size); + r.append(" '"); + AppendEscapedStringTo(&r, f.smallest.Encode()); + r.append("' .. 
'"); + AppendEscapedStringTo(&r, f.largest.Encode()); + r.append("'"); + } + for (int i = 0; i < large_refs_added_.size(); i++) { + const VersionEdit::Large& l = large_refs_added_[i]; + r.append("\n LargeRef: "); + AppendNumberTo(&r, l.fnum); + r.append(" "); + r.append(LargeValueRefToFilenameString(l.large_ref)); + r.append(" '"); + AppendEscapedStringTo(&r, l.internal_key.Encode()); + r.append("'"); + } + r.append("\n}\n"); + return r; +} + +} diff --git a/db/version_edit.h b/db/version_edit.h new file mode 100644 index 0000000..7e417b5 --- /dev/null +++ b/db/version_edit.h @@ -0,0 +1,124 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_ +#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_ + +#include +#include +#include +#include "db/dbformat.h" + +namespace leveldb { + +class VersionSet; + +struct FileMetaData { + int refs; + uint64_t number; + uint64_t file_size; // File size in bytes + InternalKey smallest; // Smallest internal key served by table + InternalKey largest; // Largest internal key served by table + + FileMetaData() : refs(0), file_size(0) { } +}; + +class VersionEdit { + public: + VersionEdit() { Clear(); } + ~VersionEdit() { } + + void Clear(); + + void SetComparatorName(const Slice& name) { + has_comparator_ = true; + comparator_ = name.ToString(); + } + void SetLogNumber(uint64_t num) { + has_log_number_ = true; + log_number_ = num; + } + void SetPrevLogNumber(uint64_t num) { + has_prev_log_number_ = true; + prev_log_number_ = num; + } + void SetNextFile(uint64_t num) { + has_next_file_number_ = true; + next_file_number_ = num; + } + void SetLastSequence(SequenceNumber seq) { + has_last_sequence_ = true; + last_sequence_ = seq; + } + void SetCompactPointer(int level, const InternalKey& key) { + compact_pointers_.push_back(std::make_pair(level, key)); + } + + // Add the specified file at the specified number. + // REQUIRES: This version has not been saved (see VersionSet::SaveTo) + // REQUIRES: "smallest" and "largest" are smallest and largest keys in file + void AddFile(int level, uint64_t file, + uint64_t file_size, + const InternalKey& smallest, + const InternalKey& largest) { + FileMetaData f; + f.number = file; + f.file_size = file_size; + f.smallest = smallest; + f.largest = largest; + new_files_.push_back(std::make_pair(level, f)); + } + + // Delete the specified "file" from the specified "level". 
+ void DeleteFile(int level, uint64_t file) { + deleted_files_.insert(std::make_pair(level, file)); + } + + // Record that a large value with the specified large_ref was + // written to the output file numbered "fnum" + void AddLargeValueRef(const LargeValueRef& large_ref, + uint64_t fnum, + const Slice& internal_key) { + large_refs_added_.resize(large_refs_added_.size() + 1); + Large* large = &(large_refs_added_.back()); + large->large_ref = large_ref; + large->fnum = fnum; + large->internal_key.DecodeFrom(internal_key); + } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(const Slice& src); + + std::string DebugString() const; + + private: + friend class VersionSet; + + typedef std::set< std::pair > DeletedFileSet; + + std::string comparator_; + uint64_t log_number_; + uint64_t prev_log_number_; + uint64_t next_file_number_; + SequenceNumber last_sequence_; + bool has_comparator_; + bool has_log_number_; + bool has_prev_log_number_; + bool has_next_file_number_; + bool has_last_sequence_; + + std::vector< std::pair > compact_pointers_; + DeletedFileSet deleted_files_; + std::vector< std::pair > new_files_; + struct Large { + LargeValueRef large_ref; + uint64_t fnum; + InternalKey internal_key; + }; + std::vector large_refs_added_; +}; + +} + +#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_ diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc new file mode 100644 index 0000000..6906ec3 --- /dev/null +++ b/db/version_edit_test.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit.h" +#include "util/testharness.h" + +namespace leveldb { + +static void TestEncodeDecode(const VersionEdit& edit) { + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + parsed.EncodeTo(&encoded2); + ASSERT_EQ(encoded, encoded2); +} + +class VersionEditTest { }; + +TEST(VersionEditTest, EncodeDecode) { + static const uint64_t kBig = 1ull << 50; + + VersionEdit edit; + for (int i = 0; i < 4; i++) { + TestEncodeDecode(edit); + edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, + InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef), + InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); + edit.DeleteFile(4, kBig + 700 + i); + edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression), + kBig + 800 + i, "foobar"); + edit.AddLargeValueRef(LargeValueRef::Make("big2", kSnappyCompression), + kBig + 801 + i, "baz"); + edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); + } + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/version_set.cc b/db/version_set.cc new file mode 100644 index 0000000..31f79bb --- /dev/null +++ b/db/version_set.cc @@ -0,0 +1,1120 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/version_set.h" + +#include +#include +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "leveldb/env.h" +#include "leveldb/table_builder.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace leveldb { + +static const int kTargetFileSize = 2 * 1048576; + +// Maximum bytes of overlaps in grandparent (i.e., level+2) before we +// stop building a single file in a level->level+1 compaction. +static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize; + +static double MaxBytesForLevel(int level) { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. + double result = 10 * 1048576.0; // Result for both level-0 and level-1 + while (level > 1) { + result *= 10; + level--; + } + return result; +} + +static uint64_t MaxFileSizeForLevel(int level) { + return kTargetFileSize; // We could vary per level to reduce number of files? +} + +namespace { +std::string IntSetToString(const std::set& s) { + std::string result = "{"; + for (std::set::const_iterator it = s.begin(); + it != s.end(); + ++it) { + result += (result.size() > 1) ? "," : ""; + result += NumberToString(*it); + } + result += "}"; + return result; +} +} + +Version::~Version() { + assert(refs_ == 0); + for (int level = 0; level < config::kNumLevels; level++) { + for (int i = 0; i < files_[level].size(); i++) { + FileMetaData* f = files_[level][i]; + assert(f->refs >= 0); + f->refs--; + if (f->refs <= 0) { + delete f; + } + } + } + delete cleanup_mem_; +} + +// An internal iterator. For a given version/level pair, yields +// information about the files in the level. For a given entry, key() +// is the largest key that occurs in the file, and value() is an +// 16-byte value containing the file number and file size, both +// encoded using EncodeFixed64. +class Version::LevelFileNumIterator : public Iterator { + public: + LevelFileNumIterator(const Version* version, + const std::vector* flist) + : icmp_(version->vset_->icmp_.user_comparator()), + flist_(flist), + index_(flist->size()) { // Marks as invalid + } + virtual bool Valid() const { + return index_ < flist_->size(); + } + virtual void Seek(const Slice& target) { + uint32_t left = 0; + uint32_t right = flist_->size() - 1; + while (left < right) { + uint32_t mid = (left + right) / 2; + int cmp = icmp_.Compare((*flist_)[mid]->largest.Encode(), target); + if (cmp < 0) { + // Key at "mid.largest" is < than "target". Therefore all + // files at or before "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "mid.largest" is >= "target". Therefore all files + // after "mid" are uninteresting. + right = mid; + } + } + index_ = left; + } + virtual void SeekToFirst() { index_ = 0; } + virtual void SeekToLast() { + index_ = flist_->empty() ? 
0 : flist_->size() - 1; + } + virtual void Next() { + assert(Valid()); + index_++; + } + virtual void Prev() { + assert(Valid()); + if (index_ == 0) { + index_ = flist_->size(); // Marks as invalid + } else { + index_--; + } + } + Slice key() const { + assert(Valid()); + return (*flist_)[index_]->largest.Encode(); + } + Slice value() const { + assert(Valid()); + EncodeFixed64(value_buf_, (*flist_)[index_]->number); + EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size); + return Slice(value_buf_, sizeof(value_buf_)); + } + virtual Status status() const { return Status::OK(); } + private: + const InternalKeyComparator icmp_; + const std::vector* const flist_; + int index_; + + // Backing store for value(). Holds the file number and size. + mutable char value_buf_[16]; +}; + +static Iterator* GetFileIterator(void* arg, + const ReadOptions& options, + const Slice& file_value) { + TableCache* cache = reinterpret_cast(arg); + if (file_value.size() != 16) { + return NewErrorIterator( + Status::Corruption("FileReader invoked with unexpected value")); + } else { + return cache->NewIterator(options, + DecodeFixed64(file_value.data()), + DecodeFixed64(file_value.data() + 8)); + } +} + +Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, + int level) const { + return NewTwoLevelIterator( + new LevelFileNumIterator(this, &files_[level]), + &GetFileIterator, vset_->table_cache_, options); +} + +void Version::AddIterators(const ReadOptions& options, + std::vector* iters) { + // Merge all level zero files together since they may overlap + for (int i = 0; i < files_[0].size(); i++) { + iters->push_back( + vset_->table_cache_->NewIterator( + options, files_[0][i]->number, files_[0][i]->file_size)); + } + + // For levels > 0, we can use a concatenating iterator that sequentially + // walks through the non-overlapping files in the level, opening them + // lazily. + for (int level = 1; level < config::kNumLevels; level++) { + if (!files_[level].empty()) { + iters->push_back(NewConcatenatingIterator(options, level)); + } + } +} + +void Version::Ref() { + ++refs_; +} + +void Version::Unref() { + assert(refs_ >= 1); + --refs_; + if (refs_ == 0) { + vset_->MaybeDeleteOldVersions(); + // TODO: try to delete obsolete files + } +} + +std::string Version::DebugString() const { + std::string r; + for (int level = 0; level < config::kNumLevels; level++) { + // E.g., level 1: 17:123['a' .. 'd'] 20:43['e' .. 'g'] + r.append("level "); + AppendNumberTo(&r, level); + r.push_back(':'); + const std::vector& files = files_[level]; + for (int i = 0; i < files.size(); i++) { + r.push_back(' '); + AppendNumberTo(&r, files[i]->number); + r.push_back(':'); + AppendNumberTo(&r, files[i]->file_size); + r.append("['"); + AppendEscapedStringTo(&r, files[i]->smallest.Encode()); + r.append("' .. '"); + AppendEscapedStringTo(&r, files[i]->largest.Encode()); + r.append("']"); + } + r.push_back('\n'); + } + return r; +} + +// A helper class so we can efficiently apply a whole sequence +// of edits to a particular state without creating intermediate +// Versions that contain full copies of the intermediate state. 
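+// Usage sketch (this mirrors the actual call sites in LogAndApply() and
+// Recover() below):
+//
+//   Version* v = new Version(this);
+//   {
+//     Builder builder(this, current_);
+//     builder.Apply(edit);   // may be called once per recovered edit
+//     builder.SaveTo(v);
+//   }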
+class VersionSet::Builder { + private: + typedef std::map FileMap; + VersionSet* vset_; + FileMap files_[config::kNumLevels]; + + public: + // Initialize a builder with the files from *base and other info from *vset + Builder(VersionSet* vset, Version* base) + : vset_(vset) { + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = base->files_[level]; + for (int i = 0; i < files.size(); i++) { + FileMetaData* f = files[i]; + f->refs++; + files_[level].insert(std::make_pair(f->number, f)); + } + } + } + + ~Builder() { + for (int level = 0; level < config::kNumLevels; level++) { + const FileMap& fmap = files_[level]; + for (FileMap::const_iterator iter = fmap.begin(); + iter != fmap.end(); + ++iter) { + FileMetaData* f = iter->second; + f->refs--; + if (f->refs <= 0) { + delete f; + } + } + } + } + + // Apply all of the edits in *edit to the current state. + void Apply(VersionEdit* edit) { + // Update compaction pointers + for (int i = 0; i < edit->compact_pointers_.size(); i++) { + const int level = edit->compact_pointers_[i].first; + vset_->compact_pointer_[level] = + edit->compact_pointers_[i].second.Encode().ToString(); + } + + // Delete files + const VersionEdit::DeletedFileSet& del = edit->deleted_files_; + for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); + iter != del.end(); + ++iter) { + const int level = iter->first; + const uint64_t number = iter->second; + FileMap::iterator fiter = files_[level].find(number); + assert(fiter != files_[level].end()); // Sanity check for debug mode + if (fiter != files_[level].end()) { + FileMetaData* f = fiter->second; + f->refs--; + if (f->refs <= 0) { + delete f; + } + files_[level].erase(fiter); + } + } + + // Add new files + for (int i = 0; i < edit->new_files_.size(); i++) { + const int level = edit->new_files_[i].first; + FileMetaData* f = new FileMetaData(edit->new_files_[i].second); + f->refs = 1; + assert(files_[level].count(f->number) == 0); + files_[level].insert(std::make_pair(f->number, f)); + } + + // Add large value refs + for (int i = 0; i < edit->large_refs_added_.size(); i++) { + const VersionEdit::Large& l = edit->large_refs_added_[i]; + vset_->RegisterLargeValueRef(l.large_ref, l.fnum, l.internal_key); + } + } + + // Save the current state in *v. 
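+  // (Reference counts balance as follows: the constructor takes one ref
+  // per file for the builder's own maps, SaveTo() takes one per file it
+  // appends to *v, and ~Builder drops the builder's refs, deleting any
+  // FileMetaData that no Version ends up referencing.)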
+ void SaveTo(Version* v) { + for (int level = 0; level < config::kNumLevels; level++) { + const FileMap& fmap = files_[level]; + for (FileMap::const_iterator iter = fmap.begin(); + iter != fmap.end(); + ++iter) { + FileMetaData* f = iter->second; + f->refs++; + v->files_[level].push_back(f); + } + } + } +}; + +VersionSet::VersionSet(const std::string& dbname, + const Options* options, + TableCache* table_cache, + const InternalKeyComparator* cmp) + : env_(options->env), + dbname_(dbname), + options_(options), + table_cache_(table_cache), + icmp_(*cmp), + next_file_number_(2), + manifest_file_number_(0), // Filled by Recover() + last_sequence_(0), + log_number_(0), + prev_log_number_(0), + descriptor_file_(NULL), + descriptor_log_(NULL), + current_(new Version(this)), + oldest_(current_) { +} + +VersionSet::~VersionSet() { + for (Version* v = oldest_; v != NULL; ) { + Version* next = v->next_; + assert(v->refs_ == 0); + delete v; + v = next; + } + delete descriptor_log_; + delete descriptor_file_; +} + +Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { + if (edit->has_log_number_) { + assert(edit->log_number_ >= log_number_); + assert(edit->log_number_ < next_file_number_); + } else { + edit->SetLogNumber(log_number_); + } + + if (!edit->has_prev_log_number_) { + edit->SetPrevLogNumber(prev_log_number_); + } + + edit->SetNextFile(next_file_number_); + edit->SetLastSequence(last_sequence_); + + Version* v = new Version(this); + { + Builder builder(this, current_); + builder.Apply(edit); + builder.SaveTo(v); + } + + std::string new_manifest_file; + Status s = Finalize(v); + + // Initialize new descriptor log file if necessary by creating + // a temporary file that contains a snapshot of the current version. + if (s.ok()) { + if (descriptor_log_ == NULL) { + assert(descriptor_file_ == NULL); + new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); + edit->SetNextFile(next_file_number_); + s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); + if (s.ok()) { + descriptor_log_ = new log::Writer(descriptor_file_); + s = WriteSnapshot(descriptor_log_); + } + } + } + + // Write new record to MANIFEST log + if (s.ok()) { + std::string record; + edit->EncodeTo(&record); + s = descriptor_log_->AddRecord(record); + if (s.ok()) { + s = descriptor_file_->Sync(); + } + } + + // If we just created a new descriptor file, install it by writing a + // new CURRENT file that points to it. 
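+  // (Recover() below reads CURRENT back as a single line naming the
+  // manifest file and insists on a trailing newline, so SetCurrentFile
+  // presumably writes "<manifest name>\n"; the exact name format comes
+  // from DescriptorFileName() in db/filename.cc.)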
+ if (s.ok() && !new_manifest_file.empty()) { + s = SetCurrentFile(env_, dbname_, manifest_file_number_); + } + + // Install the new version + if (s.ok()) { + assert(current_->next_ == NULL); + assert(current_->cleanup_mem_ == NULL); + current_->cleanup_mem_ = cleanup_mem; + v->next_ = NULL; + current_->next_ = v; + current_ = v; + log_number_ = edit->log_number_; + prev_log_number_ = edit->prev_log_number_; + } else { + delete v; + if (!new_manifest_file.empty()) { + delete descriptor_log_; + delete descriptor_file_; + descriptor_log_ = NULL; + descriptor_file_ = NULL; + env_->DeleteFile(new_manifest_file); + } + } + + return s; +} + +Status VersionSet::Recover() { + struct LogReporter : public log::Reader::Reporter { + Status* status; + virtual void Corruption(size_t bytes, const Status& s) { + if (this->status->ok()) *this->status = s; + } + }; + + // Read "CURRENT" file, which contains a pointer to the current manifest file + std::string current; + Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); + if (!s.ok()) { + return s; + } + if (current.empty() || current[current.size()-1] != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + current.resize(current.size() - 1); + + std::string dscname = dbname_ + "/" + current; + SequentialFile* file; + s = env_->NewSequentialFile(dscname, &file); + if (!s.ok()) { + return s; + } + + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t prev_log_number = 0; + Builder builder(this, current_); + + { + LogReporter reporter; + reporter.status = &s; + log::Reader reader(file, &reporter, true/*checksum*/); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (s.ok()) { + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument( + edit.comparator_ + "does not match existing comparator ", + icmp_.user_comparator()->Name()); + } + } + + if (s.ok()) { + builder.Apply(&edit); + } + + if (edit.has_log_number_) { + log_number = edit.log_number_; + have_log_number = true; + } + + if (edit.has_prev_log_number_) { + prev_log_number = edit.prev_log_number_; + have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + next_file = edit.next_file_number_; + have_next_file = true; + } + + if (edit.has_last_sequence_) { + last_sequence = edit.last_sequence_; + have_last_sequence = true; + } + } + } + delete file; + file = NULL; + + if (s.ok()) { + if (!have_next_file) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + } else if (!have_log_number) { + s = Status::Corruption("no meta-lognumber entry in descriptor"); + } else if (!have_last_sequence) { + s = Status::Corruption("no last-sequence-number entry in descriptor"); + } + + if (!have_prev_log_number) { + prev_log_number = 0; + } + } + + if (s.ok()) { + Version* v = new Version(this); + builder.SaveTo(v); + s = Finalize(v); + if (!s.ok()) { + delete v; + } else { + // Install recovered version + v->next_ = NULL; + current_->next_ = v; + current_ = v; + manifest_file_number_ = next_file; + next_file_number_ = next_file + 1; + last_sequence_ = last_sequence; + log_number_ = log_number; + prev_log_number_ = prev_log_number; + } + } + + return s; +} + +static int64_t TotalFileSize(const std::vector& files) { + 
int64_t sum = 0; + for (int i = 0; i < files.size(); i++) { + sum += files[i]->file_size; + } + return sum; +} + +Status VersionSet::Finalize(Version* v) { + // Precomputed best level for next compaction + int best_level = -1; + double best_score = -1; + + Status s; + for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) { + s = SortLevel(v, level); + + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + score = v->files_[level].size() / 4.0; + } else { + // Compute the ratio of current size to size limit. + const uint64_t level_bytes = TotalFileSize(v->files_[level]); + score = static_cast(level_bytes) / MaxBytesForLevel(level); + } + + if (score > best_score) { + best_level = level; + best_score = score; + } + } + + v->compaction_level_ = best_level; + v->compaction_score_ = best_score; + return s; +} + +Status VersionSet::WriteSnapshot(log::Writer* log) { + // TODO: Break up into multiple records to reduce memory usage on recovery? + + // Save metadata + VersionEdit edit; + edit.SetComparatorName(icmp_.user_comparator()->Name()); + + // Save compaction pointers + for (int level = 0; level < config::kNumLevels; level++) { + if (!compact_pointer_[level].empty()) { + InternalKey key; + key.DecodeFrom(compact_pointer_[level]); + edit.SetCompactPointer(level, key); + } + } + + // Save files + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = current_->files_[level]; + for (int i = 0; i < files.size(); i++) { + const FileMetaData* f = files[i]; + edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest); + } + } + + // Save large value refs + for (LargeValueMap::const_iterator it = large_value_refs_.begin(); + it != large_value_refs_.end(); + ++it) { + const LargeValueRef& ref = it->first; + const LargeReferencesSet& pointers = it->second; + for (LargeReferencesSet::const_iterator j = pointers.begin(); + j != pointers.end(); + ++j) { + edit.AddLargeValueRef(ref, j->first, j->second); + } + } + + std::string record; + edit.EncodeTo(&record); + return log->AddRecord(record); +} + +// Helper to sort by tables_[file_number].smallest +struct VersionSet::BySmallestKey { + const InternalKeyComparator* internal_comparator; + + bool operator()(FileMetaData* f1, FileMetaData* f2) const { + return internal_comparator->Compare(f1->smallest, f2->smallest) < 0; + } +}; + +Status VersionSet::SortLevel(Version* v, uint64_t level) { + Status result; + BySmallestKey cmp; + cmp.internal_comparator = &icmp_; + std::sort(v->files_[level].begin(), v->files_[level].end(), cmp); + + if (result.ok() && level > 0) { + // There should be no overlap + for (int i = 1; i < v->files_[level].size(); i++) { + const InternalKey& prev_end = v->files_[level][i-1]->largest; + const InternalKey& this_begin = v->files_[level][i]->smallest; + if (icmp_.Compare(prev_end, this_begin) >= 0) { + result = Status::Corruption( + "overlapping ranges in same level", + (EscapeString(prev_end.Encode()) + " vs. 
" + + EscapeString(this_begin.Encode()))); + break; + } + } + } + return result; +} + +int VersionSet::NumLevelFiles(int level) const { + assert(level >= 0); + assert(level < config::kNumLevels); + return current_->files_[level].size(); +} + +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { + uint64_t result = 0; + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = v->files_[level]; + for (int i = 0; i < files.size(); i++) { + if (icmp_.Compare(files[i]->largest, ikey) <= 0) { + // Entire file is before "ikey", so just add the file size + result += files[i]->file_size; + } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { + // Entire file is after "ikey", so ignore + if (level > 0) { + // Files other than level 0 are sorted by meta->smallest, so + // no further files in this level will contain data for + // "ikey". + break; + } + } else { + // "ikey" falls in the range for this table. Add the + // approximate offset of "ikey" within the table. + Table* tableptr; + Iterator* iter = table_cache_->NewIterator( + ReadOptions(), files[i]->number, files[i]->file_size, &tableptr); + if (tableptr != NULL) { + result += tableptr->ApproximateOffsetOf(ikey.Encode()); + } + delete iter; + } + } + } + + // Add in large value files which are references from internal keys + // stored in the table files + // + // TODO(opt): this is O(# large values in db). If this becomes too slow, + // we could store an auxiliary data structure indexed by internal key + for (LargeValueMap::const_iterator it = large_value_refs_.begin(); + it != large_value_refs_.end(); + ++it) { + const LargeValueRef& lref = it->first; + for (LargeReferencesSet::const_iterator it2 = it->second.begin(); + it2 != it->second.end(); + ++it2) { + if (icmp_.Compare(it2->second, ikey.Encode()) <= 0) { + // Internal key for large value is before our key of interest + result += lref.ValueSize(); + } + } + } + + + return result; +} + +bool VersionSet::RegisterLargeValueRef(const LargeValueRef& large_ref, + uint64_t fnum, + const InternalKey& internal_key) { + LargeReferencesSet* refs = &large_value_refs_[large_ref]; + bool is_first = refs->empty(); + refs->insert(make_pair(fnum, internal_key.Encode().ToString())); + return is_first; +} + +void VersionSet::CleanupLargeValueRefs(const std::set& live_tables) { + for (LargeValueMap::iterator it = large_value_refs_.begin(); + it != large_value_refs_.end(); + ) { + LargeReferencesSet* refs = &it->second; + for (LargeReferencesSet::iterator ref_it = refs->begin(); + ref_it != refs->end(); + ) { + if (ref_it->first != log_number_ && // Not in log file + ref_it->first != prev_log_number_ && // Not in prev log + live_tables.count(ref_it->first) == 0) { // Not in a live table + // No longer live: erase + LargeReferencesSet::iterator to_erase = ref_it; + ++ref_it; + refs->erase(to_erase); + } else { + // Still live: leave this reference alone + ++ref_it; + } + } + if (refs->empty()) { + // No longer any live references to this large value: remove from + // large_value_refs + Log(env_, options_->info_log, "large value is dead: '%s'", + LargeValueRefToFilenameString(it->first).c_str()); + LargeValueMap::iterator to_erase = it; + ++it; + large_value_refs_.erase(to_erase); + } else { + ++it; + } + } +} + +bool VersionSet::LargeValueIsLive(const LargeValueRef& large_ref) { + LargeValueMap::iterator it = large_value_refs_.find(large_ref); + if (it == large_value_refs_.end()) { + return false; + } else { + assert(!it->second.empty()); + return 
true; + } +} + +void VersionSet::MaybeDeleteOldVersions() { + // Note: it is important to delete versions in order since a newer + // version with zero refs may be holding a pointer to a memtable + // that is used by somebody who has a ref on an older version. + while (oldest_ != current_ && oldest_->refs_ == 0) { + Version* next = oldest_->next_; + delete oldest_; + oldest_ = next; + } +} + +void VersionSet::AddLiveFiles(std::set* live) { + for (Version* v = oldest_; v != NULL; v = v->next_) { + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = v->files_[level]; + for (int i = 0; i < files.size(); i++) { + live->insert(files[i]->number); + } + } + } +} + +int64_t VersionSet::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < config::kNumLevels); + return TotalFileSize(current_->files_[level]); +} + +int64_t VersionSet::MaxNextLevelOverlappingBytes() { + int64_t result = 0; + std::vector overlaps; + for (int level = 0; level < config::kNumLevels - 1; level++) { + for (int i = 0; i < current_->files_[level].size(); i++) { + const FileMetaData* f = current_->files_[level][i]; + GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); + const int64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +void VersionSet::GetOverlappingInputs( + int level, + const InternalKey& begin, + const InternalKey& end, + std::vector* inputs) { + inputs->clear(); + Slice user_begin = begin.user_key(); + Slice user_end = end.user_key(); + const Comparator* user_cmp = icmp_.user_comparator(); + for (int i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 || + user_cmp->Compare(f->smallest.user_key(), user_end) > 0) { + // Either completely before or after range; skip it + } else { + inputs->push_back(f); + } + } +} + +// Stores the minimal range that covers all entries in inputs in +// *smallest, *largest. +// REQUIRES: inputs is not empty +void VersionSet::GetRange(const std::vector& inputs, + InternalKey* smallest, + InternalKey* largest) { + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + for (int i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_.Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_.Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } +} + +// Stores the minimal range that covers all entries in inputs1 and inputs2 +// in *smallest, *largest. +// REQUIRES: inputs is not empty +void VersionSet::GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest) { + std::vector all = inputs1; + all.insert(all.end(), inputs2.begin(), inputs2.end()); + GetRange(all, smallest, largest); +} + +Iterator* VersionSet::MakeInputIterator(Compaction* c) { + ReadOptions options; + options.verify_checksums = options_->paranoid_checks; + options.fill_cache = false; + + // Level-0 files have to be merged together. For other levels, + // we will make a concatenating iterator per level. + // TODO(opt): use concatenating iterator for level-0 if there is no overlap + const int space = (c->level() == 0 ? 
c->inputs_[0].size() + 1 : 2); + Iterator** list = new Iterator*[space]; + int num = 0; + for (int which = 0; which < 2; which++) { + if (!c->inputs_[which].empty()) { + if (c->level() + which == 0) { + const std::vector& files = c->inputs_[which]; + for (int i = 0; i < files.size(); i++) { + list[num++] = table_cache_->NewIterator( + options, files[i]->number, files[i]->file_size); + } + } else { + // Create concatenating iterator for the files from this level + list[num++] = NewTwoLevelIterator( + new Version::LevelFileNumIterator( + c->input_version_, &c->inputs_[which]), + &GetFileIterator, table_cache_, options); + } + } + } + assert(num <= space); + Iterator* result = NewMergingIterator(&icmp_, list, num); + delete[] list; + return result; +} + +Compaction* VersionSet::PickCompaction() { + if (!NeedsCompaction()) { + return NULL; + } + const int level = current_->compaction_level_; + assert(level >= 0); + assert(level+1 < config::kNumLevels); + + Compaction* c = new Compaction(level); + c->input_version_ = current_; + c->input_version_->Ref(); + + // Pick the first file that comes after compact_pointer_[level] + for (int i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (compact_pointer_[level].empty() || + icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { + c->inputs_[0].push_back(f); + break; + } + } + if (c->inputs_[0].empty()) { + // Wrap-around to the beginning of the key space + c->inputs_[0].push_back(current_->files_[level][0]); + } + + // Files in level 0 may overlap each other, so pick up all overlapping ones + if (level == 0) { + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); + assert(!c->inputs_[0].empty()); + } + + SetupOtherInputs(c); + + return c; +} + +void VersionSet::SetupOtherInputs(Compaction* c) { + const int level = c->level(); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + + GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); + + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + + // See if we can grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. 
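+  // Worked example (keys invented): if inputs_[0] covers ["d".."f"] and
+  // the overlapping level+1 files span ["a".."k"], then any other level
+  // file lying inside ["a".."k"] can be compacted now for free -- but
+  // only if re-running GetOverlappingInputs() on the widened range pulls
+  // in no additional level+1 files, which is what the size comparison
+  // below checks.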
+ if (!c->inputs_[1].empty()) { + std::vector expanded0; + GetOverlappingInputs(level, all_start, all_limit, &expanded0); + if (expanded0.size() > c->inputs_[0].size()) { + InternalKey new_start, new_limit; + GetRange(expanded0, &new_start, &new_limit); + std::vector expanded1; + GetOverlappingInputs(level+1, new_start, new_limit, &expanded1); + if (expanded1.size() == c->inputs_[1].size()) { + Log(env_, options_->info_log, + "Expanding@%d %d+%d to %d+%d\n", + level, + int(c->inputs_[0].size()), + int(c->inputs_[1].size()), + int(expanded0.size()), + int(expanded1.size())); + smallest = new_start; + largest = new_limit; + c->inputs_[0] = expanded0; + c->inputs_[1] = expanded1; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + } + } + } + + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < config::kNumLevels) { + GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); + } + + if (false) { + Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", + level, + EscapeString(smallest.Encode()).c_str(), + EscapeString(largest.Encode()).c_str()); + } + + // Update the place where we will do the next compaction for this level. + // We update this immediately instead of waiting for the VersionEdit + // to be applied so that if the compaction fails, we will try a different + // key range next time. + compact_pointer_[level] = largest.Encode().ToString(); + c->edit_.SetCompactPointer(level, largest); +} + +Compaction* VersionSet::CompactRange( + int level, + const InternalKey& begin, + const InternalKey& end) { + std::vector inputs; + GetOverlappingInputs(level, begin, end, &inputs); + if (inputs.empty()) { + return NULL; + } + + Compaction* c = new Compaction(level); + c->input_version_ = current_; + c->input_version_->Ref(); + c->inputs_[0] = inputs; + SetupOtherInputs(c); + return c; +} + +Compaction::Compaction(int level) + : level_(level), + max_output_file_size_(MaxFileSizeForLevel(level)), + input_version_(NULL), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0) { + for (int i = 0; i < config::kNumLevels; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + if (input_version_ != NULL) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + return (num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (int i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + // Maybe use binary search to find right entry instead of linear search? 
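+  // A sketch of that alternative (editorial, untested; "FileBefore" is a
+  // hypothetical comparator returning
+  // user_cmp->Compare(f->largest.user_key(), key) < 0):
+  //
+  //   std::vector<FileMetaData*>::const_iterator it = std::lower_bound(
+  //       files.begin(), files.end(), user_key, FileBefore);
+  //   // *it is the first file whose largest key is >= user_key, so only
+  //   // that file's [smallest,largest] range needs to be checked.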
+ const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const InternalKey& key) { + // Scan to find earliest grandparent file that contains key. + const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != NULL) { + input_version_->Unref(); + input_version_ = NULL; + } +} + +} diff --git a/db/version_set.h b/db/version_set.h new file mode 100644 index 0000000..e1c5a4b --- /dev/null +++ b/db/version_set.h @@ -0,0 +1,332 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The representation of a DBImpl consists of a set of Versions. The +// newest version is called "current". Older versions may be kept +// around to provide a consistent view to live iterators. +// +// Each Version keeps track of a set of Table files per level. The +// entire set of versions is maintained in a VersionSet. +// +// Version,VersionSet are thread-compatible, but require external +// synchronization on all accesses. + +#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ +#define STORAGE_LEVELDB_DB_VERSION_SET_H_ + +#include +#include +#include +#include "db/dbformat.h" +#include "db/version_edit.h" +#include "port/port.h" + +namespace leveldb { + +namespace log { class Writer; } + +class Compaction; +class Iterator; +class MemTable; +class TableBuilder; +class TableCache; +class Version; +class VersionSet; +class WritableFile; + +class Version { + public: + // Append to *iters a sequence of iterators that will + // yield the contents of this Version when merged together. + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + void AddIterators(const ReadOptions&, std::vector* iters); + + // Reference count management (so Versions do not disappear out from + // under live iterators) + void Ref(); + void Unref(); + + // Return a human readable string that describes this version's contents. 
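+  // (Format, per the implementation: one line per level, e.g.
+  //    level 1: 17:123['a' .. 'd'] 20:43['e' .. 'g']
+  // listing number:size and the key range of each file.)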
+ std::string DebugString() const; + + private: + friend class Compaction; + friend class VersionSet; + + class LevelFileNumIterator; + Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; + + VersionSet* vset_; // VersionSet to which this Version belongs + Version* next_; // Next version in linked list + int refs_; // Number of live refs to this version + MemTable* cleanup_mem_; // NULL, or table to delete when version dropped + + // List of files per level + std::vector files_[config::kNumLevels]; + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by Finalize(). + double compaction_score_; + int compaction_level_; + + explicit Version(VersionSet* vset) + : vset_(vset), next_(NULL), refs_(0), + cleanup_mem_(NULL), + compaction_score_(-1), + compaction_level_(-1) { + } + + ~Version(); + + // No copying allowed + Version(const Version&); + void operator=(const Version&); +}; + +class VersionSet { + public: + VersionSet(const std::string& dbname, + const Options* options, + TableCache* table_cache, + const InternalKeyComparator*); + ~VersionSet(); + + // Apply *edit to the current version to form a new descriptor that + // is both saved to persistent state and installed as the new + // current version. Iff Apply() returns OK, arrange to delete + // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed + // by older versions. + Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem); + + // Recover the last saved descriptor from persistent storage. + Status Recover(); + + // Save current contents to *log + Status WriteSnapshot(log::Writer* log); + + // Return the current version. + Version* current() const { return current_; } + + // Return the current manifest file number + uint64_t ManifestFileNumber() const { return manifest_file_number_; } + + // Allocate and return a new file number + uint64_t NewFileNumber() { return next_file_number_++; } + + // Return the number of Table files at the specified level. + int NumLevelFiles(int level) const; + + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return the last sequence number. + uint64_t LastSequence() const { return last_sequence_; } + + // Set the last sequence number to s. + void SetLastSequence(uint64_t s) { + assert(s >= last_sequence_); + last_sequence_ = s; + } + + // Return the current log file number. + uint64_t LogNumber() const { return log_number_; } + + // Return the log file number for the log file that is currently + // being compacted, or zero if there is no such log file. + uint64_t PrevLogNumber() const { return prev_log_number_; } + + // Pick level and inputs for a new compaction. + // Returns NULL if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + Compaction* PickCompaction(); + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns NULL if there is nothing in that + // level that overlaps the specified range. Caller should delete + // the result. + Compaction* CompactRange( + int level, + const InternalKey& begin, + const InternalKey& end); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. 
+ int64_t MaxNextLevelOverlappingBytes(); + + // Create an iterator that reads over the compaction inputs for "*c". + // The caller should delete the iterator when no longer needed. + Iterator* MakeInputIterator(Compaction* c); + + // Returns true iff some level needs a compaction. + bool NeedsCompaction() const { return current_->compaction_score_ >= 1; } + + // Add all files listed in any live version to *live. + // May also mutate some internal state. + void AddLiveFiles(std::set* live); + + // Return the approximate offset in the database of the data for + // "key" as of version "v". + uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); + + // Register a reference to a large value with the specified + // large_ref from the specified file number. Returns "true" if this + // is the first recorded reference to the "large_ref" value in the + // database, and false otherwise. + bool RegisterLargeValueRef(const LargeValueRef& large_ref, + uint64_t filenum, + const InternalKey& internal_key); + + // Cleanup the large value reference state by eliminating any + // references from files that are not includes in either "live_tables" + // or the current log. + void CleanupLargeValueRefs(const std::set& live_tables); + + // Returns true if a large value with the given reference is live. + bool LargeValueIsLive(const LargeValueRef& large_ref); + + private: + class Builder; + + friend class Compaction; + friend class Version; + + Status Finalize(Version* v); + + // Delete any old versions that are no longer needed. + void MaybeDeleteOldVersions(); + + struct BySmallestKey; + Status SortLevel(Version* v, uint64_t level); + + void GetOverlappingInputs( + int level, + const InternalKey& begin, + const InternalKey& end, + std::vector* inputs); + + void GetRange(const std::vector& inputs, + InternalKey* smallest, + InternalKey* largest); + + void GetRange2(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, + InternalKey* largest); + + void SetupOtherInputs(Compaction* c); + + Env* const env_; + const std::string dbname_; + const Options* const options_; + TableCache* const table_cache_; + const InternalKeyComparator icmp_; + uint64_t next_file_number_; + uint64_t manifest_file_number_; + uint64_t last_sequence_; + uint64_t log_number_; + uint64_t prev_log_number_; // 0 or backing store for memtable being compacted + + // Opened lazily + WritableFile* descriptor_file_; + log::Writer* descriptor_log_; + + // Versions are kept in a singly linked list that is never empty + Version* current_; // Pointer to the last (newest) list entry + Version* oldest_; // Pointer to the first (oldest) list entry + + // Map from large value reference to the set of + // values containing references to the value. We keep the + // internal key as a std::string rather than as an InternalKey because + // we want to be able to easily use a set. + typedef std::set > LargeReferencesSet; + typedef std::map LargeValueMap; + LargeValueMap large_value_refs_; + + // Per-level key at which the next compaction at that level should start. + // Either an empty string, or a valid InternalKey. + std::string compact_pointer_[config::kNumLevels]; + + // No copying allowed + VersionSet(const VersionSet&); + void operator=(const VersionSet&); +}; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // and "level+1" will be merged to produce a set of "level+1" files. 
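+  // (For a level-0 compaction, inputs_[0] may hold several mutually
+  // overlapping level-0 files -- see PickCompaction() -- but the outputs
+  // still go to level 1.)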
+  int level() const { return level_; }
+
+  // Return the object that holds the edits to the descriptor done
+  // by this compaction.
+  VersionEdit* edit() { return &edit_; }
+
+  // "which" must be either 0 or 1
+  int num_input_files(int which) const { return inputs_[which].size(); }
+
+  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
+  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
+
+  // Maximum size of files to build during this compaction.
+  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+
+  // Is this a trivial compaction that can be implemented by just
+  // moving a single input file to the next level (no merging or splitting)?
+  bool IsTrivialMove() const;
+
+  // Add all inputs to this compaction as delete operations to *edit.
+  void AddInputDeletions(VersionEdit* edit);
+
+  // Returns true if the information we have available guarantees that
+  // the compaction is producing data in "level+1" for which no data exists
+  // in levels greater than "level+1".
+  bool IsBaseLevelForKey(const Slice& user_key);
+
+  // Returns true iff we should stop building the current output
+  // before processing "key".
+  bool ShouldStopBefore(const InternalKey& key);
+
+  // Release the input version for the compaction, once the compaction
+  // is successful.
+  void ReleaseInputs();
+
+ private:
+  friend class Version;
+  friend class VersionSet;
+
+  explicit Compaction(int level);
+
+  int level_;
+  uint64_t max_output_file_size_;
+  Version* input_version_;
+  VersionEdit edit_;
+
+  // Each compaction reads inputs from "level_" and "level_+1"
+  std::vector<FileMetaData*> inputs_[2];  // The two sets of inputs
+
+  // State used to check for number of overlapping grandparent files
+  // (parent == level_ + 1, grandparent == level_ + 2)
+  std::vector<FileMetaData*> grandparents_;
+  int grandparent_index_;     // Index into grandparents_
+  bool seen_key_;             // Some output key has been seen
+  int64_t overlapped_bytes_;  // Bytes of overlap between current output
+                              // and grandparent files
+
+  // State for implementing IsBaseLevelForKey
+
+  // level_ptrs_ holds indices into input_version_->files_: our state
+  // is that we are positioned at one of the file ranges for each
+  // higher level than the ones involved in this compaction (i.e. for
+  // all L >= level_ + 2).
+  int level_ptrs_[config::kNumLevels];
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_VERSION_SET_H_
diff --git a/db/write_batch.cc b/db/write_batch.cc
new file mode 100644
index 0000000..e84e548
--- /dev/null
+++ b/db/write_batch.cc
@@ -0,0 +1,164 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+//    sequence: fixed64
+//    count: fixed32
+//    data: record[count]
+// record :=
+//    kTypeValue varstring varstring         |
+//    kTypeLargeValueRef varstring varstring |
+//    kTypeDeletion varstring
+// varstring :=
+//    len: varint32
+//    data: uint8[len]
+
+#include "leveldb/write_batch.h"
+
+#include "leveldb/db.h"
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+WriteBatch::WriteBatch() {
+  Clear();
+}
+
+WriteBatch::~WriteBatch() { }
+
+void WriteBatch::Clear() {
+  rep_.clear();
+  rep_.resize(12);
+}
+
+int WriteBatchInternal::Count(const WriteBatch* b) {
+  return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+  EncodeFixed32(&b->rep_[8], n);
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+  return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+  EncodeFixed64(&b->rep_[0], seq);
+}
+
+void WriteBatch::Put(const Slice& key, const Slice& value) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeValue));
+  PutLengthPrefixedSlice(&rep_, key);
+  PutLengthPrefixedSlice(&rep_, value);
+}
+
+void WriteBatchInternal::PutLargeValueRef(WriteBatch* b,
+                                          const Slice& key,
+                                          const LargeValueRef& large_ref) {
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  b->rep_.push_back(static_cast<char>(kTypeLargeValueRef));
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_,
+                         Slice(large_ref.data, sizeof(large_ref.data)));
+}
+
+void WriteBatch::Delete(const Slice& key) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeDeletion));
+  PutLengthPrefixedSlice(&rep_, key);
+}
+
+Status WriteBatchInternal::InsertInto(const WriteBatch* b,
+                                      MemTable* memtable) {
+  const int count = WriteBatchInternal::Count(b);
+  int found = 0;
+  Iterator it(*b);
+  for (; !it.Done(); it.Next()) {
+    switch (it.op()) {
+      case kTypeDeletion:
+        memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice());
+        break;
+      case kTypeValue:
+        memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value());
+        break;
+      case kTypeLargeValueRef:
+        memtable->Add(it.sequence_number(), kTypeLargeValueRef,
+                      it.key(), it.value());
+        break;
+    }
+    found++;
+  }
+  if (!it.status().ok()) {
+    return it.status();
+  } else if (found != count) {
+    return Status::Corruption("wrong count in WriteBatch");
+  }
+  return Status::OK();
+}
+
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+  assert(contents.size() >= 12);
+  b->rep_.assign(contents.data(), contents.size());
+}
+
+WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch)
+    : input_(WriteBatchInternal::Contents(&batch)),
+      done_(false) {
+  if (input_.size() < 12) {
+    done_ = true;
+  } else {
+    seq_ = WriteBatchInternal::Sequence(&batch);
+    input_.remove_prefix(12);
+    GetNextEntry();
+  }
+}
+
+void WriteBatchInternal::Iterator::Next() {
+  assert(!done_);
+  seq_++;
+  GetNextEntry();
+}
+
+void WriteBatchInternal::Iterator::GetNextEntry() {
+  if (input_.empty()) {
+    done_ = true;
+    return;
+  }
+  char tag = input_[0];
+  input_.remove_prefix(1);
+  switch (tag) {
+    case kTypeValue:
+    case kTypeLargeValueRef:
+      if (GetLengthPrefixedSlice(&input_, &key_) &&
+          GetLengthPrefixedSlice(&input_, &value_)) {
+        op_ = static_cast<ValueType>(tag);
+      } else {
+        status_ = Status::Corruption("bad WriteBatch Put");
+        done_ = true;
+        input_.clear();
+      }
+      break;
+    case kTypeDeletion:
+      if (GetLengthPrefixedSlice(&input_, &key_)) {
+        op_ = kTypeDeletion;
+      } else {
+        status_ = Status::Corruption("bad WriteBatch Delete");
+        done_ = true;
+        input_.clear();
+      }
+      break;
+    default:
+      status_ = Status::Corruption("unknown WriteBatch tag");
+      done_ = true;
+      input_.clear();
+      break;
+  }
+}
+
+}
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
new file mode 100644
index 0000000..ea28e2d
--- /dev/null
+++ b/db/write_batch_internal.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+
+#include "leveldb/write_batch.h"
+
+namespace leveldb {
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+  static void PutLargeValueRef(WriteBatch* batch,
+                               const Slice& key,
+                               const LargeValueRef& large_ref);
+
+  // Return the number of entries in the batch.
+  static int Count(const WriteBatch* batch);
+
+  // Set the count for the number of entries in the batch.
+  static void SetCount(WriteBatch* batch, int n);
+
+  // Return the sequence number for the start of this batch.
+  static SequenceNumber Sequence(const WriteBatch* batch);
+
+  // Store the specified number as the sequence number for the start of
+  // this batch.
+  static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+  static Slice Contents(const WriteBatch* batch) {
+    return Slice(batch->rep_);
+  }
+
+  static size_t ByteSize(const WriteBatch* batch) {
+    return batch->rep_.size();
+  }
+
+  static void SetContents(WriteBatch* batch, const Slice& contents);
+
+  static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
+
+  // Iterate over the contents of a write batch.
+  class Iterator {
+   public:
+    explicit Iterator(const WriteBatch& batch);
+    bool Done() const { return done_; }
+    void Next();
+    ValueType op() const { return op_; }
+    const Slice& key() const { return key_; }
+    const Slice& value() const { return value_; }
+    SequenceNumber sequence_number() const { return seq_; }
+    Status status() const { return status_; }
+
+   private:
+    void GetNextEntry();
+
+    Slice input_;
+    bool done_;
+    ValueType op_;
+    Slice key_;
+    Slice value_;
+    SequenceNumber seq_;
+    Status status_;
+  };
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
new file mode 100644
index 0000000..deb8411
--- /dev/null
+++ b/db/write_batch_test.cc
@@ -0,0 +1,110 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "leveldb/db.h" + +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "leveldb/env.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace leveldb { + +static std::string PrintContents(WriteBatch* b) { + InternalKeyComparator cmp(BytewiseComparator()); + MemTable mem(cmp); + std::string state; + Status s = WriteBatchInternal::InsertInto(b, &mem); + Iterator* iter = mem.NewIterator(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + switch (ikey.type) { + case kTypeValue: + state.append("Put("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + break; + case kTypeLargeValueRef: + state.append("PutRef("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + break; + case kTypeDeletion: + state.append("Delete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + break; + } + state.append("@"); + state.append(NumberToString(ikey.sequence)); + } + delete iter; + if (!s.ok()) { + state.append("ParseError()"); + } + return state; +} + +class WriteBatchTest { }; + +TEST(WriteBatchTest, Empty) { + WriteBatch batch; + ASSERT_EQ("", PrintContents(&batch)); + ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); +} + +TEST(WriteBatchTest, Multiple) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + batch.Put(Slice("baz"), Slice("boo")); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("Put(baz, boo)@102" + "Delete(box)@101" + "Put(foo, bar)@100", + PrintContents(&batch)); +} + +TEST(WriteBatchTest, PutIndirect) { + WriteBatch batch; + batch.Put(Slice("baz"), Slice("boo")); + LargeValueRef h; + for (int i = 0; i < LargeValueRef::ByteSize(); i++) { + h.data[i] = (i < 20) ? 
'a' : 'b'; + } + WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), h); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(2, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("Put(baz, boo)@100" + "PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101", + PrintContents(&batch)); +} + +TEST(WriteBatchTest, Corruption) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + WriteBatchInternal::SetSequence(&batch, 200); + Slice contents = WriteBatchInternal::Contents(&batch); + WriteBatchInternal::SetContents(&batch, + Slice(contents.data(),contents.size()-1)); + ASSERT_EQ("Put(foo, bar)@200" + "ParseError()", + PrintContents(&batch)); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/doc/doc.css b/doc/doc.css new file mode 100644 index 0000000..700c564 --- /dev/null +++ b/doc/doc.css @@ -0,0 +1,89 @@ +body { + margin-left: 0.5in; + margin-right: 0.5in; + background: white; + color: black; +} + +h1 { + margin-left: -0.2in; + font-size: 14pt; +} +h2 { + margin-left: -0in; + font-size: 12pt; +} +h3 { + margin-left: -0in; +} +h4 { + margin-left: -0in; +} +hr { + margin-left: -0in; +} + +/* Definition lists: definition term bold */ +dt { + font-weight: bold; +} + +address { + text-align: center; +} +code,samp,var { + color: blue; +} +kbd { + color: #600000; +} +div.note p { + float: right; + width: 3in; + margin-right: 0%; + padding: 1px; + border: 2px solid #6060a0; + background-color: #fffff0; +} + +ul { + margin-top: -0em; + margin-bottom: -0em; +} + +ol { + margin-top: -0em; + margin-bottom: -0em; +} + +UL.nobullets { + list-style-type: none; + list-style-image: none; + margin-left: -1em; +} + +p { + margin: 1em 0 1em 0; + padding: 0 0 0 0; +} + +pre { + line-height: 1.3em; + padding: 0.4em 0 0.8em 0; + margin: 0 0 0 0; + border: 0 0 0 0; + color: blue; +} + +.datatable { + margin-left: auto; + margin-right: auto; + margin-top: 2em; + margin-bottom: 2em; + border: 1px solid; +} + +.datatable td,th { + padding: 0 0.5em 0 0.5em; + text-align: right; +} diff --git a/doc/impl.html b/doc/impl.html new file mode 100644 index 0000000..b190d2c --- /dev/null +++ b/doc/impl.html @@ -0,0 +1,228 @@ + + + + +Leveldb file layout and compactions + + + + +

Files

+ +The implementation of leveldb is similar in spirit to the +representation of a single + +Bigtable tablet (section 5.3). +However, the organization of the files that make up the representation +is somewhat different and is explained below. +

+Each database is represented by a set of files stored in a directory. +There are several different types of files as documented below: +

+

Log files

+

+A log file (*.log) stores a sequence of recent updates. Each update +is appended to the current log file. When the log file reaches a +pre-determined size (approximately 1MB by default), it is converted +to a sorted table (see below) and a new log file is created for future +updates. +

+A copy of the current log file is kept in an in-memory structure (the +memtable). This copy is consulted on every read so that read +operations reflect all logged updates. +

+

Sorted tables

+

+A sorted table (*.sst) stores a sequence of entries sorted by key. +Each entry is either a value for the key, or a deletion marker for the +key. (Deletion markers are kept around to hide obsolete values +present in older sorted tables). +

+The set of sorted tables is organized into a sequence of levels.  The +sorted table generated from a log file is placed in a special young +level (also called level-0).  When the number of young files exceeds a +certain threshold (currently four), all of the young files are merged +together with all of the overlapping level-1 files to produce a +sequence of new level-1 files (we create a new level-1 file for every +2MB of data). +

+Files in the young level may contain overlapping keys.  However, files +in other levels have distinct non-overlapping key ranges.  Consider +level number L where L >= 1.  When the combined size of files in +level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2, +...), one file in level-L and all of the overlapping files in +level-(L+1) are merged to form a set of new files for level-(L+1). +These merges have the effect of gradually migrating new updates from +the young level to the largest level using only bulk reads and writes +(i.e., minimizing expensive seeks). +
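+
+A minimal sketch of this size rule (the helper name is illustrative,
+not part of the implementation's interface):
+
+  // Byte limit for level L (L >= 1): 10^L MB.
+  static double MaxBytesForLevel(int level) {
+    double result = 10.0 * 1048576.0;  // level-1 limit: 10MB
+    while (level > 1) {
+      result *= 10;
+      level--;
+    }
+    return result;
+  }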

Large value files

+

+Each large value (greater than 64KB by default) is placed in a large +value file (*.val) of its own. An entry is maintained in the log +and/or sorted tables that maps from the corresponding key to the +name of this large value file. The name of the large value file +is derived from a SHA1 hash of the value and its length so that +identical values share the same file. +
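+
+For illustration only (the real encoding lives in db/filename.cc), a
+large value file name could be derived roughly as follows, assuming a
+hex-encoded SHA1 digest plus the value length:
+
+  // Sketch: "<hex(sha1(value))>-<value size>.val".  Requires <stdio.h>.
+  std::string LargeValueFileName(const std::string& sha1_hex,
+                                 uint64_t value_size) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "-%llu.val",
+             static_cast<unsigned long long>(value_size));
+    return sha1_hex + buf;
+  }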

+

Manifest

+

+A MANIFEST file lists the set of sorted tables that make up each +level, the corresponding key ranges, and other important metadata. +A new MANIFEST file (with a new number embedded in the file name) +is created whenever the database is reopened. The MANIFEST file is +formatted as a log, and changes made to the serving state (as files +are added or removed) are appended to this log. +

+

Current

+

+CURRENT is a simple text file that contains the name of the latest +MANIFEST file. +

+

Info logs

+

+Informational messages are printed to files named LOG and LOG.old. +

+

Others

+

+Other files used for miscellaneous purposes may also be present +(LOCK, *.dbtmp). + +

Level 0

+When the log file grows above a certain size (1MB by default): +
    +
  • Write the contents of the current memtable to an sstable +
  • Replace the current memtable by a brand new empty memtable +
  • Switch to a new log file +
  • Delete the old log file and the old memtable +
+Experimental measurements show that generating an sstable from a 1MB +log file takes ~12ms, which seems like an acceptable latency hiccup to +add infrequently to a log write. + +
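+
+In pseudo-C++ the switch above is approximately the following (every
+name here is hypothetical; the real logic lives in db/db_impl.cc):
+
+  WriteLevel0Table(mem_, &edit);     // memtable -> new sstable
+  mem_ = new MemTable(comparator_);  // brand new empty memtable
+  log_ = NewLogFile();               // switch to a new log file
+  DeleteObsoleteFiles();             // drop the old log file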

+The new sstable is added to the special young level (level-0). +level-0 contains a set of files (up to 4 by default).  However, unlike +other levels, these files do not cover disjoint ranges, but may +overlap each other. +

Compactions

+ +

+When the size of level L exceeds its limit, we compact it in a +background thread. The compaction picks a file from level L and all +overlapping files from the next level L+1. Note that if a level-L +file overlaps only part of a level-(L+1) file, the entire file at +level-(L+1) is used as an input to the compaction and will be +discarded after the compaction. Aside: because level-0 is special +(files in it may overlap each other), we treat compactions from +level-0 to level-1 specially: a level-0 compaction may pick more than +one level-0 file in case some of these files overlap each other. + +

+A compaction merges the contents of the picked files to produce a +sequence of level-(L+1) files.  We switch to producing a new +level-(L+1) file after the current output file has reached the target +file size (2MB).  We also switch to a new output file when the key +range of the current output file has grown enough to overlap more than +ten level-(L+2) files.  This last rule ensures that a later compaction +of a level-(L+1) file will not pick up too much data from level-(L+2). +
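+
+The two switch conditions can be sketched as follows (illustrative
+only; compare Compaction::ShouldStopBefore in db/version_set.h):
+
+  // Start a new level-(L+1) output file if either condition holds.
+  bool ShouldSwitchOutput(uint64_t current_output_bytes,
+                          int overlapping_grandparent_files) {
+    return current_output_bytes >= 2 * 1048576 ||  // 2MB target size
+           overlapping_grandparent_files > 10;     // level-(L+2) rule
+  }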

+The old files are discarded and the new files are added to the serving +state. + +

+Compactions for a particular level rotate through the key space. In +more detail, for each level L, we remember the ending key of the last +compaction at level L. The next compaction for level L will pick the +first file that starts after this key (wrapping around to the +beginning of the key space if there is no such file). + +
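+
+A sketch of this rotation (compact_pointer below plays the role of the
+remembered ending key; the loop is illustrative, not the actual code):
+
+  // Pick the first file whose largest key is past the remembered key,
+  // wrapping to the first file when none qualifies.
+  FileMetaData* PickRotatingFile(const std::vector<FileMetaData*>& files,
+                                 const std::string& compact_pointer) {
+    for (size_t i = 0; i < files.size(); i++) {
+      if (compact_pointer.empty() ||
+          files[i]->largest.Encode().ToString() > compact_pointer) {
+        return files[i];
+      }
+    }
+    return files.empty() ? NULL : files[0];  // wrap around
+  }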

+Compactions drop overwritten values. They also drop deletion markers +if there are no higher numbered levels that contain a file whose range +overlaps the current key. + +

Timing

+ +Level-0 compactions will read up to four 1MB files from level-0, and +at worst all the level-1 files (10MB). I.e., we will read 14MB and +write 14MB. + +

+Other than the special level-0 compactions, we will pick one 2MB file +from level L. In the worst case, this will overlap ~ 12 files from +level L+1 (10 because level-(L+1) is ten times the size of level-L, +and another two at the boundaries since the file ranges at level-L +will usually not be aligned with the file ranges at level-L+1). The +compaction will therefore read 26MB and write 26MB. Assuming a disk +IO rate of 100MB/s (ballpark range for modern drives), the worst +compaction cost will be approximately 0.5 second. + +

+If we throttle the background writing to something small, say 10% of +the full 100MB/s speed, a compaction may take up to 5 seconds.  If the +user is writing at 10MB/s, we might build up lots of level-0 files +(~50 to hold the 5*10MB).  This may significantly increase the cost of +reads due to the overhead of merging more files together on every +read. +

+Solution 1: To reduce this problem, we might want to increase the log +switching threshold when the number of level-0 files is large.  The +downside is that the larger this threshold, the larger the delay +we add to write latency when a write triggers a log switch. +

+Solution 2: We might want to decrease write rate artificially when the +number of level-0 files goes up. + +

+Solution 3: We could work on reducing the cost of very wide merges. +Perhaps most of the level-0 files will have their blocks sitting +uncompressed in the cache and we will only need to worry about the +O(N) complexity in the merging iterator. +

Number of files

+ +Instead of always making 2MB files, we could make larger files for +larger levels to reduce the total file count, though at the expense of +more bursty compactions. Alternatively, we could shard the set of +files into multiple directories. + +

+An experiment on an ext3 filesystem on Feb 04, 2011 shows
+the following timings to do 100K file opens in directories with
+varying number of files:
+
+  Files in directory | Microseconds to open a file
+  -------------------+----------------------------
+                1000 |  9
+               10000 | 10
+              100000 | 16
+So maybe even the sharding is not necessary on modern filesystems? + +

Recovery

+ +
    +
  • Read CURRENT to find name of the latest committed MANIFEST +
  • Read the named MANIFEST file +
  • Clean up stale files +
  • We could open all sstables here, but it is probably better to be lazy... +
  • Convert log chunk to a new level-0 sstable +
  • Start directing new writes to a new log file with recovered sequence# +
+ +

Garbage collection of files

+ +DeleteObsoleteFiles() is called at the end of every +compaction and at the end of recovery. It finds the names of all +files in the database. It deletes all log files that are not the +current log file. It deletes all table files that are not referenced +from some level and are not the output of an active compaction. It +deletes all large value files that are not referenced from any live +table or log file. + + + diff --git a/doc/index.html b/doc/index.html new file mode 100644 index 0000000..2a83fc3 --- /dev/null +++ b/doc/index.html @@ -0,0 +1,509 @@ + + + + +Leveldb + + + +

Leveldb

+
Jeff Dean, Sanjay Ghemawat
+

+The leveldb library provides a persistent key value store. Keys and +values are arbitrary byte arrays. The keys are ordered within the key +value store according to a user-specified comparator function. + +

+

Opening A Database

+

+A leveldb database has a name which corresponds to a file system +directory.  All of the contents of the database are stored in this +directory.  The following example shows how to open a database, +creating it if necessary: +

+

+  #include <cassert>
+  #include "leveldb/db.h"
+
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+  assert(status.ok());
+  ...
+
+If you want to raise an error if the database already exists, add +the following line before the leveldb::DB::Open call: +
+  options.error_if_exists = true;
+
+

Status

+

+You may have noticed the leveldb::Status type above. Values of this +type are returned by most functions in leveldb that may encounter an +error. You can check if such a result is ok, and also print an +associated error message: +

+

+   leveldb::Status s = ...;
+   if (!s.ok()) cerr << s.ToString() << endl;
+
+

Closing A Database

+

+When you are done with a database, just delete the database object. +Example: +

+

+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+
+

Reads And Writes

+

+The database provides Put, Delete, and Get methods to +modify/query the database. For example, the following code +moves the value stored under key1 to key2. +

+  std::string value;
+  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+  if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
+  if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
+
+ +

Atomic Updates

+

+Note that if the process dies after the Put of key2 but before the +delete of key1, the same value may be left stored under multiple keys. +Such problems can be avoided by using the WriteBatch class to +atomically apply a set of updates: +

+

+  #include "leveldb/write_batch.h"
+  ...
+  std::string value;
+  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+  if (s.ok()) {
+    leveldb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db->Write(leveldb::WriteOptions(), &batch);
+  }
+
+The WriteBatch holds a sequence of edits to be made to the database, +and these edits within the batch are applied in order. Note that we +called Delete before Put so that if key1 is identical to key2, +we do not end up erroneously dropping the value entirely. +

+Apart from its atomicity benefits, WriteBatch may also be used to +speed up bulk updates by placing lots of individual mutations into the +same batch. + +

Synchronous Writes

+By default, each write to leveldb is asynchronous: it +returns after pushing the write from the process into the operating +system. The transfer from operating system memory to the underlying +persistent storage happens asynchronously. The sync flag +can be turned on for a particular write to make the write operation +not return until the data being written has been pushed all the way to +persistent storage. (On Posix systems, this is implemented by calling +either fsync(...) or fdatasync(...) or +msync(..., MS_SYNC) before the write operation returns.) +
+  leveldb::WriteOptions write_options;
+  write_options.sync = true;
+  db->Put(write_options, ...);
+
+Asynchronous writes are often more than a thousand times as fast as +synchronous writes. The downside of asynchronous writes is that a +crash of the machine may cause the last few updates to be lost. Note +that a crash of just the writing process (i.e., not a reboot) will not +cause any loss since even when sync is false, an update +is pushed from the process memory into the operating system before it +is considered done. + +

+Asynchronous writes can often be used safely. For example, when +loading a large amount of data into the database you can handle lost +updates by restarting the bulk load after a crash. A hybrid scheme is +also possible where every Nth write is synchronous, and in the event +of a crash, the bulk load is restarted just after the last synchronous +write finished by the previous run. (The synchronous write can update +a marker that describes where to restart on a crash.) + +
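+
+A sketch of this hybrid scheme (the batch size and the progress key
+are made up for illustration; only Put and WriteOptions::sync are
+taken from the interface above):
+
+  leveldb::WriteOptions async_options;  // sync is false by default
+  leveldb::WriteOptions sync_options;
+  sync_options.sync = true;
+  for (size_t i = 0; i < n; i++) {
+    const bool checkpoint = ((i + 1) % 1000 == 0);
+    db->Put(checkpoint ? sync_options : async_options, keys[i], values[i]);
+    if (checkpoint) {
+      // Durable marker recording where to restart after a crash.
+      db->Put(sync_options, "bulk-load-progress", keys[i]);
+    }
+  }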

+WriteBatch provides an alternative to asynchronous writes. +Multiple updates may be placed in the same WriteBatch and +applied together using a synchronous write (i.e., +write_options.sync is set to true). The extra cost of +the synchronous write will be amortized across all of the writes in +the batch. + +

+

Concurrency

+

+A database may only be opened by one process at a time. The leveldb +implementation acquires a lock from the operating system to prevent +misuse. Within a single process, the same leveldb::DB object may +be safely used by multiple concurrent threads. +

+

Iteration

+

+The following example demonstrates how to print all key,value pairs +in a database. +

+

+  leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+  delete it;
+
+The following variation shows how to process just the keys in the +range [start,limit): +

+

+  for (it->Seek(start);
+       it->Valid() && it->key().ToString() < limit;
+       it->Next()) {
+    ...
+  }
+
+You can also process entries in reverse order. (Caveat: reverse +iteration may be somewhat slower than forward iteration.) +

+

+  for (it->SeekToLast(); it->Valid(); it->Prev()) {
+    ...
+  }
+
+

Snapshots

+

+Snapshots provide consistent read-only views over the entire state of +the key-value store. ReadOptions::snapshot may be non-NULL to indicate +that a read should operate on a particular version of the DB state. +If ReadOptions::snapshot is NULL, the read will operate on an +implicit snapshot of the current state. +

+Snapshots typically are created by the DB::GetSnapshot() method: +

+

+  leveldb::ReadOptions options;
+  options.snapshot = db->GetSnapshot();
+  ... apply some updates to db ...
+  leveldb::Iterator* iter = db->NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db->ReleaseSnapshot(options.snapshot);
+
+Note that when a snapshot is no longer needed, it should be released +using the DB::ReleaseSnapshot interface. This allows the +implementation to get rid of state that was being maintained just to +support reading as of that snapshot. +

+A Write operation can also return a snapshot that +represents the state of the database just after applying a particular +set of updates: +

+

+  const leveldb::Snapshot* snapshot;
+  leveldb::WriteOptions write_options;
+  write_options.post_write_snapshot = &snapshot;
+  leveldb::Status status = db->Write(write_options, ...);
+  ... perform other mutations to db ...
+
+  leveldb::ReadOptions read_options;
+  read_options.snapshot = snapshot;
+  leveldb::Iterator* iter = db->NewIterator(read_options);
+  ... read as of the state just after the Write call returned ...
+  delete iter;
+
+  db->ReleaseSnapshot(snapshot);
+
+

Slice

+

+The return values of the it->key() and it->value() calls above +are instances of the leveldb::Slice type.  Slice is a simple +structure that contains a length and a pointer to an external byte +array.  Returning a Slice is a cheaper alternative to returning a +std::string since we do not need to copy potentially large keys and +values.  In addition, leveldb methods do not return null-terminated +C-style strings since leveldb keys and values are allowed to +contain '\0' bytes. +

+C++ strings and null-terminated C-style strings can be easily converted +to a Slice: +

+

+   leveldb::Slice s1 = "hello";
+
+   std::string str("world");
+   leveldb::Slice s2 = str;
+
+A Slice can be easily converted back to a C++ string: +
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+
+Be careful when using Slices since it is up to the caller to ensure that +the external byte array into which the Slice points remains live while +the Slice is in use. For example, the following is buggy: +

+

+   leveldb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+
+When control leaves the if block, str will be destroyed and the +backing storage for slice will disappear. +
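+
+One safe variant keeps the backing string alive for as long as the
+slice is used:
+
+   std::string str;
+   leveldb::Slice slice;
+   if (...) {
+     str = ...;        // str now outlives slice
+     slice = str;
+   }
+   Use(slice);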

+

Comparators

+

+The preceding examples used the default ordering function for keys, +which orders bytes lexicographically.  You can, however, supply a custom +comparator when opening a database.  For example, suppose each +database key consists of two numbers and we wish to sort by the first +number, breaking ties by the second number.  First, define a proper +subclass of leveldb::Comparator that expresses these rules: +

+

+  class TwoPartComparator : public leveldb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a < b: negative result
+    //   if a > b: positive result
+    //   else: zero result
+    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &a1, &a2);
+      ParseKey(b, &b1, &b2);
+      if (a1 < b1) return -1;
+      if (a1 > b1) return +1;
+      if (a2 < b2) return -1;
+      if (a2 > b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() const { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+
+Now create a database using this custom comparator: +

+

+  TwoPartComparator cmp;
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &cmp;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+  ...
+
+

Backwards compatibility

+

+The result of the comparator's Name method is attached to the +database when it is created, and is checked on every subsequent +database open. If the name changes, the leveldb::DB::Open call will +fail. Therefore, change the name if and only if the new key format +and comparison function are incompatible with existing databases, and +it is ok to discard the contents of all existing databases. +

+You can, however, still gradually evolve your key format over time with +a little bit of pre-planning.  For example, you could store a version +number at the end of each key (one byte should suffice for most uses). +When you wish to switch to a new key format (e.g., adding an optional +third part to the keys processed by TwoPartComparator), +(a) keep the same comparator name, (b) increment the version number +for new keys, and (c) change the comparator function so it uses the +version numbers found in the keys to decide how to interpret them. +
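+
+A sketch of steps (b) and (c), assuming the version number occupies
+the last byte of every key (ParseVersionedKey is a hypothetical
+helper, not part of leveldb):
+
+  int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
+    int a1, a2, a3, b1, b2, b3;
+    // Hypothetical helper: reads the version byte at the end of the
+    // key; for version-1 keys the missing third part is set to 0.
+    ParseVersionedKey(a, &a1, &a2, &a3);
+    ParseVersionedKey(b, &b1, &b2, &b3);
+    if (a1 != b1) return (a1 < b1) ? -1 : +1;
+    if (a2 != b2) return (a2 < b2) ? -1 : +1;
+    if (a3 != b3) return (a3 < b3) ? -1 : +1;
+    return 0;
+  }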

+

Performance

+

+Performance can be tuned by changing the default values of the +types defined in include/leveldb/options.h. +

+

Block size

+

+leveldb groups adjacent keys together into the same block and such a +block is the unit of transfer to and from persistent storage. The +default block size is approximately 4096 uncompressed bytes. +Applications that mostly do bulk scans over the contents of the +database may wish to increase this size. Applications that do a lot +of point reads of small values may wish to switch to a smaller block +size if performance measurements indicate an improvement. There isn't +much benefit in using blocks smaller than one kilobyte, or larger than +a few megabytes. Also note that compression will be more effective +with larger block sizes. +
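+
+For example, assuming the block_size field in
+include/leveldb/options.h:
+
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.block_size = 64 * 1024;  // favor bulk scans over point reads
+  leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);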

+

Compression

+

+Each block is individually compressed before being written to +persistent storage. Compression is on by default since the default +compression method is very fast, and is automatically disabled for +uncompressible data. In rare cases, applications may want to disable +compression entirely, but should only do so if benchmarks show a +performance improvement: +

+

+  leveldb::Options options;
+  options.compression = leveldb::kNoCompression;
+  ... leveldb::DB::Open(options, name, ...) ....
+
+

Cache

+

+The contents of the database are stored in a set of files in the +filesystem and each file stores a sequence of compressed blocks. If +options.cache is non-NULL, it is used to cache frequently used +uncompressed block contents. +

+

+  #include "leveldb/cache.h"
+
+  leveldb::Options options;
+  options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
+  leveldb::DB* db;
+  leveldb::DB::Open(options, name, &db);
+  ... use the db ...
+  delete db;
+  delete options.cache;
+
+Note that the cache holds uncompressed data, and therefore it should +be sized according to application level data sizes, without any +reduction from compression. (Caching of compressed blocks is left to +the operating system buffer cache, or any custom Env +implementation provided by the client.) +

+When performing a bulk read, the application may wish to disable +caching so that the data processed by the bulk read does not end up +displacing most of the cached contents. A per-iterator option can be +used to achieve this: +

+

+  leveldb::ReadOptions options;
+  options.fill_cache = false;
+  leveldb::Iterator* it = db->NewIterator(options);
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ...
+  }
+
+

Key Layout

+

+Note that the unit of disk transfer and caching is a block. Adjacent +keys (according to the database sort order) will usually be placed in +the same block. Therefore the application can improve its performance +by placing keys that are accessed together near each other and placing +infrequently used keys in a separate region of the key space. +

+For example, suppose we are implementing a simple file system on top +of leveldb. The types of entries we might wish to store are: +

+

+   filename -> permission-bits, length, list of file_block_ids
+   file_block_id -> data
+
+We might want to prefix filename keys with one letter (say '/') and the +file_block_id keys with a different letter (say '0') so that scans +over just the metadata do not force us to fetch and cache bulky file +contents. +
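+
+Illustrative helpers for this prefixing scheme (the names are made up;
+file_block_id is rendered as a fixed-width decimal so that keys sort
+numerically; requires <stdio.h>):
+
+   std::string MetadataKey(const std::string& filename) {
+     return "/" + filename;
+   }
+   std::string BlockKey(uint64_t file_block_id) {
+     char buf[32];
+     snprintf(buf, sizeof(buf), "0%020llu",
+              static_cast<unsigned long long>(file_block_id));
+     return std::string(buf);
+   }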

+

Large Values

+

+leveldb has special treatment of large values (by default, a value +of length greater than or equal to 64K is considered large, though a +field in Options can be used to adjust this threshold). Each such +large value is placed in a separate operating system file, and the +normal database blocks just contain pointers to such files. +

+Furthermore, if the same large value occurs multiple times in a single +database, it will be stored just once. +

+

Checksums

+

+leveldb associates checksums with all data it stores in the file system. +There are two separate controls provided over how aggressively these +checksums are verified: +

+

    +
  • ReadOptions::verify_checksums may be set to true to force + checksum verification of all data that is read from the file system on + behalf of a particular read. By default, no such verification is + done. +

    +

  • Options::paranoid_checks may be set to true before opening a + database to make the database implementation raise an error as soon as + it detects an internal corruption. Depending on which portion of the + database has been corrupted, the error may be raised when the database + is opened, or later by another database operation. By default, + paranoid checking is off so that the database can be used even if + parts of its persistent storage have been corrupted. +

+ If a database is corrupted (perhaps it cannot be opened when + paranoid checking is turned on), the leveldb::RepairDB function + may be used to recover as much of the data as possible, as + sketched below. +

    +

+
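+
+A minimal repair call (using the RepairDB signature from
+include/leveldb/db.h) looks like:
+
+   leveldb::Options options;
+   leveldb::Status s = leveldb::RepairDB("/tmp/testdb", options);
+   if (!s.ok()) ...  // some data may not have been recovered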

Approximate Sizes

+

+The GetApproximateSizes method can be used to get the approximate +number of bytes of file system space used by one or more key ranges. +

+

+   leveldb::Range ranges[2];
+   ranges[0] = leveldb::Range("a", "c");
+   ranges[1] = leveldb::Range("x", "z");
+   uint64_t sizes[2];
+   leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
+
+The preceding call will set sizes[0] to the approximate number of +bytes of file system space used by the key range [a..c) and +sizes[1] to the approximate number of bytes used by the key range +[x..z). +

+

Environment

+

+All file operations (and other operating system calls) issued by the +leveldb implementation are routed through a leveldb::Env object. +Sophisticated clients may wish to provide their own Env +implementation to get better control. For example, an application may +introduce artificial delays in the file IO paths to limit the impact +of leveldb on other activities in the system. +

+

+  class SlowEnv : public leveldb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  leveldb::Options options;
+  options.env = &env;
+  Status s = leveldb::DB::Open(options, ...);
+
+

Porting

+

+leveldb may be ported to a new platform by providing platform +specific implementations of the types/methods/functions exported by +leveldb/port/port.h. See leveldb/port/port_example.h for more +details. +

+In addition, the new platform may need a new default leveldb::Env +implementation.  See leveldb/util/env_posix.cc for an example. +

Other Information

+ +

+Details about the leveldb implementation may be found in +the following documents: +

+  impl.html          (implementation notes)
+  log_format.txt     (format of a log file)
+  table_format.txt   (format of an immutable Table file)
+
diff --git a/doc/log_format.txt b/doc/log_format.txt
new file mode 100644
index 0000000..3a0414b
--- /dev/null
+++ b/doc/log_format.txt
@@ -0,0 +1,75 @@
+The log file contents are a sequence of 32KB blocks.  The only
+exception is that the tail of the file may contain a partial block.
+
+Each block consists of a sequence of records:
+   block := record* trailer?
+   record :=
+      checksum: uint32     // crc32c of type and data[]
+      length: uint16
+      type: uint8          // One of FULL, FIRST, MIDDLE, LAST
+      data: uint8[length]
+
+A record never starts within the last six bytes of a block (since it
+won't fit).  Any leftover bytes here form the trailer, which must
+consist entirely of zero bytes and must be skipped by readers.
+
+Aside: if exactly seven bytes are left in the current block, and a new
+non-zero length record is added, the writer must emit a FIRST record
+(which contains zero bytes of user data) to fill up the trailing seven
+bytes of the block and then emit all of the user data in subsequent
+blocks.
+
+More types may be added in the future.  Some readers may skip record
+types they do not understand, others may report that some data was
+skipped.
+
+FULL == 1
+FIRST == 2
+MIDDLE == 3
+LAST == 4
+
+The FULL record contains the contents of an entire user record.
+
+FIRST, MIDDLE, LAST are types used for user records that have been
+split into multiple fragments (typically because of block boundaries).
+FIRST is the type of the first fragment of a user record, LAST is the
+type of the last fragment of a user record, and MIDDLE is the type of
+all interior fragments of a user record.
+
+Example: consider a sequence of user records:
+   A: length 1000
+   B: length 97270
+   C: length 8000
+A will be stored as a FULL record in the first block.
+
+B will be split into three fragments: first fragment occupies the rest
+of the first block, second fragment occupies the entirety of the
+second block, and the third fragment occupies a prefix of the third
+block.  This will leave six bytes free in the third block, which will
+be left empty as the trailer.
+
+C will be stored as a FULL record in the fourth block.
+
+===================
+
+Some benefits over the recordio format:
+
+(1) We do not need any heuristics for resyncing - just go to next
+block boundary and scan.  If there is a corruption, skip to the next
+block.  As a side-benefit, we do not get confused when part of the
+contents of one log file are embedded as a record inside another log
+file.
+
+(2) Splitting at approximate boundaries (e.g., for mapreduce) is
+simple: find the next block boundary and skip records until we
+hit a FULL or FIRST record.
+
+(3) We do not need extra buffering for large records.
+
+Some downsides compared to recordio format:
+
+(1) No packing of tiny records.  This could be fixed by adding a new
+record type, so it is a shortcoming of the current implementation,
+not necessarily the format.
+
+(2) No compression.  Again, this could be fixed by adding new record types.
diff --git a/doc/table_format.txt b/doc/table_format.txt
new file mode 100644
index 0000000..ad5aa4b
--- /dev/null
+++ b/doc/table_format.txt
@@ -0,0 +1,61 @@
+File format
+===========
+
+  <beginning_of_file>
+  [data block 1]
+  [data block 2]
+  ...
+  [data block N]
+  [meta block 1]
+  ...
+  [meta block K]
+  [metaindex block]
+  [index block]
+  [Footer]        (fixed size; starts at file_size - sizeof(Footer))
+  <end_of_file>
+
+The file contains internal pointers.
+Each such pointer is called a BlockHandle and contains the
+following information:
+  offset:   varint64
+  size:     varint64
+
+(1) The sequence of key/value pairs in the file are stored in sorted
+order and partitioned into a sequence of data blocks.  These blocks
+come one after another at the beginning of the file.  Each data block
+is formatted according to the code in block_builder.cc, and then
+optionally compressed.
+
+(2) After the data blocks we store a bunch of meta blocks.  The
+supported meta block types are described below.  More meta block types
+may be added in the future.  Each meta block is again formatted using
+block_builder.cc and then optionally compressed.
+
+(3) A "metaindex" block.  It contains one entry for every other meta
+block where the key is the name of the meta block and the value is a
+BlockHandle pointing to that meta block.
+
+(4) An "index" block.  This block contains one entry per data block,
+where the key is a string >= last key in that data block and before
+the first key in the successive data block.  The value is the
+BlockHandle for the data block.
+
+(5) At the very end of the file is a fixed length footer that contains
+the BlockHandle of the metaindex and index blocks as well as a magic number.
+       metaindex_handle: char[p];      // Block handle for metaindex
+       index_handle:     char[q];      // Block handle for index
+       padding:          char[40-p-q]; // 0 bytes to make fixed length
+                                       // (40==2*BlockHandle::kMaxEncodedLength)
+       magic:            fixed64;      // == 0xdb4775248b80fb57
+
+"stats" Meta Block
+------------------
+
+This meta block contains a bunch of stats.  The key is the name
+of the statistic.  The value contains the statistic.
+TODO(postrelease): record following stats.
+  data size
+  index size
+  key size (uncompressed)
+  value size (uncompressed)
+  number of entries
+  number of data blocks
diff --git a/include/leveldb/cache.h b/include/leveldb/cache.h
new file mode 100644
index 0000000..79196d1
--- /dev/null
+++ b/include/leveldb/cache.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_
+#define STORAGE_LEVELDB_INCLUDE_CACHE_H_
+
+#include <stdint.h>
+#include "leveldb/slice.h"
+
+namespace leveldb {
+
+class Cache;
+
+// Create a new cache with a fixed size capacity.  This implementation
+// of Cache uses a least-recently-used eviction policy.
+extern Cache* NewLRUCache(size_t capacity);
+
+class Cache {
+ public:
+  Cache() { }
+
+  // Destroys all existing entries by calling the "deleter"
+  // function that was passed to the constructor.
+  virtual ~Cache();
+
+  // Opaque handle to an entry stored in the cache.
+  struct Handle { };
+
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity.
+  //
+  // Returns a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) = 0;
+
+  // If the cache has no mapping for "key", returns NULL.
+  //
+  // Else return a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  virtual Handle* Lookup(const Slice& key) = 0;
+
+  // Release a mapping returned by a previous Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void Release(Handle* handle) = 0;
+
+  // Return the value encapsulated in a handle returned by a
+  // successful Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void* Value(Handle* handle) = 0;
+
+  // If the cache contains an entry for "key", erase it.  Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+
+  // Return a new numeric id.  May be used by multiple clients who are
+  // sharing the same cache to partition the key space.  Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+ private:
+  void LRU_Remove(Handle* e);
+  void LRU_Append(Handle* e);
+  void Unref(Handle* e);
+
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  Cache(const Cache&);
+  void operator=(const Cache&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_CACHE_H_
diff --git a/include/leveldb/comparator.h b/include/leveldb/comparator.h
new file mode 100644
index 0000000..4e00e4d
--- /dev/null
+++ b/include/leveldb/comparator.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
+#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
+
+#include <string>
+
+namespace leveldb {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database.
+class Comparator {
+ public:
+  virtual ~Comparator();
+
+  // Three-way comparison.  Returns value:
+  //   < 0 iff "a" < "b",
+  //   == 0 iff "a" == "b",
+  //   > 0 iff "a" > "b"
+  virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+  // The name of the comparator.  Used to check for comparator
+  // mismatches (i.e., a DB created with one comparator is
+  // accessed using a different comparator).
+  //
+  // The client of this package should switch to a new name whenever
+  // the comparator implementation changes in a way that will cause
+  // the relative ordering of any two keys to change.
+  //
+  // Names starting with "leveldb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Advanced functions: these are used to reduce the space requirements
+  // for internal data structures like index blocks.
+
+  // If *start < limit, changes *start to a short string in [start,limit).
+  // Simple comparator implementations may return with *start unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const = 0;
+
+  // Changes *key to a short string >= *key.
+  // Simple comparator implementations may return with *key unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortSuccessor(std::string* key) const = 0;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering.  The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
diff --git a/include/leveldb/db.h b/include/leveldb/db.h
new file mode 100644
index 0000000..f18ded3
--- /dev/null
+++ b/include/leveldb/db.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_
+#define STORAGE_LEVELDB_INCLUDE_DB_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include "leveldb/iterator.h"
+#include "leveldb/options.h"
+
+namespace leveldb {
+
+static const int kMajorVersion = 1;
+static const int kMinorVersion = 1;
+
+struct Options;
+struct ReadOptions;
+struct WriteOptions;
+
+class Snapshot;
+class WriteBatch;
+
+// Some internal types.  Clients should ignore.
+class WriteBatchInternal;
+
+struct Range {
+  Slice start;
+  Slice limit;
+
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
+};
+
+// A DB is a persistent ordered map from keys to values.
+class DB {
+ public:
+  // Open the database with the specified "name".
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores NULL in *dbptr and returns a non-OK status on error.
+  // Caller should delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options,
+                     const std::string& name,
+                     DB** dbptr);
+
+  DB() { }
+  virtual ~DB();
+
+  // Set the database entry for "key" to "value".  Returns OK on success,
+  // and a non-OK status on error.
+  // Note: consider setting options.sync = true.
+  virtual Status Put(const WriteOptions& options,
+                     const Slice& key,
+                     const Slice& value) = 0;
+
+  // Remove the database entry (if any) for "key".  Returns OK on
+  // success, and a non-OK status on error.  It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
+
+  // Apply the specified updates to the database.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = true.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+  // If the database contains an entry for "key" store the
+  // corresponding value in *value and return OK.
+  //
+  // If there is no entry for "key" leave *value unchanged and return
+  // a status for which Status::IsNotFound() returns true.
+  //
+  // May return some other Status on an error.
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key, std::string* value) = 0;
+
+  // Return a heap-allocated iterator over the contents of the database.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // Caller should delete the iterator when it is no longer needed.
+  // The returned iterator should be deleted before this db is deleted.
+  virtual Iterator* NewIterator(const ReadOptions& options) = 0;
+
+  // Return a handle to the current DB state.  Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state.  The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot.  The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+  // DB implementations can export properties about their state
+  // via this method.  If "property" is a valid property understood by this
+  // DB implementation, fills "*value" with its current value and returns
+  // true.  Otherwise returns false.
+  //
+  // Valid property names include:
+  //
+  //  "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
+  //     where <N> is an ASCII representation of a level number (e.g. "0").
+  //  "leveldb.stats" - returns a multi-line string that describes statistics
+  //     about the internal operation of the DB.
+  virtual bool GetProperty(const Slice& property, std::string* value) = 0;
+
+  // For each i in [0,n-1], store in "sizes[i]", the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)".
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  //
+  // The results may not include the sizes of recently written data.
+  virtual void GetApproximateSizes(const Range* range, int n,
+                                   uint64_t* sizes) = 0;
+
+  // Possible extensions:
+  // (1) Add a method to compact a range of keys
+
+ private:
+  // No copying allowed
+  DB(const DB&);
+  void operator=(const DB&);
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options);
+
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+Status RepairDB(const std::string& dbname, const Options& options);
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_DB_H_
diff --git a/include/leveldb/env.h b/include/leveldb/env.h
new file mode 100644
index 0000000..4b6e712
--- /dev/null
+++ b/include/leveldb/env.h
@@ -0,0 +1,290 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the leveldb implementation to access
+// operating system functionality like the filesystem etc.  Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grain control; e.g., to rate limit file system operations.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
+#define STORAGE_LEVELDB_INCLUDE_ENV_H_
+
+#include <cstdarg>
+#include <string>
+#include <vector>
+#include <stdint.h>
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class FileLock;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+
+class Env {
+ public:
+  Env() { }
+  virtual ~Env();
+
+  // Return a default environment suitable for the current operating
+  // system.  Sophisticated users may wish to provide their own Env
+  // implementation instead of relying on this default environment.
+  //
+  // The result of Default() belongs to leveldb and must never be deleted.
+  static Env* Default();
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores NULL in *result and returns non-OK.  If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   SequentialFile** result) = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores NULL in *result and
+  // returns non-OK.  If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     RandomAccessFile** result) = 0;
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores NULL in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result) = 0;
+
+  // Returns true iff the named file exists.
+  virtual bool FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *result are dropped.
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Create the specified directory.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores NULL in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.  If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure.  I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual Status UnlockFile(FileLock* lock) = 0;
+
+  // Arrange to run "(*function)(arg)" once in a background thread.
+  //
+  // "function" may run in an unspecified thread.  Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  virtual void Schedule(
+      void (*function)(void* arg),
+      void* arg) = 0;
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Write an entry to the log file with the specified format.
+  virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0;
+
+  // Returns the number of micro-seconds since some fixed point in time. Only
+  // useful for computing deltas of time.
+  virtual uint64_t NowMicros() = 0;
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+ private:
+  // No copying allowed
+  Env(const Env&);
+  void operator=(const Env&);
+};
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() { }
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() { }
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  If an error was encountered, returns a
+  // non-OK status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+  WritableFile() { }
+  virtual ~WritableFile();
+
+  virtual Status Append(const Slice& data) = 0;
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0;
+
+ private:
+  // No copying allowed
+  WritableFile(const WritableFile&);
+  void operator=(const WritableFile&);
+};
+
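+// A typical write sequence against these abstractions, sketched for
+// illustration only ("LOG" is a hypothetical file name; error handling
+// is abbreviated):
+//
+//   WritableFile* file;
+//   Status s = env->NewWritableFile("LOG", &file);
+//   if (s.ok()) s = file->Append(data);
+//   if (s.ok()) s = file->Sync();    // make the append durable
+//   if (s.ok()) s = file->Close();
+//   delete file;
+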
+// Identifies a locked file.
+class FileLock {
+ public:
+  FileLock() { }
+  virtual ~FileLock();
+ private:
+  // No copying allowed
+  FileLock(const FileLock&);
+  void operator=(const FileLock&);
+};
+
+// Log the specified data to *info_log if info_log is non-NULL.
+extern void Log(Env* env, WritableFile* info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 3, 4)))
+#   endif
+    ;
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+                                const std::string& fname);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+                               std::string* data);
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+  // Initialize an EnvWrapper that delegates all calls to *target
+  explicit EnvWrapper(Env* target) : target_(target) { }
+  virtual ~EnvWrapper();
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_; }
+
+  // The following text is boilerplate that forwards all methods to target()
+  Status NewSequentialFile(const std::string& f, SequentialFile** r) {
+    return target_->NewSequentialFile(f, r);
+  }
+  Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
+    return target_->NewRandomAccessFile(f, r);
+  }
+  Status NewWritableFile(const std::string& f, WritableFile** r) {
+    return target_->NewWritableFile(f, r);
+  }
+  bool FileExists(const std::string& f) { return target_->FileExists(f); }
+  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+    return target_->GetChildren(dir, r);
+  }
+  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
+  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
+  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
+  Status GetFileSize(const std::string& f, uint64_t* s) {
+    return target_->GetFileSize(f, s);
+  }
+  Status RenameFile(const std::string& s, const std::string& t) {
+    return target_->RenameFile(s, t);
+  }
+  Status LockFile(const std::string& f, FileLock** l) {
+    return target_->LockFile(f, l);
+  }
+  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
+  void Schedule(void (*f)(void*), void* a) {
+    return target_->Schedule(f, a);
+  }
+  void StartThread(void (*f)(void*), void* a) {
+    return target_->StartThread(f, a);
+  }
+  virtual Status GetTestDirectory(std::string* path) {
+    return target_->GetTestDirectory(path);
+  }
+  virtual void Logv(WritableFile* log, const char* format, va_list ap) {
+    return target_->Logv(log, format, ap);
+  }
+  uint64_t NowMicros() {
+    return target_->NowMicros();
+  }
+  void SleepForMicroseconds(int micros) {
+    target_->SleepForMicroseconds(micros);
+  }
+ private:
+  Env* target_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_ENV_H_
diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h
new file mode 100644
index 0000000..1866fb5
--- /dev/null
+++ b/include/leveldb/iterator.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface.  Multiple implementations
+// are provided by this library.  In particular, iterators are provided
+// to access the contents of a Table or a DB.
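+//
+// A typical forward scan over a DB, sketched for illustration only
+// (Process is a hypothetical callback):
+//
+//   Iterator* it = db->NewIterator(ReadOptions());
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     Process(it->key(), it->value());
+//   }
+//   Status s = it->status();  // check for errors found during the scan
+//   delete it;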
+
+#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
+#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
+
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class Iterator {
+ public:
+  Iterator();
+  virtual ~Iterator();
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid.  This method returns true iff the iterator is valid.
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source.  The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source.  The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice value() const = 0;
+
+  // If an error has occurred, return it.  Else return an ok status.
+  virtual Status status() const = 0;
+
+  // Clients are allowed to register function/arg1/arg2 triples that
+  // will be invoked when this iterator is destroyed.
+  //
+  // Note that unlike all of the preceding methods, this method is
+  // not abstract and therefore clients should not override it.
+  typedef void (*CleanupFunction)(void* arg1, void* arg2);
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+  struct Cleanup {
+    CleanupFunction function;
+    void* arg1;
+    void* arg2;
+    Cleanup* next;
+  };
+  Cleanup cleanup_;
+
+  // No copying allowed
+  Iterator(const Iterator&);
+  void operator=(const Iterator&);
+};
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
diff --git a/include/leveldb/options.h b/include/leveldb/options.h
new file mode 100644
index 0000000..87d388e
--- /dev/null
+++ b/include/leveldb/options.h
@@ -0,0 +1,208 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
+#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
+
+#include <stddef.h>
+
+namespace leveldb {
+
+class Cache;
+class Comparator;
+class Env;
+class Snapshot;
+class WritableFile;
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs.  Each block may be compressed before
+// being stored in a file.  The following enum describes which
+// compression method (if any) is used to compress a block.
+enum CompressionType {
+  // NOTE: do not change the values of existing entries, as these are
+  // part of the persistent format on disk.
+  kNoCompression     = 0x0,
+  kSnappyCompression = 0x1,
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options {
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator;
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists;
+
+  // If true, the implementation will do aggressive checking of the
+  // data it is processing and will stop early if it detects any
+  // errors.  This may have unforeseen ramifications: for example, a
+  // corruption of one DB entry may cause a large number of entries to
+  // become unreadable or for the entire DB to become unopenable.
+  // Default: false
+  bool paranoid_checks;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc.
+  // Default: Env::Default()
+  Env* env;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-NULL, or to a file stored
+  // in the same directory as the DB contents if info_log is NULL.
+  // Default: NULL
+  WritableFile* info_log;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to two write buffers may be held in memory at the same time,
+  // so you may wish to adjust this parameter to control memory usage.
+  //
+  // Default: 4MB
+  size_t write_buffer_size;
+
+  // Number of open files that can be used by the DB.  You may need to
+  // increase this if your database has a large working set (budget
+  // one open file per 2MB of working set).
+  //
+  // Default: 1000
+  int max_open_files;
+
+  // Handle values larger than "large_value_threshold" bytes
+  // specially, by writing them into their own files (to avoid
+  // compaction overhead) and doing content-based elimination of
+  // duplicate values to save space.
+  //
+  // We recommend against changing this value.
+  //
+  // Default: 64K
+  size_t large_value_threshold;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // If non-NULL, use the specified cache for blocks.
+  // If NULL, leveldb will automatically create and use an 8MB internal cache.
+  // Default: NULL
+  Cache* block_cache;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  //
+  // Default: 4K
+  int block_size;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.
+  //
+  // Default: 16
+  int block_restart_interval;
+
+  // Compress blocks using the specified compression algorithm.  This
+  // parameter can be changed dynamically.
+  //
+  // Default: kSnappyCompression, which gives lightweight but fast
+  // compression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression.  Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  CompressionType compression;
+
+  // Create an Options object with default values for all fields.
+  Options();
+};
+
+// Options that control read operations
+struct ReadOptions {
+  // If true, all data read from underlying storage will be
+  // verified against corresponding checksums.
+  // Default: false
+  bool verify_checksums;
+
+  // Should the data read for this iteration be cached in memory?
+  // Callers may wish to set this field to false for bulk scans.
+  // Default: true
+  bool fill_cache;
+
+  // If "snapshot" is non-NULL, read as of the supplied snapshot
+  // (which must belong to the DB that is being read and which must
+  // not have been released).  If "snapshot" is NULL, use an implicit
+  // snapshot of the state at the beginning of this read operation.
+  // Default: NULL
+  const Snapshot* snapshot;
+
+  ReadOptions()
+      : verify_checksums(false),
+        fill_cache(true),
+        snapshot(NULL) {
+  }
+};
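+
+// Sketch of a consistent read using an explicit snapshot (illustrative
+// only; GetSnapshot/ReleaseSnapshot are declared in db.h):
+//
+//   const Snapshot* snap = db->GetSnapshot();
+//   ReadOptions opts;
+//   opts.snapshot = snap;
+//   std::string value;
+//   Status s = db->Get(opts, "key", &value);  // sees state as of snap
+//   db->ReleaseSnapshot(snap);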
+
+// Options that control write operations
+struct WriteOptions {
+  // If true, the write will be flushed from the operating system
+  // buffer cache (by calling WritableFile::Sync()) before the write
+  // is considered complete.  If this flag is true, writes will be
+  // slower.
+  //
+  // If this flag is false, and the machine crashes, some recent
+  // writes may be lost.  Note that if it is just the process that
+  // crashes (i.e., the machine does not reboot), no writes will be
+  // lost even if sync==false.
+  //
+  // In other words, a DB write with sync==false has similar
+  // crash semantics as the "write()" system call.  A DB write
+  // with sync==true has similar crash semantics to a "write()"
+  // system call followed by "fsync()".
+  //
+  // Default: false
+  bool sync;
+
+  // If "post_write_snapshot" is non-NULL, and the write succeeds,
+  // *post_write_snapshot will be modified to point to a snapshot of
+  // the DB state immediately after this write.  The caller must call
+  // DB::ReleaseSnapshot(*post_write_snapshot) when the
+  // snapshot is no longer needed.
+  //
+  // If "post_write_snapshot" is non-NULL, and the write fails,
+  // *post_write_snapshot will be set to NULL.
+  //
+  // Default: NULL
+  const Snapshot** post_write_snapshot;
+
+  WriteOptions()
+      : sync(false),
+        post_write_snapshot(NULL) {
+  }
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h
new file mode 100644
index 0000000..62cb894
--- /dev/null
+++ b/include/leveldb/slice.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size.  The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
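+//
+// For illustration, a Slice is cheap to construct and copy because it
+// does not own its bytes (sketch only; note that "backing" must outlive
+// both slices):
+//
+//   std::string backing = "hello";
+//   Slice s1(backing);       // points into backing, size 5
+//   Slice s2 = s1;           // copies only the pointer and length
+//   s2.remove_prefix(1);     // s2 == "ello"; backing is unchanged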
+
+#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
+#define STORAGE_LEVELDB_INCLUDE_SLICE_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <string>
+
+namespace leveldb {
+
+class Slice {
+ public:
+  // Create an empty slice.
+  Slice() : data_(""), size_(0) { }
+
+  // Create a slice that refers to data[0,n-1].
+  Slice(const char* data, size_t n) : data_(data), size_(n) { }
+
+  // Create a slice that refers to the contents of "s"
+  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
+
+  // Create a slice that refers to s[0,strlen(s)-1]
+  Slice(const char* s) : data_(s), size_(strlen(s)) { }
+
+  // Return a pointer to the beginning of the referenced data
+  const char* data() const { return data_; }
+
+  // Return the length (in bytes) of the referenced data
+  size_t size() const { return size_; }
+
+  // Return true iff the length of the referenced data is zero
+  bool empty() const { return size_ == 0; }
+
+  // Return the ith byte in the referenced data.
+  // REQUIRES: n < size()
+  char operator[](size_t n) const {
+    assert(n < size());
+    return data_[n];
+  }
+
+  // Change this slice to refer to an empty array
+  void clear() { data_ = ""; size_ = 0; }
+
+  // Drop the first "n" bytes from this slice.
+  void remove_prefix(size_t n) {
+    assert(n <= size());
+    data_ += n;
+    size_ -= n;
+  }
+
+  // Return a string that contains the copy of the referenced data.
+  std::string ToString() const { return std::string(data_, size_); }
+
+  // Three-way comparison.  Returns value:
+  //   <  0 iff "*this" <  "b",
+  //   == 0 iff "*this" == "b",
+  //   >  0 iff "*this" >  "b"
+  int compare(const Slice& b) const;
+
+  // Return true iff "x" is a prefix of "*this"
+  bool starts_with(const Slice& x) const {
+    return ((size_ >= x.size_) &&
+            (memcmp(data_, x.data_, x.size_) == 0));
+  }
+
+ private:
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+inline bool operator==(const Slice& x, const Slice& y) {
+  return ((x.size() == y.size()) &&
+          (memcmp(x.data(), y.data(), x.size()) == 0));
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) {
+  return !(x == y);
+}
+
+inline int Slice::compare(const Slice& b) const {
+  const int min_len = (size_ < b.size_) ? size_ : b.size_;
+  int r = memcmp(data_, b.data_, min_len);
+  if (r == 0) {
+    if (size_ < b.size_) r = -1;
+    else if (size_ > b.size_) r = +1;
+  }
+  return r;
+}
+
+}
+
+
+#endif  // STORAGE_LEVELDB_INCLUDE_SLICE_H_
diff --git a/include/leveldb/status.h b/include/leveldb/status.h
new file mode 100644
index 0000000..47e3edf
--- /dev/null
+++ b/include/leveldb/status.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation.  It may indicate success,
+// or it may indicate an error with an associated error message.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
+#define STORAGE_LEVELDB_INCLUDE_STATUS_H_
+
+#include <string>
+#include <utility>
+#include "leveldb/slice.h"
+
+namespace leveldb {
+
+class Status {
+ public:
+  // Create a success status.
+  Status() : state_(NULL) { }
+  ~Status() { delete state_; }
+
+  // Copy the specified status.
+  Status(const Status& s);
+  void operator=(const Status& s);
+
+  // Return a success status.
+  static Status OK() { return Status(); }
+
+  // Return error status of an appropriate type.
+  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotFound, msg, Slice());
+  }
+  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kCorruption, msg, msg2);
+  }
+  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotSupported, msg, msg2);
+  }
+  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kInvalidArgument, msg, msg2);
+  }
+  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, msg, msg2);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const { return (state_ == NULL); }
+
+  // Returns true iff the status indicates a NotFound error.
+  bool IsNotFound() const { return code() == kNotFound; }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  std::string ToString() const;
+
+ private:
+  enum Code {
+    kOk = 0,
+    kNotFound = 1,
+    kCorruption = 2,
+    kNotSupported = 3,
+    kInvalidArgument = 4,
+    kIOError = 5,
+  };
+  Code code() const { return (state_ == NULL) ? kOk : state_->first; }
+
+  Status(Code code, const Slice& msg, const Slice& msg2);
+
+  typedef std::pair<Code, std::string> State;
+  State* state_;
+};
+
+inline Status::Status(const Status& s) {
+  state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
+}
+inline void Status::operator=(const Status& s) {
+  if (this != &s) {
+    delete state_;
+    state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
+  }
+}
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_STATUS_H_
diff --git a/include/leveldb/table.h b/include/leveldb/table.h
new file mode 100644
index 0000000..bd99176
--- /dev/null
+++ b/include/leveldb/table.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_
+#define STORAGE_LEVELDB_INCLUDE_TABLE_H_
+
+#include <stdint.h>
+#include "leveldb/iterator.h"
+
+namespace leveldb {
+
+class Block;
+class BlockHandle;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+
+// A Table is a sorted map from strings to strings.  Tables are
+// immutable and persistent.
+class Table {
+ public:
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table" to the newly opened
+  // table.  The client should delete "*table" when no longer needed.
+  // If there was an error while initializing the table, sets "*table"
+  // to NULL and returns a non-ok status.
+  // Does not take ownership of "*source", but the client must ensure
+  // that "source" remains live for the duration of the returned table's
+  // lifetime.
+  //
+  // *file must remain live while this Table is in use.
+  static Status Open(const Options& options,
+                     RandomAccessFile* file,
+                     uint64_t file_size,
+                     Table** table);
+
+  ~Table();
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  Iterator* NewIterator(const ReadOptions&) const;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file).  The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  uint64_t ApproximateOffsetOf(const Slice& key) const;
+
+ private:
+  struct Rep;
+  Rep* rep_;
+
+  explicit Table(Rep* rep) { rep_ = rep; }
+  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);
+
+  // No copying allowed
+  Table(const Table&);
+  void operator=(const Table&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_TABLE_H_
diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h
new file mode 100644
index 0000000..49d2d51
--- /dev/null
+++ b/include/leveldb/table_builder.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// TableBuilder provides the interface used to build a Table
+// (an immutable and sorted map from keys to values).
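+//
+// Sketch of typical use, for illustration only (error checks elided;
+// "file" is assumed to come from something like Env::NewWritableFile):
+//
+//   TableBuilder builder(options, file);
+//   builder.Add("apple", "red");       // keys must arrive in sorted order
+//   builder.Add("banana", "yellow");
+//   Status s = builder.Finish();       // then close the file yourself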
+
+#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
+#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
+
+#include <stdint.h>
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+
+class TableBuilder {
+ public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish().
+  TableBuilder(const Options& options, WritableFile* file);
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~TableBuilder();
+
+  // Change the options used by this builder.  Note: only some of the
+  // option fields can be changed after construction.  If a field is
+  // not allowed to change dynamically and its value in the structure
+  // passed to the constructor is different from its value in the
+  // structure passed to this method, this method will return an error
+  // without changing any fields.
+  Status ChangeOptions(const Options& options);
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value);
+
+  // Advanced operation: flush any buffered key/value pairs to file.
+  // Can be used to ensure that two adjacent entries never live in
+  // the same data block.  Most clients should not need to use this method.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Flush();
+
+  // Return non-ok iff some error has been detected.
+  Status status() const;
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish();
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon();
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const;
+
+ private:
+  bool ok() const { return status().ok(); }
+  void WriteBlock(BlockBuilder* block, BlockHandle* handle);
+
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  TableBuilder(const TableBuilder&);
+  void operator=(const TableBuilder&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
diff --git a/include/leveldb/write_batch.h b/include/leveldb/write_batch.h
new file mode 100644
index 0000000..3411952
--- /dev/null
+++ b/include/leveldb/write_batch.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch.  For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+//    batch.Put("key", "v1");
+//    batch.Delete("key");
+//    batch.Put("key", "v2");
+//    batch.Put("key", "v3");
+
+#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
+#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
+
+#include <string>
+
+namespace leveldb {
+
+class Slice;
+
+class WriteBatch {
+ public:
+  WriteBatch();
+  ~WriteBatch();
+
+  // Store the mapping "key->value" in the database.
+  void Put(const Slice& key, const Slice& value);
+
+  // If the database contains a mapping for "key", erase it.  Else do nothing.
+  void Delete(const Slice& key);
+
+  // Clear all updates buffered in this batch.
+  void Clear();
+
+ private:
+  friend class WriteBatchInternal;
+
+  std::string rep_;  // See comment in write_batch.cc for the format of rep_
+
+  // Intentionally copyable
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
diff --git a/leveldb.gyp b/leveldb.gyp
new file mode 100644
index 0000000..d10ac33
--- /dev/null
+++ b/leveldb.gyp
@@ -0,0 +1,327 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+{
+  'variables': {
+    'use_snappy%': 0,
+  },
+  'target_defaults': {
+    'defines': [
+      'LEVELDB_PLATFORM_CHROMIUM=1',
+    ],
+    'include_dirs': [
+      '.',
+      'include/',
+    ],
+    'conditions': [
+      ['OS == "win"', {
+        'include_dirs': [
+          'port/win',
+        ],
+      }],
+      ['use_snappy', {
+        'defines': [
+          'USE_SNAPPY=1',
+        ],
+      }],
+    ],
+  },
+  'targets': [
+    {
+      'target_name': 'leveldb',
+      'type': '<(library)',
+      'dependencies': [
+        # The base library is a lightweight abstraction layer for things like
+        # threads and IO.
http://src.chromium.org/viewvc/chrome/trunk/src/base/ + '../../base/base.gyp:base', + ], + 'conditions': [ + ['use_snappy', { + 'dependencies': [ + '../../third_party/snappy/snappy.gyp:snappy', + ], + }], + ], + 'direct_dependent_settings': { + 'include_dirs': [ + 'include/', + ], + }, + 'sources': [ + # Include and then exclude so that all files show up in IDEs, even if + # they don't build. + 'db/builder.cc', + 'db/builder.h', + 'db/db_impl.cc', + 'db/db_impl.h', + 'db/db_iter.cc', + 'db/db_iter.h', + 'db/filename.cc', + 'db/filename.h', + 'db/dbformat.cc', + 'db/dbformat.h', + 'db/log_format.h', + 'db/log_reader.cc', + 'db/log_reader.h', + 'db/log_writer.cc', + 'db/log_writer.h', + 'db/memtable.cc', + 'db/memtable.h', + 'db/repair.cc', + 'db/skiplist.h', + 'db/snapshot.h', + 'db/table_cache.cc', + 'db/table_cache.h', + 'db/version_edit.cc', + 'db/version_edit.h', + 'db/version_set.cc', + 'db/version_set.h', + 'db/write_batch.cc', + 'db/write_batch_internal.h', + 'include/leveldb/cache.h', + 'include/leveldb/comparator.h', + 'include/leveldb/db.h', + 'include/leveldb/env.h', + 'include/leveldb/iterator.h', + 'include/leveldb/options.h', + 'include/leveldb/slice.h', + 'include/leveldb/status.h', + 'include/leveldb/table.h', + 'include/leveldb/table_builder.h', + 'include/leveldb/write_batch.h', + 'port/port.h', + 'port/port_chromium.cc', + 'port/port_chromium.h', + 'port/port_example.h', + 'port/port_posix.cc', + 'port/port_posix.h', + 'port/sha1_portable.cc', + 'port/sha1_portable.h', + 'table/block.cc', + 'table/block.h', + 'table/block_builder.cc', + 'table/block_builder.h', + 'table/format.cc', + 'table/format.h', + 'table/iterator.cc', + 'table/iterator_wrapper.h', + 'table/merger.cc', + 'table/merger.h', + 'table/table.cc', + 'table/table_builder.cc', + 'table/two_level_iterator.cc', + 'table/two_level_iterator.h', + 'util/arena.cc', + 'util/arena.h', + 'util/cache.cc', + 'util/coding.cc', + 'util/coding.h', + 'util/comparator.cc', + 'util/crc32c.cc', + 'util/crc32c.h', + 'util/env.cc', + 'util/env_chromium.cc', + 'util/env_posix.cc', + 'util/hash.cc', + 'util/hash.h', + 'util/logging.cc', + 'util/logging.h', + 'util/mutexlock.h', + 'util/options.cc', + 'util/random.h', + 'util/status.cc', + ], + 'sources/': [ + ['exclude', '_(android|example|portable|posix)\\.cc$'], + ], + }, + { + 'target_name': 'leveldb_testutil', + 'type': '<(library)', + 'dependencies': [ + '../../base/base.gyp:base', + 'leveldb', + ], + 'export_dependent_settings': [ + # The tests use include directories from these projects. 
+ '../../base/base.gyp:base', + 'leveldb', + ], + 'sources': [ + 'util/histogram.cc', + 'util/histogram.h', + 'util/testharness.cc', + 'util/testharness.h', + 'util/testutil.cc', + 'util/testutil.h', + ], + }, + { + 'target_name': 'leveldb_arena_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/arena_test.cc', + ], + }, + { + 'target_name': 'leveldb_cache_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/cache_test.cc', + ], + }, + { + 'target_name': 'leveldb_coding_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/coding_test.cc', + ], + }, + { + 'target_name': 'leveldb_corruption_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/corruption_test.cc', + ], + }, + { + 'target_name': 'leveldb_crc32c_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/crc32c_test.cc', + ], + }, + { + 'target_name': 'leveldb_db_bench', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/db_bench.cc', + ], + }, + { + 'target_name': 'leveldb_db_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/db_test.cc', + ], + }, + { + 'target_name': 'leveldb_dbformat_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/dbformat_test.cc', + ], + }, + { + 'target_name': 'leveldb_env_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'util/env_test.cc', + ], + }, + { + 'target_name': 'leveldb_filename_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/filename_test.cc', + ], + }, + { + 'target_name': 'leveldb_log_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/log_test.cc', + ], + }, + { + 'target_name': 'leveldb_sha1_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'port/sha1_test.cc', + ], + }, + { + 'target_name': 'leveldb_skiplist_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/skiplist_test.cc', + ], + }, + { + 'target_name': 'leveldb_table_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'table/table_test.cc', + ], + }, + { + 'target_name': 'leveldb_version_edit_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/version_edit_test.cc', + ], + }, + { + 'target_name': 'leveldb_write_batch_test', + 'type': 'executable', + 'dependencies': [ + 'leveldb_testutil', + ], + 'sources': [ + 'db/write_batch_test.cc', + ], + }, + ], +} + +# Local Variables: +# tab-width:2 +# indent-tabs-mode:nil +# End: +# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/leveldb/AUTHORS b/leveldb/AUTHORS deleted file mode 100644 index 27a9407..0000000 --- a/leveldb/AUTHORS +++ /dev/null @@ -1,8 +0,0 @@ -# Names should be added to this file like so: -# Name or Organization - -Google Inc. - -# Initial version authors: -Jeffrey Dean -Sanjay Ghemawat diff --git a/leveldb/LICENSE b/leveldb/LICENSE deleted file mode 100644 index 8e80208..0000000 --- a/leveldb/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/leveldb/Makefile b/leveldb/Makefile deleted file mode 100644 index 43ac23d..0000000 --- a/leveldb/Makefile +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2011 The LevelDB Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. See the AUTHORS file for names of contributors. - -CC = g++ - -# Uncomment one of the following to switch between debug and opt mode -#OPT = -O2 -DNDEBUG -OPT = -g2 - -CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. 
-I./include -std=c++0x $(OPT) - -LDFLAGS=-lpthread - -LIBOBJECTS = \ - ./db/builder.o \ - ./db/db_impl.o \ - ./db/db_iter.o \ - ./db/filename.o \ - ./db/dbformat.o \ - ./db/log_reader.o \ - ./db/log_writer.o \ - ./db/memtable.o \ - ./db/repair.o \ - ./db/table_cache.o \ - ./db/version_edit.o \ - ./db/version_set.o \ - ./db/write_batch.o \ - ./port/port_posix.o \ - ./table/block.o \ - ./table/block_builder.o \ - ./table/format.o \ - ./table/iterator.o \ - ./table/merger.o \ - ./table/table.o \ - ./table/table_builder.o \ - ./table/two_level_iterator.o \ - ./util/arena.o \ - ./util/cache.o \ - ./util/coding.o \ - ./util/comparator.o \ - ./util/crc32c.o \ - ./util/env.o \ - ./util/env_posix.o \ - ./util/hash.o \ - ./util/histogram.o \ - ./util/logging.o \ - ./util/options.o \ - ./util/status.o - -TESTUTIL = ./util/testutil.o -TESTHARNESS = ./util/testharness.o $(TESTUTIL) - -TESTS = \ - arena_test \ - cache_test \ - coding_test \ - corruption_test \ - crc32c_test \ - db_test \ - dbformat_test \ - env_test \ - filename_test \ - log_test \ - skiplist_test \ - table_test \ - version_edit_test \ - write_batch_test - -PROGRAMS = db_bench $(TESTS) - -all: $(PROGRAMS) - -check: $(TESTS) - for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done - -clean: - rm -f $(PROGRAMS) */*.o - -db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) - $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ - -arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - -.cc.o: - $(CC) $(CFLAGS) $< -o $@ - -# TODO(gabor): dependencies for .o files -# TODO(gabor): Build library diff --git a/leveldb/README b/leveldb/README deleted file mode 100644 index 3618ade..0000000 --- 
a/leveldb/README +++ /dev/null @@ -1,51 +0,0 @@ -leveldb: A key-value store -Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) - -The code under this directory implements a system for maintaining a -persistent key/value store. - -See doc/index.html for more explanation. -See doc/impl.html for a brief overview of the implementation. - -The public interface is in include/*.h. Callers should not include or -rely on the details of any other header files in this package. Those -internal APIs may be changed without warning. - -Guide to header files: - -include/db.h - Main interface to the DB: Start here - -include/options.h - Control over the behavior of an entire database, and also - control over the behavior of individual reads and writes. - -include/comparator.h - Abstraction for user-specified comparison function. If you want - just bytewise comparison of keys, you can use the default comparator, - but clients can write their own comparator implementations if they - want custom ordering (e.g. to handle different character - encodings, etc.) - -include/iterator.h - Interface for iterating over data. You can get an iterator - from a DB object. - -include/write_batch.h - Interface for atomically applying multiple updates to a database. - -include/slice.h - A simple module for maintaining a pointer and a length into some - other byte array. - -include/status.h - Status is returned from many of the public interfaces and is used - to report success and various kinds of errors. - -include/env.h - Abstraction of the OS environment. A posix implementation of - this interface is in util/env_posix.cc - -include/table.h -include/table_builder.h - Lower-level modules that most clients probably won't use directly diff --git a/leveldb/TODO b/leveldb/TODO deleted file mode 100644 index ce81439..0000000 --- a/leveldb/TODO +++ /dev/null @@ -1,14 +0,0 @@ -ss -- Stats - -db -- Maybe implement DB::BulkDeleteForRange(start_key, end_key) - that would blow away files whose ranges are entirely contained - within [start_key..end_key]? For Chrome, deletion of obsolete - object stores, etc. can be done in the background anyway, so - probably not that important. - -api changes: -- Make it wrappable - -Faster Get implementation diff --git a/leveldb/db/builder.cc b/leveldb/db/builder.cc deleted file mode 100644 index 9f132d7..0000000 --- a/leveldb/db/builder.cc +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "db/builder.h" - -#include "db/filename.h" -#include "db/dbformat.h" -#include "db/table_cache.h" -#include "db/version_edit.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" - -namespace leveldb { - -Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta, - VersionEdit* edit) { - Status s; - meta->file_size = 0; - iter->SeekToFirst(); - - std::string fname = TableFileName(dbname, meta->number); - if (iter->Valid()) { - WritableFile* file; - s = env->NewWritableFile(fname, &file); - if (!s.ok()) { - return s; - } - - TableBuilder* builder = new TableBuilder(options, file); - meta->smallest.DecodeFrom(iter->key()); - for (; iter->Valid(); iter->Next()) { - Slice key = iter->key(); - meta->largest.DecodeFrom(key); - builder->Add(key, iter->value()); - } - - // Finish and check for builder errors - if (s.ok()) { - s = builder->Finish(); - if (s.ok()) { - meta->file_size = builder->FileSize(); - assert(meta->file_size > 0); - } - } else { - builder->Abandon(); - } - delete builder; - - // Finish and check for file errors - if (s.ok()) { - s = file->Sync(); - } - if (s.ok()) { - s = file->Close(); - } - delete file; - file = NULL; - - if (s.ok()) { - // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), - meta->number, - meta->file_size); - s = it->status(); - delete it; - } - } - - // Check for input iterator errors - if (!iter->status().ok()) { - s = iter->status(); - } - - if (s.ok() && meta->file_size > 0) { - edit->AddFile(0, meta->number, meta->file_size, - meta->smallest, meta->largest); - } else { - env->DeleteFile(fname); - } - return s; -} - -} diff --git a/leveldb/db/builder.h b/leveldb/db/builder.h deleted file mode 100644 index 5dd17b6..0000000 --- a/leveldb/db/builder.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_BUILDER_H_ -#define STORAGE_LEVELDB_DB_BUILDER_H_ - -#include "leveldb/status.h" - -namespace leveldb { - -struct Options; -struct FileMetaData; - -class Env; -class Iterator; -class TableCache; -class VersionEdit; - -// Build a Table file from the contents of *iter. The generated file -// will be named according to meta->number. On success, the rest of -// *meta will be filled with metadata about the generated table, and -// the file information will be added to *edit. If no data is present -// in *iter, meta->file_size will be set to zero, and no Table file -// will be produced. -extern Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta, - VersionEdit* edit); - -} - -#endif // STORAGE_LEVELDB_DB_BUILDER_H_ diff --git a/leveldb/db/corruption_test.cc b/leveldb/db/corruption_test.cc deleted file mode 100644 index 12d176e..0000000 --- a/leveldb/db/corruption_test.cc +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "leveldb/db.h" - -#include -#include -#include -#include -#include "leveldb/cache.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "leveldb/write_batch.h" -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/log_format.h" -#include "db/version_set.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -static const int kValueSize = 1000; - -class CorruptionTest { - public: - test::ErrorEnv env_; - Random rnd_; - std::string dbname_; - Cache* tiny_cache_; - Options options_; - DB* db_; - - CorruptionTest() : rnd_(test::RandomSeed()) { - tiny_cache_ = NewLRUCache(100); - options_.env = &env_; - dbname_ = test::TmpDir() + "/db_test"; - DestroyDB(dbname_, options_); - - db_ = NULL; - options_.create_if_missing = true; - Reopen(); - options_.create_if_missing = false; - } - - ~CorruptionTest() { - delete db_; - DestroyDB(dbname_, Options()); - delete tiny_cache_; - } - - Status TryReopen(Options* options = NULL) { - delete db_; - db_ = NULL; - Options opt = (options ? *options : options_); - opt.env = &env_; - opt.block_cache = tiny_cache_; - return DB::Open(opt, dbname_, &db_); - } - - void Reopen(Options* options = NULL) { - ASSERT_OK(TryReopen(options)); - } - - void RepairDB() { - delete db_; - db_ = NULL; - ASSERT_OK(::leveldb::RepairDB(dbname_, options_)); - } - - void Build(int n) { - std::string key_space, value_space; - WriteBatch batch; - for (int i = 0; i < n; i++) { - //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); - Slice key = Key(i, &key_space); - batch.Clear(); - batch.Put(key, Value(i, &value_space)); - ASSERT_OK(db_->Write(WriteOptions(), &batch)); - } - } - - void Check(int min_expected, int max_expected) { - int next_expected = 0; - int missed = 0; - int bad_keys = 0; - int bad_values = 0; - int correct = 0; - std::string value_space; - Iterator* iter = db_->NewIterator(ReadOptions()); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - uint64_t key; - Slice in(iter->key()); - if (!ConsumeDecimalNumber(&in, &key) || - !in.empty() || - key < next_expected) { - bad_keys++; - continue; - } - missed += (key - next_expected); - next_expected = key + 1; - if (iter->value() != Value(key, &value_space)) { - bad_values++; - } else { - correct++; - } - } - delete iter; - - fprintf(stderr, - "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", - min_expected, max_expected, correct, bad_keys, bad_values, missed); - ASSERT_LE(min_expected, correct); - ASSERT_GE(max_expected, correct); - } - - void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { - // Pick file to corrupt - std::vector filenames; - ASSERT_OK(env_.GetChildren(dbname_, &filenames)); - uint64_t number; - FileType type; - std::vector candidates; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) && - type == filetype) { - candidates.push_back(dbname_ + "/" + filenames[i]); - } - } - ASSERT_TRUE(!candidates.empty()) << filetype; - std::string fname = candidates[rnd_.Uniform(candidates.size())]; - - struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - const char* msg = strerror(errno); - ASSERT_TRUE(false) << fname << ": " << msg; - } - - if (offset < 0) { - // Relative to end of file; make it absolute - if (-offset > sbuf.st_size) { - offset = 0; - } else { - offset = sbuf.st_size + offset; - } - } - if (offset > sbuf.st_size) { - offset = sbuf.st_size; - } - if (offset + bytes_to_corrupt > sbuf.st_size) { - bytes_to_corrupt 
= sbuf.st_size - offset; - } - - // Do it - std::string contents; - Status s = ReadFileToString(Env::Default(), fname, &contents); - ASSERT_TRUE(s.ok()) << s.ToString(); - for (int i = 0; i < bytes_to_corrupt; i++) { - contents[i + offset] ^= 0x80; - } - s = WriteStringToFile(Env::Default(), contents, fname); - ASSERT_TRUE(s.ok()) << s.ToString(); - } - - int Property(const std::string& name) { - std::string property; - int result; - if (db_->GetProperty(name, &property) && - sscanf(property.c_str(), "%d", &result) == 1) { - return result; - } else { - return -1; - } - } - - // Return the ith key - Slice Key(int i, std::string* storage) { - char buf[100]; - snprintf(buf, sizeof(buf), "%016d", i); - storage->assign(buf, strlen(buf)); - return Slice(*storage); - } - - // Return the value to associate with the specified key - Slice Value(int k, std::string* storage) { - Random r(k); - return test::RandomString(&r, kValueSize, storage); - } -}; - -TEST(CorruptionTest, Recovery) { - Build(100); - Check(100, 100); - Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record - Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block - Reopen(); - - // The 64 records in the first two log blocks are completely lost. - Check(36, 36); -} - -TEST(CorruptionTest, RecoverWriteError) { - env_.writable_file_error_ = true; - Status s = TryReopen(); - ASSERT_TRUE(!s.ok()); -} - -TEST(CorruptionTest, NewFileErrorDuringWrite) { - // Do enough writing to force minor compaction - env_.writable_file_error_ = true; - const int num = 3 + (Options().write_buffer_size / kValueSize); - std::string value_storage; - Status s; - for (int i = 0; s.ok() && i < num; i++) { - WriteBatch batch; - batch.Put("a", Value(100, &value_storage)); - s = db_->Write(WriteOptions(), &batch); - } - ASSERT_TRUE(!s.ok()); - ASSERT_GE(env_.num_writable_file_errors_, 1); - env_.writable_file_error_ = false; - Reopen(); -} - -TEST(CorruptionTest, TableFile) { - Build(100); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - dbi->TEST_CompactRange(1, "", "~"); - - Corrupt(kTableFile, 100, 1); - Check(99, 99); -} - -TEST(CorruptionTest, TableFileIndexData) { - Build(10000); // Enough to build multiple Tables - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - dbi->TEST_CompactRange(1, "", "~"); - - Corrupt(kTableFile, -2000, 500); - Reopen(); - Check(5000, 9999); -} - -TEST(CorruptionTest, MissingDescriptor) { - Build(1000); - RepairDB(); - Reopen(); - Check(1000, 1000); -} - -TEST(CorruptionTest, SequenceNumberRecovery) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); - RepairDB(); - Reopen(); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v5", v); - // Write something. If sequence number was not recovered properly, - // it will be hidden by an earlier write. 
- ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v6", v); - Reopen(); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v6", v); -} - -TEST(CorruptionTest, CorruptedDescriptor) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - - Corrupt(kDescriptorFile, 0, 1000); - Status s = TryReopen(); - ASSERT_TRUE(!s.ok()); - - RepairDB(); - Reopen(); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("hello", v); -} - -TEST(CorruptionTest, CompactionInputError) { - Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); - - Corrupt(kTableFile, 100, 1); - Check(9, 9); - - // Force compactions by writing lots of values - Build(10000); - Check(10000, 10000); - dbi->TEST_CompactRange(0, "", "~"); - ASSERT_EQ(0, Property("leveldb.num-files-at-level0")); -} - -TEST(CorruptionTest, CompactionInputErrorParanoid) { - Options options; - options.paranoid_checks = true; - options.write_buffer_size = 1048576; - Reopen(&options); - - Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); - - Corrupt(kTableFile, 100, 1); - Check(9, 9); - - // Write must eventually fail because of corrupted table - Status s; - std::string tmp1, tmp2; - for (int i = 0; i < 10000 && s.ok(); i++) { - s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); - } - ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; -} - -TEST(CorruptionTest, UnrelatedKeys) { - Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - Corrupt(kTableFile, 100, 1); - - std::string tmp1, tmp2; - ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); - ASSERT_EQ(Value(1000, &tmp2).ToString(), v); - dbi->TEST_CompactMemTable(); - ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); - ASSERT_EQ(Value(1000, &tmp2).ToString(), v); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/db_bench.cc b/leveldb/db/db_bench.cc deleted file mode 100644 index d1cbdc0..0000000 --- a/leveldb/db/db_bench.cc +++ /dev/null @@ -1,613 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include <sys/types.h> -#include <stdio.h> -#include <stdlib.h> -#include "db/db_impl.h" -#include "db/version_set.h" -#include "leveldb/cache.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/write_batch.h" -#include "port/port.h" -#include "util/crc32c.h" -#include "util/histogram.h" -#include "util/random.h" -#include "util/testutil.h" - -// Comma-separated list of operations to run in the specified order -// Actual benchmarks: -// fillseq -- write N values in sequential key order in async mode -// fillrandom -- write N values in random key order in async mode -// overwrite -- overwrite N values in random key order in async mode -// fillsync -- write N/100 values in random key order in sync mode -// fill100K -- write N/1000 100K values in random order in async mode -// readseq -- read N values sequentially -// readreverse -- read N values in reverse order -// readrandom -- read N values in random order -// crc32c -- repeated crc32c of 4K of data -// Meta operations: -// compact -- Compact the entire DB -// stats -- Print DB stats -// heapprofile -- Dump a heap profile (if supported by this port) -static const char* FLAGS_benchmarks = - "fillseq," - "fillsync," - "fillrandom," - "overwrite," - "readrandom," - "readrandom," // Extra run to allow previous compactions to quiesce - "readseq," - "readreverse," - "compact," - "readrandom," - "readseq," - "readreverse," - "fill100K," - "crc32c," - "snappycomp," - "snappyuncomp," - ; - -// Number of key/values to place in database -static int FLAGS_num = 1000000; - -// Size of each value -static int FLAGS_value_size = 100; - -// Arrange to generate values that shrink to this fraction of -// their original size after compression -static double FLAGS_compression_ratio = 0.5; - -// Print histogram of operation timings -static bool FLAGS_histogram = false; - -// Number of bytes to buffer in memtable before compacting -// (initialized to default value by "main") -static int FLAGS_write_buffer_size = 0; - -// Number of bytes to use as a cache of uncompressed data. -// Negative means use default settings. -static int FLAGS_cache_size = -1; - -namespace leveldb { - -// Helper for quickly generating random data. -namespace { -class RandomGenerator { - private: - std::string data_; - int pos_; - - public: - RandomGenerator() { - // We use a limited amount of data over and over again and ensure - // that it is larger than the compression window (32KB), and also - // large enough to serve all typical value sizes we want to write. - Random rnd(301); - std::string piece; - while (data_.size() < 1048576) { - // Add a short fragment that is as compressible as specified - // by FLAGS_compression_ratio.
- test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); - data_.append(piece); - } - pos_ = 0; - } - - Slice Generate(int len) { - if (pos_ + len > data_.size()) { - pos_ = 0; - assert(len < data_.size()); - } - pos_ += len; - return Slice(data_.data() + pos_ - len, len); - } -}; - -static Slice TrimSpace(Slice s) { - int start = 0; - while (start < s.size() && isspace(s[start])) { - start++; - } - int limit = s.size(); - while (limit > start && isspace(s[limit-1])) { - limit--; - } - return Slice(s.data() + start, limit - start); -} - -} - -class Benchmark { - private: - Cache* cache_; - DB* db_; - int num_; - int heap_counter_; - double start_; - double last_op_finish_; - int64_t bytes_; - std::string message_; - std::string post_message_; - Histogram hist_; - RandomGenerator gen_; - Random rand_; - - // State kept for progress messages - int done_; - int next_report_; // When to report next - - void PrintHeader() { - const int kKeySize = 16; - PrintEnvironment(); - fprintf(stdout, "Keys: %d bytes each\n", kKeySize); - fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", - FLAGS_value_size, - static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); - fprintf(stdout, "Entries: %d\n", num_); - fprintf(stdout, "RawSize: %.1f MB (estimated)\n", - ((static_cast(kKeySize + FLAGS_value_size) * num_) - / 1048576.0)); - fprintf(stdout, "FileSize: %.1f MB (estimated)\n", - (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) - / 1048576.0)); - PrintWarnings(); - fprintf(stdout, "------------------------------------------------\n"); - } - - void PrintWarnings() { -#if defined(__GNUC__) && !defined(__OPTIMIZE__) - fprintf(stdout, - "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" - ); -#endif -#ifndef NDEBUG - fprintf(stdout, - "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); -#endif - - // See if snappy is working by attempting to compress a compressible string - const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; - std::string compressed; - if (!port::Snappy_Compress(text, sizeof(text), &compressed)) { - fprintf(stdout, "WARNING: Snappy compression is not enabled\n"); - } else if (compressed.size() >= sizeof(text)) { - fprintf(stdout, "WARNING: Snappy compression is not effective\n"); - } - } - - void PrintEnvironment() { - fprintf(stderr, "LevelDB: version %d.%d\n", - kMajorVersion, kMinorVersion); - -#if defined(__linux) - time_t now = time(NULL); - fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline - - FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); - if (cpuinfo != NULL) { - char line[1000]; - int num_cpus = 0; - std::string cpu_type; - std::string cache_size; - while (fgets(line, sizeof(line), cpuinfo) != NULL) { - const char* sep = strchr(line, ':'); - if (sep == NULL) { - continue; - } - Slice key = TrimSpace(Slice(line, sep - 1 - line)); - Slice val = TrimSpace(Slice(sep + 1)); - if (key == "model name") { - ++num_cpus; - cpu_type = val.ToString(); - } else if (key == "cache size") { - cache_size = val.ToString(); - } - } - fclose(cpuinfo); - fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); - fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); - } -#endif - } - - void Start() { - start_ = Env::Default()->NowMicros() * 1e-6; - bytes_ = 0; - message_.clear(); - last_op_finish_ = start_; - hist_.Clear(); - done_ = 0; - next_report_ = 100; - } - - void FinishedSingleOp() { - if (FLAGS_histogram) { - double now = Env::Default()->NowMicros() * 1e-6; - double 
micros = (now - last_op_finish_) * 1e6; - hist_.Add(micros); - if (micros > 20000) { - fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); - fflush(stderr); - } - last_op_finish_ = now; - } - - done_++; - if (done_ >= next_report_) { - if (next_report_ < 1000) next_report_ += 100; - else if (next_report_ < 5000) next_report_ += 500; - else if (next_report_ < 10000) next_report_ += 1000; - else if (next_report_ < 50000) next_report_ += 5000; - else if (next_report_ < 100000) next_report_ += 10000; - else if (next_report_ < 500000) next_report_ += 50000; - else next_report_ += 100000; - fprintf(stderr, "... finished %d ops%30s\r", done_, ""); - fflush(stderr); - } - } - - void Stop(const Slice& name) { - double finish = Env::Default()->NowMicros() * 1e-6; - - // Pretend at least one op was done in case we are running a benchmark - // that does not call FinishedSingleOp(). - if (done_ < 1) done_ = 1; - - if (bytes_ > 0) { - char rate[100]; - snprintf(rate, sizeof(rate), "%6.1f MB/s", - (bytes_ / 1048576.0) / (finish - start_)); - if (!message_.empty()) { - message_ = std::string(rate) + " " + message_; - } else { - message_ = rate; - } - } - - fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", - name.ToString().c_str(), - (finish - start_) * 1e6 / done_, - (message_.empty() ? "" : " "), - message_.c_str()); - if (FLAGS_histogram) { - fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); - } - fflush(stdout); - - if (!post_message_.empty()) { - fprintf(stdout, "\n%s\n", post_message_.c_str()); - post_message_.clear(); - } - } - - public: - enum Order { - SEQUENTIAL, - RANDOM - }; - enum DBState { - FRESH, - EXISTING - }; - - Benchmark() - : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), - db_(NULL), - num_(FLAGS_num), - heap_counter_(0), - bytes_(0), - rand_(301) { - std::vector<std::string> files; - Env::Default()->GetChildren("/tmp/dbbench", &files); - for (int i = 0; i < files.size(); i++) { - if (Slice(files[i]).starts_with("heap-")) { - Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]); - } - } - DestroyDB("/tmp/dbbench", Options()); - } - - ~Benchmark() { - delete db_; - delete cache_; - } - - void Run() { - PrintHeader(); - Open(); - - const char* benchmarks = FLAGS_benchmarks; - while (benchmarks != NULL) { - const char* sep = strchr(benchmarks, ','); - Slice name; - if (sep == NULL) { - name = benchmarks; - benchmarks = NULL; - } else { - name = Slice(benchmarks, sep - benchmarks); - benchmarks = sep + 1; - } - - Start(); - - WriteOptions write_options; - bool known = true; - if (name == Slice("fillseq")) { - Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1); - } else if (name == Slice("fillbatch")) { - Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000); - } else if (name == Slice("fillrandom")) { - Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size, 1); - } else if (name == Slice("overwrite")) { - Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1); - } else if (name == Slice("fillsync")) { - write_options.sync = true; - Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1); - } else if (name == Slice("fill100K")) { - Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1); - } else if (name == Slice("readseq")) { - ReadSequential(); - } else if (name == Slice("readreverse")) { - ReadReverse(); - } else if (name == Slice("readrandom")) { - ReadRandom(); - } else if (name == Slice("readrandomsmall")) { - int n = num_; - num_ /= 1000; - ReadRandom(); - num_ = n; - }
else if (name == Slice("compact")) { - Compact(); - } else if (name == Slice("crc32c")) { - Crc32c(4096, "(4K per op)"); - } else if (name == Slice("snappycomp")) { - SnappyCompress(); - } else if (name == Slice("snappyuncomp")) { - SnappyUncompress(); - } else if (name == Slice("heapprofile")) { - HeapProfile(); - } else if (name == Slice("stats")) { - PrintStats(); - } else { - known = false; - if (name != Slice()) { // No error message for empty name - fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); - } - } - if (known) { - Stop(name); - } - } - } - - private: - void Crc32c(int size, const char* label) { - // Checksum about 500MB of data total - std::string data(size, 'x'); - int64_t bytes = 0; - uint32_t crc = 0; - while (bytes < 500 * 1048576) { - crc = crc32c::Value(data.data(), size); - FinishedSingleOp(); - bytes += size; - } - // Print so result is not dead - fprintf(stderr, "... crc=0x%x\r", static_cast(crc)); - - bytes_ = bytes; - message_ = label; - } - - void SnappyCompress() { - Slice input = gen_.Generate(Options().block_size); - int64_t bytes = 0; - int64_t produced = 0; - bool ok = true; - std::string compressed; - while (ok && bytes < 1024 * 1048576) { // Compress 1G - ok = port::Snappy_Compress(input.data(), input.size(), &compressed); - produced += compressed.size(); - bytes += input.size(); - FinishedSingleOp(); - } - - if (!ok) { - message_ = "(snappy failure)"; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "(output: %.1f%%)", - (produced * 100.0) / bytes); - message_ = buf; - bytes_ = bytes; - } - } - - void SnappyUncompress() { - Slice input = gen_.Generate(Options().block_size); - std::string compressed; - bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed); - int64_t bytes = 0; - std::string uncompressed; - while (ok && bytes < 1024 * 1048576) { // Compress 1G - ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), - &uncompressed); - bytes += uncompressed.size(); - FinishedSingleOp(); - } - - if (!ok) { - message_ = "(snappy failure)"; - } else { - bytes_ = bytes; - } - } - - void Open() { - assert(db_ == NULL); - Options options; - options.create_if_missing = true; - options.block_cache = cache_; - options.write_buffer_size = FLAGS_write_buffer_size; - Status s = DB::Open(options, "/tmp/dbbench", &db_); - if (!s.ok()) { - fprintf(stderr, "open error: %s\n", s.ToString().c_str()); - exit(1); - } - } - - void Write(const WriteOptions& options, Order order, DBState state, - int num_entries, int value_size, int entries_per_batch) { - if (state == FRESH) { - delete db_; - db_ = NULL; - DestroyDB("/tmp/dbbench", Options()); - Open(); - Start(); // Do not count time taken to destroy/open - } - - if (num_entries != num_) { - char msg[100]; - snprintf(msg, sizeof(msg), "(%d ops)", num_entries); - message_ = msg; - } - - WriteBatch batch; - Status s; - std::string val; - for (int i = 0; i < num_entries; i += entries_per_batch) { - batch.Clear(); - for (int j = 0; j < entries_per_batch; j++) { - const int k = (order == SEQUENTIAL) ? 
i+j : (rand_.Next() % FLAGS_num); - char key[100]; - snprintf(key, sizeof(key), "%016d", k); - batch.Put(key, gen_.Generate(value_size)); - bytes_ += value_size + strlen(key); - FinishedSingleOp(); - } - s = db_->Write(options, &batch); - if (!s.ok()) { - fprintf(stderr, "put error: %s\n", s.ToString().c_str()); - exit(1); - } - } - } - - void ReadSequential() { - Iterator* iter = db_->NewIterator(ReadOptions()); - int i = 0; - for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - } - - void ReadReverse() { - Iterator* iter = db_->NewIterator(ReadOptions()); - int i = 0; - for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { - bytes_ += iter->key().size() + iter->value().size(); - FinishedSingleOp(); - ++i; - } - delete iter; - } - - void ReadRandom() { - ReadOptions options; - std::string value; - for (int i = 0; i < num_; i++) { - char key[100]; - const int k = rand_.Next() % FLAGS_num; - snprintf(key, sizeof(key), "%016d", k); - db_->Get(options, key, &value); - FinishedSingleOp(); - } - } - - void Compact() { - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - int max_level_with_files = 1; - for (int level = 1; level < config::kNumLevels; level++) { - std::string property; - char name[100]; - snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); - if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) { - max_level_with_files = level; - } - } - for (int level = 0; level < max_level_with_files; level++) { - dbi->TEST_CompactRange(level, "", "~"); - } - } - - void PrintStats() { - std::string stats; - if (!db_->GetProperty("leveldb.stats", &stats)) { - message_ = "(failed)"; - } else { - post_message_ = stats; - } - } - - static void WriteToFile(void* arg, const char* buf, int n) { - reinterpret_cast(arg)->Append(Slice(buf, n)); - } - - void HeapProfile() { - char fname[100]; - snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_); - WritableFile* file; - Status s = Env::Default()->NewWritableFile(fname, &file); - if (!s.ok()) { - message_ = s.ToString(); - return; - } - bool ok = port::GetHeapProfile(WriteToFile, file); - delete file; - if (!ok) { - message_ = "not supported"; - Env::Default()->DeleteFile(fname); - } - } -}; - -} - -int main(int argc, char** argv) { - FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; - for (int i = 1; i < argc; i++) { - double d; - int n; - char junk; - if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { - FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); - } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { - FLAGS_compression_ratio = d; - } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && - (n == 0 || n == 1)) { - FLAGS_histogram = n; - } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { - FLAGS_num = n; - } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { - FLAGS_value_size = n; - } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { - FLAGS_write_buffer_size = n; - } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { - FLAGS_cache_size = n; - } else { - fprintf(stderr, "Invalid flag '%s'\n", argv[i]); - exit(1); - } - } - - leveldb::Benchmark benchmark; - benchmark.Run(); - return 0; -} diff --git a/leveldb/db/db_impl.cc b/leveldb/db/db_impl.cc deleted file mode 100644 index 3b9e04e..0000000 --- a/leveldb/db/db_impl.cc +++ 
/dev/null @@ -1,1188 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/db_impl.h" - -#include <algorithm> -#include <set> -#include <string> -#include <stdint.h> -#include <stdio.h> -#include <vector> -#include "db/builder.h" -#include "db/db_iter.h" -#include "db/dbformat.h" -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/status.h" -#include "leveldb/table.h" -#include "leveldb/table_builder.h" -#include "port/port.h" -#include "table/block.h" -#include "table/merger.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" -#include "util/logging.h" -#include "util/mutexlock.h" - -namespace leveldb { - -struct DBImpl::CompactionState { - Compaction* const compaction; - - // Sequence numbers < smallest_snapshot are not significant since we - // will never have to service a snapshot below smallest_snapshot. - // Therefore if we have seen a sequence number S <= smallest_snapshot, - // we can drop all entries for the same key with sequence numbers < S. - SequenceNumber smallest_snapshot; - - // Files produced by compaction - struct Output { - uint64_t number; - uint64_t file_size; - InternalKey smallest, largest; - }; - std::vector<Output> outputs; - - // State kept for output being generated - WritableFile* outfile; - TableBuilder* builder; - - uint64_t total_bytes; - - Output* current_output() { return &outputs[outputs.size()-1]; } - - explicit CompactionState(Compaction* c) - : compaction(c), - outfile(NULL), - builder(NULL), - total_bytes(0) { - } -}; - -namespace { -class NullWritableFile : public WritableFile { - public: - virtual Status Append(const Slice& data) { return Status::OK(); } - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } -}; -} - -// Fix user-supplied options to be reasonable -template <class T, class V> -static void ClipToRange(T* ptr, V minvalue, V maxvalue) { - if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue; - if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue; -} -Options SanitizeOptions(const std::string& dbname, - const InternalKeyComparator* icmp, - const Options& src) { - Options result = src; - result.comparator = icmp; - ClipToRange(&result.max_open_files, 20, 50000); - ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); - ClipToRange(&result.block_size, 1<<10, 4<<20); - if (result.info_log == NULL) { - // Open a log file in the same directory as the db - src.env->CreateDir(dbname); // In case it does not exist - src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname)); - Status s = src.env->NewWritableFile(InfoLogFileName(dbname), - &result.info_log); - if (!s.ok()) { - // No place suitable for logging - result.info_log = new NullWritableFile; - } - } - if (result.block_cache == NULL) { - result.block_cache = NewLRUCache(8 << 20); - } - return result; -} - -DBImpl::DBImpl(const Options& options, const std::string& dbname) - : env_(options.env), - internal_comparator_(options.comparator), - options_(SanitizeOptions(dbname, &internal_comparator_, options)), - owns_info_log_(options_.info_log != options.info_log), - owns_cache_(options_.block_cache != options.block_cache), - dbname_(dbname), -
db_lock_(NULL), - shutting_down_(NULL), - bg_cv_(&mutex_), - compacting_cv_(&mutex_), - mem_(new MemTable(internal_comparator_)), - imm_(NULL), - logfile_(NULL), - log_(NULL), - bg_compaction_scheduled_(false), - compacting_(false) { - has_imm_.Release_Store(NULL); - - // Reserve ten files or so for other uses and give the rest to TableCache. - const int table_cache_size = options.max_open_files - 10; - table_cache_ = new TableCache(dbname_, &options_, table_cache_size); - - versions_ = new VersionSet(dbname_, &options_, table_cache_, - &internal_comparator_); -} - -DBImpl::~DBImpl() { - // Wait for background work to finish - mutex_.Lock(); - shutting_down_.Release_Store(this); // Any non-NULL value is ok - if (bg_compaction_scheduled_) { - while (bg_compaction_scheduled_) { - bg_cv_.Wait(); - } - } - mutex_.Unlock(); - - if (db_lock_ != NULL) { - env_->UnlockFile(db_lock_); - } - - delete versions_; - delete mem_; - delete imm_; - delete log_; - delete logfile_; - delete table_cache_; - - if (owns_info_log_) { - delete options_.info_log; - } - if (owns_cache_) { - delete options_.block_cache; - } -} - -Status DBImpl::NewDB() { - VersionEdit new_db; - new_db.SetComparatorName(user_comparator()->Name()); - new_db.SetLogNumber(0); - new_db.SetNextFile(2); - new_db.SetLastSequence(0); - - const std::string manifest = DescriptorFileName(dbname_, 1); - WritableFile* file; - Status s = env_->NewWritableFile(manifest, &file); - if (!s.ok()) { - return s; - } - { - log::Writer log(file); - std::string record; - new_db.EncodeTo(&record); - s = log.AddRecord(record); - if (s.ok()) { - s = file->Close(); - } - } - delete file; - if (s.ok()) { - // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(env_, dbname_, 1); - } else { - env_->DeleteFile(manifest); - } - return s; -} - -void DBImpl::MaybeIgnoreError(Status* s) const { - if (s->ok() || options_.paranoid_checks) { - // No change needed - } else { - Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str()); - *s = Status::OK(); - } -} - -void DBImpl::DeleteObsoleteFiles() { - // Make a set of all of the live files - std::set live = pending_outputs_; - versions_->AddLiveFiles(&live); - - std::vector filenames; - env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { - bool keep = true; - switch (type) { - case kLogFile: - keep = ((number == versions_->LogNumber()) || - (number == versions_->PrevLogNumber())); - break; - case kDescriptorFile: - // Keep my manifest file, and any newer incarnations' - // (in case there is a race that allows other incarnations) - keep = (number >= versions_->ManifestFileNumber()); - break; - case kTableFile: - keep = (live.find(number) != live.end()); - break; - case kTempFile: - // Any temp files that are currently being written to must - // be recorded in pending_outputs_, which is inserted into "live" - keep = (live.find(number) != live.end()); - break; - case kCurrentFile: - case kDBLockFile: - case kInfoLogFile: - keep = true; - break; - } - - if (!keep) { - if (type == kTableFile) { - table_cache_->Evict(number); - } - Log(env_, options_.info_log, "Delete type=%d #%lld\n", - int(type), - static_cast(number)); - env_->DeleteFile(dbname_ + "/" + filenames[i]); - } - } - } -} - -Status DBImpl::Recover(VersionEdit* edit) { - mutex_.AssertHeld(); - - // Ignore error from CreateDir since the creation of the DB is - // 
// committed only when the descriptor is created, and this directory - // may already exist from a previous failed creation attempt. - env_->CreateDir(dbname_); - assert(db_lock_ == NULL); - Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); - if (!s.ok()) { - return s; - } - - if (!env_->FileExists(CurrentFileName(dbname_))) { - if (options_.create_if_missing) { - s = NewDB(); - if (!s.ok()) { - return s; - } - } else { - return Status::InvalidArgument( - dbname_, "does not exist (create_if_missing is false)"); - } - } else { - if (options_.error_if_exists) { - return Status::InvalidArgument( - dbname_, "exists (error_if_exists is true)"); - } - } - - s = versions_->Recover(); - if (s.ok()) { - // Recover from the log files named in the descriptor - SequenceNumber max_sequence(0); - if (versions_->PrevLogNumber() != 0) { // log#==0 means no prev log - s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence); - } - if (s.ok() && versions_->LogNumber() != 0) { // log#==0 for initial state - s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence); - } - if (s.ok()) { - if (versions_->LastSequence() < max_sequence) { - versions_->SetLastSequence(max_sequence); - } - } - } - - return s; -} - -Status DBImpl::RecoverLogFile(uint64_t log_number, - VersionEdit* edit, - SequenceNumber* max_sequence) { - struct LogReporter : public log::Reader::Reporter { - Env* env; - WritableFile* info_log; - const char* fname; - Status* status; // NULL if options_.paranoid_checks==false - virtual void Corruption(size_t bytes, const Status& s) { - Log(env, info_log, "%s%s: dropping %d bytes; %s", - (this->status == NULL ? "(ignoring error) " : ""), - fname, static_cast<int>(bytes), s.ToString().c_str()); - if (this->status != NULL && this->status->ok()) *this->status = s; - } - }; - - mutex_.AssertHeld(); - - // Open the log file - std::string fname = LogFileName(dbname_, log_number); - SequentialFile* file; - Status status = env_->NewSequentialFile(fname, &file); - if (!status.ok()) { - MaybeIgnoreError(&status); - return status; - } - - // Create the log reader. - LogReporter reporter; - reporter.env = env_; - reporter.info_log = options_.info_log; - reporter.fname = fname.c_str(); - reporter.status = (options_.paranoid_checks ? &status : NULL); - // We intentionally make log::Reader do checksumming even if - // paranoid_checks==false so that corruptions cause entire commits - // to be skipped instead of propagating bad information (like overly - // large sequence numbers).
- log::Reader reader(file, &reporter, true/*checksum*/); - Log(env_, options_.info_log, "Recovering log #%llu", - (unsigned long long) log_number); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - MemTable* mem = NULL; - while (reader.ReadRecord(&record, &scratch) && - status.ok()) { - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - continue; - } - WriteBatchInternal::SetContents(&batch, record); - - if (mem == NULL) { - mem = new MemTable(internal_comparator_); - } - status = WriteBatchInternal::InsertInto(&batch, mem); - MaybeIgnoreError(&status); - if (!status.ok()) { - break; - } - const SequenceNumber last_seq = - WriteBatchInternal::Sequence(&batch) + - WriteBatchInternal::Count(&batch) - 1; - if (last_seq > *max_sequence) { - *max_sequence = last_seq; - } - - if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { - status = WriteLevel0Table(mem, edit); - if (!status.ok()) { - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. - break; - } - delete mem; - mem = NULL; - } - } - - if (status.ok() && mem != NULL) { - status = WriteLevel0Table(mem, edit); - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. - } - - delete mem; - delete file; - return status; -} - -Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { - mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); - FileMetaData meta; - meta.number = versions_->NewFileNumber(); - pending_outputs_.insert(meta.number); - Iterator* iter = mem->NewIterator(); - Log(env_, options_.info_log, "Level-0 table #%llu: started", - (unsigned long long) meta.number); - - Status s; - { - mutex_.Unlock(); - s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit); - mutex_.Lock(); - } - - Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s", - (unsigned long long) meta.number, - (unsigned long long) meta.file_size, - s.ToString().c_str()); - delete iter; - pending_outputs_.erase(meta.number); - - CompactionStats stats; - stats.micros = env_->NowMicros() - start_micros; - stats.bytes_written = meta.file_size; - stats_[0].Add(stats); - return s; -} - -Status DBImpl::CompactMemTable() { - mutex_.AssertHeld(); - assert(imm_ != NULL); - assert(compacting_); - - // Save the contents of the memtable as a new Table - VersionEdit edit; - Status s = WriteLevel0Table(imm_, &edit); - - // Replace immutable memtable with the generated Table - if (s.ok()) { - edit.SetPrevLogNumber(0); - s = versions_->LogAndApply(&edit, imm_); - } - - if (s.ok()) { - // Commit to the new state - imm_ = NULL; - has_imm_.Release_Store(NULL); - DeleteObsoleteFiles(); - } - - compacting_cv_.SignalAll(); // Wake up waiter even if there was an error - return s; -} - -void DBImpl::TEST_CompactRange( - int level, - const std::string& begin, - const std::string& end) { - MutexLock l(&mutex_); - while (compacting_) { - compacting_cv_.Wait(); - } - Compaction* c = versions_->CompactRange( - level, - InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek), - InternalKey(end, 0, static_cast(0))); - - if (c != NULL) { - CompactionState* compact = new CompactionState(c); - DoCompactionWork(compact); // Ignore error in test compaction - CleanupCompaction(compact); - } - - // Start any background compaction that may have been delayed by this thread - MaybeScheduleCompaction(); -} - 
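[Editorial note] TEST_CompactRange above starts by blocking until no compaction is in flight: it holds mutex_ and loops on compacting_cv_.Wait() while compacting_ is set, and the compaction side calls compacting_cv_.SignalAll() when it clears the flag. The fragment below is a minimal modern-C++ analogue of that pattern using std::condition_variable rather than the port:: wrappers this tree defines; all names here are illustrative.

#include <condition_variable>
#include <mutex>

std::mutex mu;
std::condition_variable cv;
bool compacting = false;  // Plays the role of DBImpl::compacting_

// Block until no compaction is running; the predicate is re-checked on
// every wakeup, which is why the original code wraps Wait() in a loop.
void WaitUntilIdle() {
  std::unique_lock<std::mutex> l(mu);
  cv.wait(l, [] { return !compacting; });
}

// Clear the flag under the lock, then wake all waiters, mirroring
// compacting_cv_.SignalAll() in the code above.
void FinishCompaction() {
  {
    std::lock_guard<std::mutex> l(mu);
    compacting = false;
  }
  cv.notify_all();
}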
-Status DBImpl::TEST_CompactMemTable() { - MutexLock l(&mutex_); - Status s = MakeRoomForWrite(true /* force compaction */); - if (s.ok()) { - // Wait until the compaction completes - while (imm_ != NULL && bg_error_.ok()) { - compacting_cv_.Wait(); - } - if (imm_ != NULL) { - s = bg_error_; - } - } - return s; -} - -void DBImpl::MaybeScheduleCompaction() { - mutex_.AssertHeld(); - if (bg_compaction_scheduled_) { - // Already scheduled - } else if (compacting_) { - // Some other thread is running a compaction. Do not conflict with it. - } else if (shutting_down_.Acquire_Load()) { - // DB is being deleted; no more background compactions - } else if (imm_ == NULL && !versions_->NeedsCompaction()) { - // No work to be done - } else { - bg_compaction_scheduled_ = true; - env_->Schedule(&DBImpl::BGWork, this); - } -} - -void DBImpl::BGWork(void* db) { - reinterpret_cast(db)->BackgroundCall(); -} - -void DBImpl::BackgroundCall() { - MutexLock l(&mutex_); - assert(bg_compaction_scheduled_); - if (!shutting_down_.Acquire_Load() && - !compacting_) { - BackgroundCompaction(); - } - bg_compaction_scheduled_ = false; - bg_cv_.SignalAll(); - - // Previous compaction may have produced too many files in a level, - // so reschedule another compaction if needed. - MaybeScheduleCompaction(); -} - -void DBImpl::BackgroundCompaction() { - mutex_.AssertHeld(); - assert(!compacting_); - - if (imm_ != NULL) { - compacting_ = true; - CompactMemTable(); - compacting_ = false; - compacting_cv_.SignalAll(); - return; - } - - Compaction* c = versions_->PickCompaction(); - if (c == NULL) { - // Nothing to do - return; - } - - Status status; - if (c->IsTrivialMove()) { - // Move file to next level - assert(c->num_input_files(0) == 1); - FileMetaData* f = c->input(0, 0); - c->edit()->DeleteFile(c->level(), f->number); - c->edit()->AddFile(c->level() + 1, f->number, f->file_size, - f->smallest, f->largest); - status = versions_->LogAndApply(c->edit(), NULL); - Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n", - static_cast(f->number), - c->level() + 1, - static_cast(f->file_size), - status.ToString().c_str()); - } else { - CompactionState* compact = new CompactionState(c); - status = DoCompactionWork(compact); - CleanupCompaction(compact); - } - delete c; - - if (status.ok()) { - // Done - } else if (shutting_down_.Acquire_Load()) { - // Ignore compaction errors found during shutting down - } else { - Log(env_, options_.info_log, - "Compaction error: %s", status.ToString().c_str()); - if (options_.paranoid_checks && bg_error_.ok()) { - bg_error_ = status; - } - } -} - -void DBImpl::CleanupCompaction(CompactionState* compact) { - mutex_.AssertHeld(); - if (compact->builder != NULL) { - // May happen if we get a shutdown call in the middle of compaction - compact->builder->Abandon(); - delete compact->builder; - } else { - assert(compact->outfile == NULL); - } - delete compact->outfile; - for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - pending_outputs_.erase(out.number); - } - delete compact; -} - -Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { - assert(compact != NULL); - assert(compact->builder == NULL); - uint64_t file_number; - { - mutex_.Lock(); - file_number = versions_->NewFileNumber(); - pending_outputs_.insert(file_number); - CompactionState::Output out; - out.number = file_number; - out.smallest.Clear(); - out.largest.Clear(); - compact->outputs.push_back(out); - mutex_.Unlock(); - } - - // Make the 
output file - std::string fname = TableFileName(dbname_, file_number); - Status s = env_->NewWritableFile(fname, &compact->outfile); - if (s.ok()) { - compact->builder = new TableBuilder(options_, compact->outfile); - } - return s; -} - -Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, - Iterator* input) { - assert(compact != NULL); - assert(compact->outfile != NULL); - assert(compact->builder != NULL); - - const uint64_t output_number = compact->current_output()->number; - assert(output_number != 0); - - // Check for iterator errors - Status s = input->status(); - const uint64_t current_entries = compact->builder->NumEntries(); - if (s.ok()) { - s = compact->builder->Finish(); - } else { - compact->builder->Abandon(); - } - const uint64_t current_bytes = compact->builder->FileSize(); - compact->current_output()->file_size = current_bytes; - compact->total_bytes += current_bytes; - delete compact->builder; - compact->builder = NULL; - - // Finish and check for file errors - if (s.ok()) { - s = compact->outfile->Sync(); - } - if (s.ok()) { - s = compact->outfile->Close(); - } - delete compact->outfile; - compact->outfile = NULL; - - if (s.ok() && current_entries > 0) { - // Verify that the table is usable - Iterator* iter = table_cache_->NewIterator(ReadOptions(), - output_number, - current_bytes); - s = iter->status(); - delete iter; - if (s.ok()) { - Log(env_, options_.info_log, - "Generated table #%llu: %lld keys, %lld bytes", - (unsigned long long) output_number, - (unsigned long long) current_entries, - (unsigned long long) current_bytes); - } - } - return s; -} - - -Status DBImpl::InstallCompactionResults(CompactionState* compact) { - mutex_.AssertHeld(); - Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->level() + 1, - static_cast(compact->total_bytes)); - - // Add compaction outputs - compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); - for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - compact->compaction->edit()->AddFile( - level + 1, - out.number, out.file_size, out.smallest, out.largest); - pending_outputs_.erase(out.number); - } - compact->outputs.clear(); - - Status s = versions_->LogAndApply(compact->compaction->edit(), NULL); - if (s.ok()) { - compact->compaction->ReleaseInputs(); - DeleteObsoleteFiles(); - } else { - // Discard any files we may have created during this failed compaction - for (size_t i = 0; i < compact->outputs.size(); i++) { - env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number)); - } - } - return s; -} - -Status DBImpl::DoCompactionWork(CompactionState* compact) { - const uint64_t start_micros = env_->NowMicros(); - int64_t imm_micros = 0; // Micros spent doing imm_ compactions - - Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->level() + 1); - - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); - assert(compact->builder == NULL); - assert(compact->outfile == NULL); - if (snapshots_.empty()) { - compact->smallest_snapshot = versions_->LastSequence(); - } else { - compact->smallest_snapshot = snapshots_.oldest()->number_; - } - - // Release mutex while 
// we're actually doing the compaction work - compacting_ = true; - mutex_.Unlock(); - - Iterator* input = versions_->MakeInputIterator(compact->compaction); - input->SeekToFirst(); - Status status; - ParsedInternalKey ikey; - std::string current_user_key; - bool has_current_user_key = false; - SequenceNumber last_sequence_for_key = kMaxSequenceNumber; - for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { - // Prioritize immutable compaction work - if (has_imm_.NoBarrier_Load() != NULL) { - const uint64_t imm_start = env_->NowMicros(); - mutex_.Lock(); - if (imm_ != NULL) { - CompactMemTable(); - compacting_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary - } - mutex_.Unlock(); - imm_micros += (env_->NowMicros() - imm_start); - } - - Slice key = input->key(); - InternalKey tmp_internal_key; - tmp_internal_key.DecodeFrom(key); - if (compact->compaction->ShouldStopBefore(tmp_internal_key) && - compact->builder != NULL) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - - // Handle key/value, add to state, etc. - bool drop = false; - if (!ParseInternalKey(key, &ikey)) { - // Do not hide error keys - current_user_key.clear(); - has_current_user_key = false; - last_sequence_for_key = kMaxSequenceNumber; - } else { - if (!has_current_user_key || - user_comparator()->Compare(ikey.user_key, - Slice(current_user_key)) != 0) { - // First occurrence of this user key - current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); - has_current_user_key = true; - last_sequence_for_key = kMaxSequenceNumber; - } - - if (last_sequence_for_key <= compact->smallest_snapshot) { - // Hidden by a newer entry for same user key - drop = true; // (A) - } else if (ikey.type == kTypeDeletion && - ikey.sequence <= compact->smallest_snapshot && - compact->compaction->IsBaseLevelForKey(ikey.user_key)) { - // For this user key: - // (1) there is no data in higher levels - // (2) data in lower levels will have larger sequence numbers - // (3) data in layers that are being compacted here and have - // smaller sequence numbers will be dropped in the next - // few iterations of this loop (by rule (A) above). - // Therefore this deletion marker is obsolete and can be dropped.
- drop = true; - } - - last_sequence_for_key = ikey.sequence; - } -#if 0 - Log(env_, options_.info_log, - " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " - "%d smallest_snapshot: %d", - ikey.user_key.ToString().c_str(), - (int)ikey.sequence, ikey.type, kTypeValue, drop, - compact->compaction->IsBaseLevelForKey(ikey.user_key), - (int)last_sequence_for_key, (int)compact->smallest_snapshot); -#endif - - if (!drop) { - // Open output file if necessary - if (compact->builder == NULL) { - status = OpenCompactionOutputFile(compact); - if (!status.ok()) { - break; - } - } - if (compact->builder->NumEntries() == 0) { - compact->current_output()->smallest.DecodeFrom(key); - } - compact->current_output()->largest.DecodeFrom(key); - compact->builder->Add(key, input->value()); - - // Close output file if it is big enough - if (compact->builder->FileSize() >= - compact->compaction->MaxOutputFileSize()) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - } - - input->Next(); - } - - if (status.ok() && shutting_down_.Acquire_Load()) { - status = Status::IOError("Deleting DB during compaction"); - } - if (status.ok() && compact->builder != NULL) { - status = FinishCompactionOutputFile(compact, input); - } - if (status.ok()) { - status = input->status(); - } - delete input; - input = NULL; - - CompactionStats stats; - stats.micros = env_->NowMicros() - start_micros - imm_micros; - for (int which = 0; which < 2; which++) { - for (int i = 0; i < compact->compaction->num_input_files(which); i++) { - stats.bytes_read += compact->compaction->input(which, i)->file_size; - } - } - for (size_t i = 0; i < compact->outputs.size(); i++) { - stats.bytes_written += compact->outputs[i].file_size; - } - - mutex_.Lock(); - stats_[compact->compaction->level() + 1].Add(stats); - - if (status.ok()) { - status = InstallCompactionResults(compact); - } - compacting_ = false; - compacting_cv_.SignalAll(); - return status; -} - -Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, - SequenceNumber* latest_snapshot) { - mutex_.Lock(); - *latest_snapshot = versions_->LastSequence(); - - // Collect together all needed child iterators - std::vector list; - list.push_back(mem_->NewIterator()); - if (imm_ != NULL) { - list.push_back(imm_->NewIterator()); - } - versions_->current()->AddIterators(options, &list); - Iterator* internal_iter = - NewMergingIterator(&internal_comparator_, &list[0], list.size()); - versions_->current()->Ref(); - internal_iter->RegisterCleanup(&DBImpl::Unref, this, versions_->current()); - - mutex_.Unlock(); - return internal_iter; -} - -Iterator* DBImpl::TEST_NewInternalIterator() { - SequenceNumber ignored; - return NewInternalIterator(ReadOptions(), &ignored); -} - -int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { - MutexLock l(&mutex_); - return versions_->MaxNextLevelOverlappingBytes(); -} - -Status DBImpl::Get(const ReadOptions& options, - const Slice& key, - std::string* value) { - // TODO(opt): faster implementation - Iterator* iter = NewIterator(options); - iter->Seek(key); - bool found = false; - if (iter->Valid() && user_comparator()->Compare(key, iter->key()) == 0) { - Slice v = iter->value(); - value->assign(v.data(), v.size()); - found = true; - } - // Non-OK iterator status trumps everything else - Status result = iter->status(); - if (result.ok() && !found) { - result = Status::NotFound(Slice()); // Use an empty error message for speed - } - delete iter; - return result; -} - -Iterator* DBImpl::NewIterator(const 
ReadOptions& options) { - SequenceNumber latest_snapshot; - Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); - SequenceNumber sequence = - (options.snapshot ? options.snapshot->number_ : latest_snapshot); - return NewDBIterator(&dbname_, env_, - user_comparator(), internal_iter, sequence); -} - -void DBImpl::Unref(void* arg1, void* arg2) { - DBImpl* impl = reinterpret_cast(arg1); - Version* v = reinterpret_cast(arg2); - MutexLock l(&impl->mutex_); - v->Unref(); -} - -const Snapshot* DBImpl::GetSnapshot() { - MutexLock l(&mutex_); - return snapshots_.New(versions_->LastSequence()); -} - -void DBImpl::ReleaseSnapshot(const Snapshot* s) { - MutexLock l(&mutex_); - snapshots_.Delete(s); -} - -// Convenience methods -Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) { - return DB::Put(o, key, val); -} - -Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { - return DB::Delete(options, key); -} - -Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { - Status status; - MutexLock l(&mutex_); - status = MakeRoomForWrite(false); // May temporarily release lock and wait - uint64_t last_sequence = versions_->LastSequence(); - if (status.ok()) { - WriteBatchInternal::SetSequence(updates, last_sequence + 1); - last_sequence += WriteBatchInternal::Count(updates); - versions_->SetLastSequence(last_sequence); - - // Add to log and apply to memtable - status = log_->AddRecord(WriteBatchInternal::Contents(updates)); - if (status.ok() && options.sync) { - status = logfile_->Sync(); - } - if (status.ok()) { - status = WriteBatchInternal::InsertInto(updates, mem_); - } - } - if (options.post_write_snapshot != NULL) { - *options.post_write_snapshot = - status.ok() ? snapshots_.New(last_sequence) : NULL; - } - return status; -} - -Status DBImpl::MakeRoomForWrite(bool force) { - mutex_.AssertHeld(); - Status s; - while (true) { - if (!bg_error_.ok()) { - // Yield previous error - s = bg_error_; - break; - } else if (!force && - (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { - // There is room in current memtable - break; - } else if (imm_ != NULL) { - // We have filled up the current memtable, but the previous - // one is still being compacted, so we wait. 
- compacting_cv_.Wait(); - } else { - // Attempt to switch to a new memtable and trigger compaction of old - assert(versions_->PrevLogNumber() == 0); - uint64_t new_log_number = versions_->NewFileNumber(); - WritableFile* lfile = NULL; - s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); - if (!s.ok()) { - break; - } - VersionEdit edit; - edit.SetPrevLogNumber(versions_->LogNumber()); - edit.SetLogNumber(new_log_number); - s = versions_->LogAndApply(&edit, NULL); - if (!s.ok()) { - delete lfile; - env_->DeleteFile(LogFileName(dbname_, new_log_number)); - break; - } - delete log_; - delete logfile_; - logfile_ = lfile; - log_ = new log::Writer(lfile); - imm_ = mem_; - has_imm_.Release_Store(imm_); - mem_ = new MemTable(internal_comparator_); - force = false; // Do not force another compaction if have room - MaybeScheduleCompaction(); - } - } - return s; -} - -bool DBImpl::GetProperty(const Slice& property, std::string* value) { - value->clear(); - - MutexLock l(&mutex_); - Slice in = property; - Slice prefix("leveldb."); - if (!in.starts_with(prefix)) return false; - in.remove_prefix(prefix.size()); - - if (in.starts_with("num-files-at-level")) { - in.remove_prefix(strlen("num-files-at-level")); - uint64_t level; - bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); - if (!ok || level < 0 || level >= config::kNumLevels) { - return false; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "%d", - versions_->NumLevelFiles(static_cast(level))); - *value = buf; - return true; - } - } else if (in == "stats") { - char buf[200]; - snprintf(buf, sizeof(buf), - " Compactions\n" - "Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n" - "--------------------------------------------------\n" - ); - value->append(buf); - for (int level = 0; level < config::kNumLevels; level++) { - int files = versions_->NumLevelFiles(level); - if (stats_[level].micros > 0 || files > 0) { - snprintf( - buf, sizeof(buf), - "%3d %8d %8.0f %9.0f %8.0f %9.0f\n", - level, - files, - versions_->NumLevelBytes(level) / 1048576.0, - stats_[level].micros / 1e6, - stats_[level].bytes_read / 1048576.0, - stats_[level].bytes_written / 1048576.0); - value->append(buf); - } - } - return true; - } - - return false; -} - -void DBImpl::GetApproximateSizes( - const Range* range, int n, - uint64_t* sizes) { - // TODO(opt): better implementation - Version* v; - { - MutexLock l(&mutex_); - versions_->current()->Ref(); - v = versions_->current(); - } - - for (int i = 0; i < n; i++) { - // Convert user_key into a corresponding internal key. - InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); - uint64_t start = versions_->ApproximateOffsetOf(v, k1); - uint64_t limit = versions_->ApproximateOffsetOf(v, k2); - sizes[i] = (limit >= start ? 
limit - start : 0); - } - - { - MutexLock l(&mutex_); - v->Unref(); - } -} - -// Default implementations of convenience methods that subclasses of DB -// can call if they wish -Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { - WriteBatch batch; - batch.Put(key, value); - return Write(opt, &batch); -} - -Status DB::Delete(const WriteOptions& opt, const Slice& key) { - WriteBatch batch; - batch.Delete(key); - return Write(opt, &batch); -} - -DB::~DB() { } - -Status DB::Open(const Options& options, const std::string& dbname, - DB** dbptr) { - *dbptr = NULL; - - DBImpl* impl = new DBImpl(options, dbname); - impl->mutex_.Lock(); - VersionEdit edit; - Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists - if (s.ok()) { - uint64_t new_log_number = impl->versions_->NewFileNumber(); - WritableFile* lfile; - s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), - &lfile); - if (s.ok()) { - edit.SetLogNumber(new_log_number); - impl->logfile_ = lfile; - impl->log_ = new log::Writer(lfile); - s = impl->versions_->LogAndApply(&edit, NULL); - } - if (s.ok()) { - impl->DeleteObsoleteFiles(); - } - } - impl->mutex_.Unlock(); - if (s.ok()) { - *dbptr = impl; - } else { - delete impl; - } - return s; -} - -Status DestroyDB(const std::string& dbname, const Options& options) { - Env* env = options.env; - std::vector filenames; - // Ignore error in case directory does not exist - env->GetChildren(dbname, &filenames); - if (filenames.empty()) { - return Status::OK(); - } - - FileLock* lock; - Status result = env->LockFile(LockFileName(dbname), &lock); - if (result.ok()) { - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { - Status del = env->DeleteFile(dbname + "/" + filenames[i]); - if (result.ok() && !del.ok()) { - result = del; - } - } - } - env->UnlockFile(lock); // Ignore error since state is already gone - env->DeleteFile(LockFileName(dbname)); - env->DeleteDir(dbname); // Ignore error in case dir contains other files - } - return result; -} - -} diff --git a/leveldb/db/db_impl.h b/leveldb/db/db_impl.h deleted file mode 100644 index 7699d8c..0000000 --- a/leveldb/db/db_impl.h +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
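[Editorial note] db_impl.cc ends with DB::Open() and DestroyDB(), which together with Put/Get form the public surface exercised by the tests earlier in this patch. As a reading aid, here is a minimal usage sketch of that API as it appears in this tree; the "/tmp/sketchdb" path is hypothetical and production error handling is omitted.

#include <cassert>
#include <string>
#include "leveldb/db.h"

int main() {
  leveldb::Options options;
  options.create_if_missing = true;  // Same flag DBImpl::Recover() honors
  leveldb::DB* db = NULL;
  leveldb::Status s = leveldb::DB::Open(options, "/tmp/sketchdb", &db);
  assert(s.ok());

  s = db->Put(leveldb::WriteOptions(), "key", "value");
  assert(s.ok());

  std::string value;
  s = db->Get(leveldb::ReadOptions(), "key", &value);
  assert(s.ok() && value == "value");

  delete db;  // Releases the LOCK file taken in DBImpl::Recover()
  leveldb::DestroyDB("/tmp/sketchdb", options);  // Deletes the db's files
  return 0;
}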
- -#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_ -#define STORAGE_LEVELDB_DB_DB_IMPL_H_ - -#include -#include "db/dbformat.h" -#include "db/log_writer.h" -#include "db/snapshot.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "port/port.h" - -namespace leveldb { - -class MemTable; -class TableCache; -class Version; -class VersionEdit; -class VersionSet; - -class DBImpl : public DB { - public: - DBImpl(const Options& options, const std::string& dbname); - virtual ~DBImpl(); - - // Implementations of the DB interface - virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); - virtual Status Delete(const WriteOptions&, const Slice& key); - virtual Status Write(const WriteOptions& options, WriteBatch* updates); - virtual Status Get(const ReadOptions& options, - const Slice& key, - std::string* value); - virtual Iterator* NewIterator(const ReadOptions&); - virtual const Snapshot* GetSnapshot(); - virtual void ReleaseSnapshot(const Snapshot* snapshot); - virtual bool GetProperty(const Slice& property, std::string* value); - virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); - - // Extra methods (for testing) that are not in the public DB interface - - // Compact any files in the named level that overlap [begin,end] - void TEST_CompactRange( - int level, - const std::string& begin, - const std::string& end); - - // Force current memtable contents to be compacted. - Status TEST_CompactMemTable(); - - // Return an internal iterator over the current state of the database. - // The keys of this iterator are internal keys (see format.h). - // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator(); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes(); - - private: - friend class DB; - - Iterator* NewInternalIterator(const ReadOptions&, - SequenceNumber* latest_snapshot); - - Status NewDB(); - - // Recover the descriptor from persistent storage. May do a significant - // amount of work to recover recently logged updates. Any changes to - // be made to the descriptor are added to *edit. - Status Recover(VersionEdit* edit); - - void MaybeIgnoreError(Status* s) const; - - // Delete any unneeded files and stale in-memory entries. - void DeleteObsoleteFiles(); - - // Called when an iterator over a particular version of the - // descriptor goes away. - static void Unref(void* arg1, void* arg2); - - // Compact the in-memory write buffer to disk. Switches to a new - // log-file/memtable and writes a new descriptor iff successful. - Status CompactMemTable(); - - Status RecoverLogFile(uint64_t log_number, - VersionEdit* edit, - SequenceNumber* max_sequence); - - Status WriteLevel0Table(MemTable* mem, VersionEdit* edit); - - Status MakeRoomForWrite(bool force /* compact even if there is room? 
*/); - - struct CompactionState; - - void MaybeScheduleCompaction(); - static void BGWork(void* db); - void BackgroundCall(); - void BackgroundCompaction(); - void CleanupCompaction(CompactionState* compact); - Status DoCompactionWork(CompactionState* compact); - - Status OpenCompactionOutputFile(CompactionState* compact); - Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); - Status InstallCompactionResults(CompactionState* compact); - - // Constant after construction - Env* const env_; - const InternalKeyComparator internal_comparator_; - const Options options_; // options_.comparator == &internal_comparator_ - bool owns_info_log_; - bool owns_cache_; - const std::string dbname_; - - // table_cache_ provides its own synchronization - TableCache* table_cache_; - - // Lock over the persistent DB state. Non-NULL iff successfully acquired. - FileLock* db_lock_; - - // State below is protected by mutex_ - port::Mutex mutex_; - port::AtomicPointer shutting_down_; - port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_ - port::CondVar compacting_cv_; // Signalled when !compacting_ - MemTable* mem_; - MemTable* imm_; // Memtable being compacted - port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_ - WritableFile* logfile_; - log::Writer* log_; - SnapshotList snapshots_; - - // Set of table files to protect from deletion because they are - // part of ongoing compactions. - std::set pending_outputs_; - - // Has a background compaction been scheduled or is running? - bool bg_compaction_scheduled_; - - // Is there a compaction running? - bool compacting_; - - VersionSet* versions_; - - // Have we encountered a background error in paranoid mode? - Status bg_error_; - - // Per level compaction stats. stats_[level] stores the stats for - // compactions that produced data for the specified "level". - struct CompactionStats { - int64_t micros; - int64_t bytes_read; - int64_t bytes_written; - - CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { } - - void Add(const CompactionStats& c) { - this->micros += c.micros; - this->bytes_read += c.bytes_read; - this->bytes_written += c.bytes_written; - } - }; - CompactionStats stats_[config::kNumLevels]; - - // No copying allowed - DBImpl(const DBImpl&); - void operator=(const DBImpl&); - - const Comparator* user_comparator() const { - return internal_comparator_.user_comparator(); - } -}; - -// Sanitize db options. The caller should delete result.info_log if -// it is not equal to src.info_log. -extern Options SanitizeOptions(const std::string& db, - const InternalKeyComparator* icmp, - const Options& src); - -} - -#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_ diff --git a/leveldb/db/db_iter.cc b/leveldb/db/db_iter.cc deleted file mode 100644 index 0be18ff..0000000 --- a/leveldb/db/db_iter.cc +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
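[Editorial note] The db_iter.cc file that follows implements the collapsing rule described in its opening comment: the underlying representation stores (userkey, seq, type) records, and the iterator must surface only the newest entry visible at the iterator's sequence number, with a deletion marker hiding the key entirely. The toy function below sketches that rule over a plain sorted vector; ToyEntry and VisibleView are inventions for illustration, not leveldb types.

#include <cstdint>
#include <map>
#include <string>
#include <vector>

enum ToyType { kToyDeletion, kToyValue };

struct ToyEntry {
  std::string user_key;
  uint64_t sequence;
  ToyType type;
  std::string value;
};

// `entries` must be sorted by (user_key ascending, sequence descending),
// the same order the merged memtable/sstable iterator presents to DBIter.
// Returns the user-visible key/value view at snapshot `snapshot`.
std::map<std::string, std::string> VisibleView(
    const std::vector<ToyEntry>& entries, uint64_t snapshot) {
  std::map<std::string, std::string> out;
  std::string current;
  bool decided = false;  // Have we already resolved the current user key?
  for (const ToyEntry& e : entries) {
    if (!decided || e.user_key != current) {
      current = e.user_key;  // First record for a new user key
      decided = false;
    }
    if (decided || e.sequence > snapshot) continue;  // Hidden or too new
    decided = true;  // Newest entry visible at this snapshot wins
    if (e.type == kToyValue) {
      out[e.user_key] = e.value;
    }
    // kToyDeletion: the key is simply absent from the view
  }
  return out;
}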
- -#include "db/db_iter.h" - -#include "db/filename.h" -#include "db/dbformat.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "port/port.h" -#include "util/logging.h" -#include "util/mutexlock.h" - -namespace leveldb { - -#if 0 -static void DumpInternalIter(Iterator* iter) { - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ParsedInternalKey k; - if (!ParseInternalKey(iter->key(), &k)) { - fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); - } else { - fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); - } - } -} -#endif - -namespace { - -// Memtables and sstables that make the DB representation contain -// (userkey,seq,type) => uservalue entries. DBIter -// combines multiple entries for the same userkey found in the DB -// representation into a single entry while accounting for sequence -// numbers, deletion markers, overwrites, etc. -class DBIter: public Iterator { - public: - // Which direction is the iterator currently moving? - // (1) When moving forward, the internal iterator is positioned at - // the exact entry that yields this->key(), this->value() - // (2) When moving backwards, the internal iterator is positioned - // just before all entries whose user key == this->key(). - enum Direction { - kForward, - kReverse - }; - - DBIter(const std::string* dbname, Env* env, - const Comparator* cmp, Iterator* iter, SequenceNumber s) - : dbname_(dbname), - env_(env), - user_comparator_(cmp), - iter_(iter), - sequence_(s), - direction_(kForward), - valid_(false) { - } - virtual ~DBIter() { - delete iter_; - } - virtual bool Valid() const { return valid_; } - virtual Slice key() const { - assert(valid_); - return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_; - } - virtual Slice value() const { - assert(valid_); - return (direction_ == kForward) ? iter_->value() : saved_value_; - } - virtual Status status() const { - if (status_.ok()) { - return iter_->status(); - } else { - return status_; - } - } - - virtual void Next(); - virtual void Prev(); - virtual void Seek(const Slice& target); - virtual void SeekToFirst(); - virtual void SeekToLast(); - - private: - void FindNextUserEntry(bool skipping, std::string* skip); - void FindPrevUserEntry(); - bool ParseKey(ParsedInternalKey* key); - - inline void SaveKey(const Slice& k, std::string* dst) { - dst->assign(k.data(), k.size()); - } - - inline void ClearSavedValue() { - if (saved_value_.capacity() > 1048576) { - std::string empty; - swap(empty, saved_value_); - } else { - saved_value_.clear(); - } - } - - const std::string* const dbname_; - Env* const env_; - const Comparator* const user_comparator_; - Iterator* const iter_; - SequenceNumber const sequence_; - - Status status_; - std::string saved_key_; // == current key when direction_==kReverse - std::string saved_value_; // == current raw value when direction_==kReverse - Direction direction_; - bool valid_; - - // No copying allowed - DBIter(const DBIter&); - void operator=(const DBIter&); -}; - -inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { - if (!ParseInternalKey(iter_->key(), ikey)) { - status_ = Status::Corruption("corrupted internal key in DBIter"); - return false; - } else { - return true; - } -} - -void DBIter::Next() { - assert(valid_); - - if (direction_ == kReverse) { // Switch directions? - direction_ = kForward; - // iter_ is pointing just before the entries for this->key(), - // so advance into the range of entries for this->key() and then - // use the normal skipping code below. 
- if (!iter_->Valid()) { - iter_->SeekToFirst(); - } else { - iter_->Next(); - } - if (!iter_->Valid()) { - valid_ = false; - saved_key_.clear(); - return; - } - } - - // Temporarily use saved_key_ as storage for key to skip. - std::string* skip = &saved_key_; - SaveKey(ExtractUserKey(iter_->key()), skip); - FindNextUserEntry(true, skip); -} - -void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { - // Loop until we hit an acceptable entry to yield - assert(iter_->Valid()); - assert(direction_ == kForward); - do { - ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - switch (ikey.type) { - case kTypeDeletion: - // Arrange to skip all upcoming entries for this key since - // they are hidden by this deletion. - SaveKey(ikey.user_key, skip); - skipping = true; - break; - case kTypeValue: - if (skipping && - user_comparator_->Compare(ikey.user_key, *skip) <= 0) { - // Entry hidden - } else { - valid_ = true; - saved_key_.clear(); - return; - } - break; - } - } - iter_->Next(); - } while (iter_->Valid()); - saved_key_.clear(); - valid_ = false; -} - -void DBIter::Prev() { - assert(valid_); - - if (direction_ == kForward) { // Switch directions? - // iter_ is pointing at the current entry. Scan backwards until - // the key changes so we can use the normal reverse scanning code. - assert(iter_->Valid()); // Otherwise valid_ would have been false - SaveKey(ExtractUserKey(iter_->key()), &saved_key_); - while (true) { - iter_->Prev(); - if (!iter_->Valid()) { - valid_ = false; - saved_key_.clear(); - ClearSavedValue(); - return; - } - if (user_comparator_->Compare(ExtractUserKey(iter_->key()), - saved_key_) < 0) { - break; - } - } - direction_ = kReverse; - } - - FindPrevUserEntry(); -} - -void DBIter::FindPrevUserEntry() { - assert(direction_ == kReverse); - - ValueType value_type = kTypeDeletion; - if (iter_->Valid()) { - SaveKey(ExtractUserKey(iter_->key()), &saved_key_); - do { - ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - if ((value_type != kTypeDeletion) && - user_comparator_->Compare(ikey.user_key, saved_key_) < 0) { - // We encountered a non-deleted value in entries for previous keys, - break; - } - value_type = ikey.type; - if (value_type == kTypeDeletion) { - ClearSavedValue(); - } else { - Slice raw_value = iter_->value(); - if (saved_value_.capacity() > raw_value.size() + 1048576) { - std::string empty; - swap(empty, saved_value_); - } - saved_value_.assign(raw_value.data(), raw_value.size()); - } - } - iter_->Prev(); - } while (iter_->Valid()); - } - - if (value_type == kTypeDeletion) { - // End - valid_ = false; - saved_key_.clear(); - ClearSavedValue(); - direction_ = kForward; - } else { - valid_ = true; - } -} - -void DBIter::Seek(const Slice& target) { - direction_ = kForward; - ClearSavedValue(); - saved_key_.clear(); - AppendInternalKey( - &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); - iter_->Seek(saved_key_); - if (iter_->Valid()) { - FindNextUserEntry(false, &saved_key_ /* temporary storage */); - } else { - valid_ = false; - } -} - -void DBIter::SeekToFirst() { - direction_ = kForward; - ClearSavedValue(); - iter_->SeekToFirst(); - if (iter_->Valid()) { - FindNextUserEntry(false, &saved_key_ /* temporary storage */); - } else { - valid_ = false; - } -} - -void DBIter::SeekToLast() { - direction_ = kReverse; - ClearSavedValue(); - iter_->SeekToLast(); - FindPrevUserEntry(); -} - -} // anonymous namespace - -Iterator* NewDBIterator( - const std::string* dbname, - Env* env, - 
const Comparator* user_key_comparator, - Iterator* internal_iter, - const SequenceNumber& sequence) { - return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence); -} - -} diff --git a/leveldb/db/db_iter.h b/leveldb/db/db_iter.h deleted file mode 100644 index 195f3d3..0000000 --- a/leveldb/db/db_iter.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_ -#define STORAGE_LEVELDB_DB_DB_ITER_H_ - -#include -#include "leveldb/db.h" -#include "db/dbformat.h" - -namespace leveldb { - -// Return a new iterator that converts internal keys (yielded by -// "*internal_iter") that were live at the specified "sequence" number -// into appropriate user keys. -extern Iterator* NewDBIterator( - const std::string* dbname, - Env* env, - const Comparator* user_key_comparator, - Iterator* internal_iter, - const SequenceNumber& sequence); - -} - -#endif // STORAGE_LEVELDB_DB_DB_ITER_H_ diff --git a/leveldb/db/db_test.cc b/leveldb/db/db_test.cc deleted file mode 100644 index f828e3d..0000000 --- a/leveldb/db/db_test.cc +++ /dev/null @@ -1,1030 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/db.h" - -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} - -class DBTest { - public: - std::string dbname_; - Env* env_; - DB* db_; - - Options last_options_; - - DBTest() : env_(Env::Default()) { - dbname_ = test::TmpDir() + "/db_test"; - DestroyDB(dbname_, Options()); - db_ = NULL; - Reopen(); - } - - ~DBTest() { - delete db_; - DestroyDB(dbname_, Options()); - } - - DBImpl* dbfull() { - return reinterpret_cast(db_); - } - - void Reopen(Options* options = NULL) { - ASSERT_OK(TryReopen(options)); - } - - void DestroyAndReopen(Options* options = NULL) { - delete db_; - db_ = NULL; - DestroyDB(dbname_, Options()); - ASSERT_OK(TryReopen(options)); - } - - Status TryReopen(Options* options) { - delete db_; - db_ = NULL; - Options opts; - if (options != NULL) { - opts = *options; - } else { - opts.create_if_missing = true; - } - last_options_ = opts; - - return DB::Open(opts, dbname_, &db_); - } - - Status Put(const std::string& k, const std::string& v) { - return db_->Put(WriteOptions(), k, v); - } - - Status Delete(const std::string& k) { - return db_->Delete(WriteOptions(), k); - } - - std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { - ReadOptions options; - options.snapshot = snapshot; - std::string result; - Status s = db_->Get(options, k, &result); - if (s.IsNotFound()) { - result = "NOT_FOUND"; - } else if (!s.ok()) { - result = s.ToString(); - } - return result; - } - - std::string AllEntriesFor(const Slice& user_key) { - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); - iter->Seek(target.Encode()); - std::string result; - if 
(!iter->status().ok()) { - result = iter->status().ToString(); - } else { - result = "[ "; - bool first = true; - while (iter->Valid()) { - ParsedInternalKey ikey; - if (!ParseInternalKey(iter->key(), &ikey)) { - result += "CORRUPTED"; - } else { - if (last_options_.comparator->Compare( - ikey.user_key, user_key) != 0) { - break; - } - if (!first) { - result += ", "; - } - first = false; - switch (ikey.type) { - case kTypeValue: - result += iter->value().ToString(); - break; - case kTypeDeletion: - result += "DEL"; - break; - } - } - iter->Next(); - } - if (!first) { - result += " "; - } - result += "]"; - } - delete iter; - return result; - } - - int NumTableFilesAtLevel(int level) { - std::string property; - ASSERT_TRUE( - db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level), - &property)); - return atoi(property.c_str()); - } - - uint64_t Size(const Slice& start, const Slice& limit) { - Range r(start, limit); - uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); - return size; - } - - void Compact(const Slice& start, const Slice& limit) { - dbfull()->TEST_CompactMemTable(); - int max_level_with_files = 1; - for (int level = 1; level < config::kNumLevels; level++) { - if (NumTableFilesAtLevel(level) > 0) { - max_level_with_files = level; - } - } - for (int level = 0; level < max_level_with_files; level++) { - dbfull()->TEST_CompactRange(level, "", "~"); - } - } - - void DumpFileCounts(const char* label) { - fprintf(stderr, "---\n%s:\n", label); - fprintf(stderr, "maxoverlap: %lld\n", - static_cast( - dbfull()->TEST_MaxNextLevelOverlappingBytes())); - for (int level = 0; level < config::kNumLevels; level++) { - int num = NumTableFilesAtLevel(level); - if (num > 0) { - fprintf(stderr, " level %3d : %d files\n", level, num); - } - } - } - - std::string IterStatus(Iterator* iter) { - std::string result; - if (iter->Valid()) { - result = iter->key().ToString() + "->" + iter->value().ToString(); - } else { - result = "(invalid)"; - } - return result; - } -}; - -TEST(DBTest, Empty) { - ASSERT_TRUE(db_ != NULL); - ASSERT_EQ("NOT_FOUND", Get("foo")); -} - -TEST(DBTest, ReadWrite) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - ASSERT_EQ("v3", Get("foo")); - ASSERT_EQ("v2", Get("bar")); -} - -TEST(DBTest, PutDeleteGet) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - ASSERT_OK(db_->Delete(WriteOptions(), "foo")); - ASSERT_EQ("NOT_FOUND", Get("foo")); -} - -TEST(DBTest, IterEmpty) { - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("foo"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterSingle) { - ASSERT_OK(Put("a", "va")); - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - 
ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterMulti) { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", "vb")); - ASSERT_OK(Put("c", "vc")); - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("ax"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Seek("z"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - // Switch from reverse to forward - iter->SeekToLast(); - iter->Prev(); - iter->Prev(); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - // Switch from forward to reverse - iter->SeekToFirst(); - iter->Next(); - iter->Next(); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - // Make sure iter stays at snapshot - ASSERT_OK(Put("a", "va2")); - ASSERT_OK(Put("a2", "va3")); - ASSERT_OK(Put("b", "vb2")); - ASSERT_OK(Put("c", "vc2")); - ASSERT_OK(Delete("b")); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterSmallAndLargeMix) { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", std::string(100000, 'b'))); - ASSERT_OK(Put("c", "vc")); - ASSERT_OK(Put("d", std::string(100000, 'd'))); - ASSERT_OK(Put("e", std::string(100000, 'e'))); - - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - 
iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, Recover) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("baz", "v5")); - - Reopen(); - ASSERT_EQ("v1", Get("foo")); - - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v5", Get("baz")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - - Reopen(); - ASSERT_EQ("v3", Get("foo")); - ASSERT_OK(Put("foo", "v4")); - ASSERT_EQ("v4", Get("foo")); - ASSERT_EQ("v2", Get("bar")); - ASSERT_EQ("v5", Get("baz")); -} - -TEST(DBTest, RecoveryWithEmptyLog) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("foo", "v2")); - Reopen(); - Reopen(); - ASSERT_OK(Put("foo", "v3")); - Reopen(); - ASSERT_EQ("v3", Get("foo")); -} - -static std::string Key(int i) { - char buf[100]; - snprintf(buf, sizeof(buf), "key%06d", i); - return std::string(buf); -} - -TEST(DBTest, MinorCompactionsHappen) { - Options options; - options.write_buffer_size = 10000; - Reopen(&options); - - const int N = 500; - - int starting_num_tables = NumTableFilesAtLevel(0); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); - } - int ending_num_tables = NumTableFilesAtLevel(0); - ASSERT_GT(ending_num_tables, starting_num_tables); - - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); - } - - Reopen(); - - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); - } -} - -TEST(DBTest, RecoverWithLargeLog) { - { - Options options; - Reopen(&options); - ASSERT_OK(Put("big1", std::string(200000, '1'))); - ASSERT_OK(Put("big2", std::string(200000, '2'))); - ASSERT_OK(Put("small3", std::string(10, '3'))); - ASSERT_OK(Put("small4", std::string(10, '4'))); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - } - - // Make sure that if we re-open with a small write buffer size that - // we flush table files in the middle of a large log file. - Options options; - options.write_buffer_size = 100000; - Reopen(&options); - ASSERT_EQ(NumTableFilesAtLevel(0), 3); - ASSERT_EQ(std::string(200000, '1'), Get("big1")); - ASSERT_EQ(std::string(200000, '2'), Get("big2")); - ASSERT_EQ(std::string(10, '3'), Get("small3")); - ASSERT_EQ(std::string(10, '4'), Get("small4")); - ASSERT_GT(NumTableFilesAtLevel(0), 1); -} - -TEST(DBTest, CompactionsGenerateMultipleFiles) { - Options options; - options.write_buffer_size = 100000000; // Large write buffer - Reopen(&options); - - Random rnd(301); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - std::vector<std::string> values; - for (int i = 0; i < 80; i++) { - values.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put(Key(i), values[i])); - } - - // Reopening moves updates to level-0 - Reopen(&options); - dbfull()->TEST_CompactRange(0, "", Key(100000)); - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 1); - for (int i = 0; i < 80; i++) { - ASSERT_EQ(Get(Key(i)), values[i]); - } -} - -TEST(DBTest, SparseMerge) { - Options options; - options.compression = kNoCompression; - Reopen(&options); - - // Suppose there is: - // small amount of data with prefix A - // large amount of data with prefix B - // small amount of data with prefix C - // and that recent updates have made small changes to all three prefixes. - // Check that we do not do a compaction that merges all of B in one shot.
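The tests above steer flushing purely through options.write_buffer_size: a small buffer forces many memtable-to-level-0 flushes, a large one defers them. A sketch of the same knob through the public API, assuming the headers from this tree and an arbitrary scratch path (/tmp/small_buffer_db); the property string is the one NumTableFilesAtLevel() polls above:

    #include <assert.h>
    #include <stdio.h>
    #include <string>
    #include "leveldb/db.h"

    int main() {
      leveldb::Options options;
      options.create_if_missing = true;
      options.write_buffer_size = 10000;   // tiny buffer, as in MinorCompactionsHappen
      leveldb::DB* db;
      leveldb::Status s = leveldb::DB::Open(options, "/tmp/small_buffer_db", &db);
      assert(s.ok());
      for (int i = 0; i < 500; i++) {
        char key[32];
        snprintf(key, sizeof(key), "key%06d", i);
        db->Put(leveldb::WriteOptions(), key, std::string(1000, 'v'));
      }
      std::string num;
      if (db->GetProperty("leveldb.num-files-at-level0", &num)) {
        printf("level-0 files: %s\n", num.c_str());  // > 0 once flushes kicked in
      }
      delete db;
      return 0;
    }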
- const std::string value(1000, 'x'); - Put("A", "va"); - // Write approximately 100MB of "B" values - for (int i = 0; i < 100000; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(key, value); - } - Put("C", "vc"); - Compact("", "z"); - - // Make sparse update - Put("A", "va2"); - Put("B100", "bvalue2"); - Put("C", "vc2"); - dbfull()->TEST_CompactMemTable(); - - // Compactions should not cause us to create a situation where - // a file overlaps too much data at the next level. - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(0, "", "z"); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); -} - -static bool Between(uint64_t val, uint64_t low, uint64_t high) { - bool result = (val >= low) && (val <= high); - if (!result) { - fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", - (unsigned long long)(val), - (unsigned long long)(low), - (unsigned long long)(high)); - } - return result; -} - -TEST(DBTest, ApproximateSizes) { - Options options; - options.write_buffer_size = 100000000; // Large write buffer - options.compression = kNoCompression; - DestroyAndReopen(); - - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - Reopen(&options); - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - const int N = 80; - Random rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); - } - - // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); - - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); - - for (int compact_start = 0; compact_start < N; compact_start += 10) { - for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); - ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), - 100000 * (i+1), 100000 * (i+1) + 10000)); - ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), - 100000 * 10, 100000 * 10 + 10000)); - } - ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); - ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); - - dbfull()->TEST_CompactRange(0, - Key(compact_start), - Key(compact_start + 9)); - } - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 0); - } -} - -TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { - Options options; - options.compression = kNoCompression; - Reopen(); - - Random rnd(301); - std::string big1 = RandomString(&rnd, 100000); - ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(2), big1)); - ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(4), big1)); - ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); - ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); - - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); - - ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); - ASSERT_TRUE(Between(Size("", 
Key(5)), 230000, 231000)); - ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); - ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); - ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000)); - - ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); - - dbfull()->TEST_CompactRange(0, Key(0), Key(100)); - } -} - -TEST(DBTest, IteratorPinsRef) { - Put("foo", "hello"); - - // Get iterator that will yield the current contents of the DB. - Iterator* iter = db_->NewIterator(ReadOptions()); - - // Write to force compactions - Put("foo", "newvalue1"); - for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values - } - Put("foo", "newvalue2"); - - iter->SeekToFirst(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("hello", iter->value().ToString()); - iter->Next(); - ASSERT_TRUE(!iter->Valid()); - delete iter; -} - -TEST(DBTest, Snapshot) { - Put("foo", "v1"); - const Snapshot* s1 = db_->GetSnapshot(); - Put("foo", "v2"); - const Snapshot* s2 = db_->GetSnapshot(); - Put("foo", "v3"); - const Snapshot* s3 = db_->GetSnapshot(); - - Put("foo", "v4"); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v3", Get("foo", s3)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s3); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s1); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s2); - ASSERT_EQ("v4", Get("foo")); -} - -TEST(DBTest, HiddenValuesAreRemoved) { - Random rnd(301); - std::string big = RandomString(&rnd, 50000); - Put("foo", big); - Put("pastfoo", "v"); - const Snapshot* snapshot = db_->GetSnapshot(); - Put("foo", "tiny"); - Put("pastfoo2", "v2"); // Advance sequence number one more - - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_GT(NumTableFilesAtLevel(0), 0); - - ASSERT_EQ(big, Get("foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); - db_->ReleaseSnapshot(snapshot); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); - dbfull()->TEST_CompactRange(0, "", "x"); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GE(NumTableFilesAtLevel(1), 1); - dbfull()->TEST_CompactRange(1, "", "x"); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - - ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); -} - -TEST(DBTest, DeletionMarkers1) { - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - dbfull()->TEST_CompactRange(0, "", "z"); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file - Delete("foo"); - Put("foo", "v2"); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - dbfull()->TEST_CompactRange(0, "", "z"); - // DEL eliminated, but v1 remains because we aren't compacting that level - // (DEL can be eliminated because v2 hides v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(1, "", "z"); - // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. - // (as is v1). 
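As the comments above spell out, a Delete is logically immediate but physically lazy: readers see NOT_FOUND as soon as the kTypeDeletion entry is written, while the obsolete value and eventually the marker itself are only dropped when compaction reaches the base level for that key. A minimal sketch against the public API (the scratch path /tmp/tombstone_db is an arbitrary choice):

    #include <assert.h>
    #include <string>
    #include "leveldb/db.h"

    int main() {
      leveldb::DB* db;
      leveldb::Options options;
      options.create_if_missing = true;
      leveldb::Status open_s = leveldb::DB::Open(options, "/tmp/tombstone_db", &db);
      assert(open_s.ok());
      db->Put(leveldb::WriteOptions(), "foo", "v1");
      db->Delete(leveldb::WriteOptions(), "foo");  // writes a kTypeDeletion entry
      std::string value;
      leveldb::Status s = db->Get(leveldb::ReadOptions(), "foo", &value);
      assert(s.IsNotFound());  // the deletion is visible to reads immediately
      delete db;               // old "v1" and the marker linger until compaction
      return 0;
    }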
- ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); -} - -TEST(DBTest, DeletionMarkers2) { - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - dbfull()->TEST_CompactRange(0, "", "z"); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file - Delete("foo"); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(0, "", "z"); - // DEL kept: L2 file overlaps - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(1, "", "z"); - // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. - // (as is v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); -} - -TEST(DBTest, ComparatorCheck) { - class NewComparator : public Comparator { - public: - virtual const char* Name() const { return "leveldb.NewComparator"; } - virtual int Compare(const Slice& a, const Slice& b) const { - return BytewiseComparator()->Compare(a, b); - } - virtual void FindShortestSeparator(std::string* s, const Slice& l) const { - BytewiseComparator()->FindShortestSeparator(s, l); - } - virtual void FindShortSuccessor(std::string* key) const { - BytewiseComparator()->FindShortSuccessor(key); - } - }; - NewComparator cmp; - Options new_options; - new_options.comparator = &cmp; - Status s = TryReopen(&new_options); - ASSERT_TRUE(!s.ok()); - ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) - << s.ToString(); -} - -TEST(DBTest, DBOpen_Options) { - std::string dbname = test::TmpDir() + "/db_options_test"; - DestroyDB(dbname, Options()); - - // Does not exist, and create_if_missing == false: error - DB* db = NULL; - Options opts; - opts.create_if_missing = false; - Status s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL); - ASSERT_TRUE(db == NULL); - - // Does not exist, and create_if_missing == true: OK - opts.create_if_missing = true; - s = DB::Open(opts, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != NULL); - - delete db; - db = NULL; - - // Does exist, and error_if_exists == true: error - opts.create_if_missing = false; - opts.error_if_exists = true; - s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL); - ASSERT_TRUE(db == NULL); - - // Does exist, and error_if_exists == false: OK - opts.create_if_missing = true; - opts.error_if_exists = false; - s = DB::Open(opts, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != NULL); - - delete db; - db = NULL; -} - -class ModelDB: public DB { - public: - explicit ModelDB(const Options& options): options_(options) { } - ~ModelDB() { } - virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { - return DB::Put(o, k, v); - } - virtual Status Delete(const WriteOptions& o, const Slice& key) { - return DB::Delete(o, key); - } - virtual Status Get(const ReadOptions& options, - const Slice& key, std::string* value) { - assert(false); // Not implemented - return Status::NotFound(key); - } - virtual Iterator* NewIterator(const ReadOptions& options) { - if (options.snapshot == NULL) { - KVMap* saved = new KVMap; - *saved = map_; - return new ModelIter(saved, true); - } else { - const KVMap* snapshot_state = - reinterpret_cast(options.snapshot->number_); - return new ModelIter(snapshot_state, false); - } - } - virtual const Snapshot* GetSnapshot() { - KVMap* saved = new KVMap; - *saved = map_; - return snapshots_.New( - 
reinterpret_cast<SequenceNumber>(saved)); - } - - virtual void ReleaseSnapshot(const Snapshot* snapshot) { - const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_); - delete saved; - snapshots_.Delete(snapshot); - } - virtual Status Write(const WriteOptions& options, WriteBatch* batch) { - assert(options.post_write_snapshot == NULL); // Not supported - for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeValue: - map_[it.key().ToString()] = it.value().ToString(); - break; - case kTypeDeletion: - map_.erase(it.key().ToString()); - break; - } - } - return Status::OK(); - } - - virtual bool GetProperty(const Slice& property, std::string* value) { - return false; - } - virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { - for (int i = 0; i < n; i++) { - sizes[i] = 0; - } - } - private: - typedef std::map<std::string, std::string> KVMap; - class ModelIter: public Iterator { - public: - ModelIter(const KVMap* map, bool owned) - : map_(map), owned_(owned), iter_(map_->end()) { - } - ~ModelIter() { - if (owned_) delete map_; - } - virtual bool Valid() const { return iter_ != map_->end(); } - virtual void SeekToFirst() { iter_ = map_->begin(); } - virtual void SeekToLast() { - if (map_->empty()) { - iter_ = map_->end(); - } else { - iter_ = map_->find(map_->rbegin()->first); - } - } - virtual void Seek(const Slice& k) { - iter_ = map_->lower_bound(k.ToString()); - } - virtual void Next() { ++iter_; } - virtual void Prev() { --iter_; } - virtual Slice key() const { return iter_->first; } - virtual Slice value() const { return iter_->second; } - virtual Status status() const { return Status::OK(); } - private: - const KVMap* const map_; - const bool owned_; // Do we own map_ - KVMap::const_iterator iter_; - }; - const Options options_; - KVMap map_; - SnapshotList snapshots_; -}; - -static std::string RandomKey(Random* rnd) { - int len = (rnd->OneIn(3) - ? 1 // Short sometimes to encourage collisions - : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); - return test::RandomKey(rnd, len); -} - -static bool CompareIterators(int step, - DB* model, - DB* db, - const Snapshot* model_snap, - const Snapshot* db_snap) { - ReadOptions options; - options.snapshot = model_snap; - Iterator* miter = model->NewIterator(options); - options.snapshot = db_snap; - Iterator* dbiter = db->NewIterator(options); - bool ok = true; - int count = 0; - for (miter->SeekToFirst(), dbiter->SeekToFirst(); - ok && miter->Valid() && dbiter->Valid(); - miter->Next(), dbiter->Next()) { - count++; - if (miter->key().compare(dbiter->key()) != 0) { - fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(dbiter->key()).c_str()); - ok = false; - break; - } - - if (miter->value().compare(dbiter->value()) != 0) { - fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(miter->value()).c_str(), - EscapeString(dbiter->value()).c_str()); - ok = false; - } - } - - if (ok) { - if (miter->Valid() != dbiter->Valid()) { - fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs.
%d\n", - step, miter->Valid(), dbiter->Valid()); - ok = false; - } - } - fprintf(stderr, "%d entries compared: ok=%d\n", count, ok); - delete miter; - delete dbiter; - return ok; -} - -TEST(DBTest, Randomized) { - Random rnd(test::RandomSeed()); - ModelDB model(last_options_); - const int N = 10000; - const Snapshot* model_snap = NULL; - const Snapshot* db_snap = NULL; - std::string k, v; - for (int step = 0; step < N; step++) { - if (step % 100 == 0) { - fprintf(stderr, "Step %d of %d\n", step, N); - } - int p = rnd.Uniform(100); - if (p < 45) { // Put - k = RandomKey(&rnd); - v = RandomString(&rnd, - rnd.OneIn(20) - ? 100 + rnd.Uniform(100) - : rnd.Uniform(8)); - ASSERT_OK(model.Put(WriteOptions(), k, v)); - ASSERT_OK(db_->Put(WriteOptions(), k, v)); - - } else if (p < 90) { // Delete - k = RandomKey(&rnd); - ASSERT_OK(model.Delete(WriteOptions(), k)); - ASSERT_OK(db_->Delete(WriteOptions(), k)); - - - } else { // Multi-element batch - WriteBatch b; - const int num = rnd.Uniform(8); - for (int i = 0; i < num; i++) { - if (i == 0 || !rnd.OneIn(10)) { - k = RandomKey(&rnd); - } else { - // Periodically re-use the same key from the previous iter, so - // we have multiple entries in the write batch for the same key - } - if (rnd.OneIn(2)) { - v = RandomString(&rnd, rnd.Uniform(10)); - b.Put(k, v); - } else { - b.Delete(k); - } - } - ASSERT_OK(model.Write(WriteOptions(), &b)); - ASSERT_OK(db_->Write(WriteOptions(), &b)); - } - - if ((step % 100) == 0) { - ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); - ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); - // Save a snapshot from each DB this time that we'll use next - // time we compare things, to make sure the current state is - // preserved with the snapshot - if (model_snap != NULL) model.ReleaseSnapshot(model_snap); - if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); - - Reopen(); - ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); - - model_snap = model.GetSnapshot(); - db_snap = db_->GetSnapshot(); - } - } - if (model_snap != NULL) model.ReleaseSnapshot(model_snap); - if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/dbformat.cc b/leveldb/db/dbformat.cc deleted file mode 100644 index c12c138..0000000 --- a/leveldb/db/dbformat.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include -#include "db/dbformat.h" -#include "port/port.h" -#include "util/coding.h" - -namespace leveldb { - -static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { - assert(seq <= kMaxSequenceNumber); - assert(t <= kValueTypeForSeek); - return (seq << 8) | t; -} - -void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { - result->append(key.user_key.data(), key.user_key.size()); - PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); -} - -std::string ParsedInternalKey::DebugString() const { - char buf[50]; - snprintf(buf, sizeof(buf), "' @ %llu : %d", - (unsigned long long) sequence, - int(type)); - std::string result = "'"; - result += user_key.ToString(); - result += buf; - return result; -} - -const char* InternalKeyComparator::Name() const { - return "leveldb.InternalKeyComparator"; -} - -int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { - // Order by: - // increasing user key (according to user-supplied comparator) - // decreasing sequence number - // decreasing type (though sequence# should be enough to disambiguate) - int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); - if (r == 0) { - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); - if (anum > bnum) { - r = -1; - } else if (anum < bnum) { - r = +1; - } - } - return r; -} - -void InternalKeyComparator::FindShortestSeparator( - std::string* start, - const Slice& limit) const { - // Attempt to shorten the user portion of the key - Slice user_start = ExtractUserKey(*start); - Slice user_limit = ExtractUserKey(limit); - std::string tmp(user_start.data(), user_start.size()); - user_comparator_->FindShortestSeparator(&tmp, user_limit); - if (user_comparator_->Compare(*start, tmp) < 0) { - // User key has become larger. Tack on the earliest possible - // number to the shortened user key. - PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); - assert(this->Compare(*start, tmp) < 0); - assert(this->Compare(tmp, limit) < 0); - start->swap(tmp); - } -} - -void InternalKeyComparator::FindShortSuccessor(std::string* key) const { - Slice user_key = ExtractUserKey(*key); - std::string tmp(user_key.data(), user_key.size()); - user_comparator_->FindShortSuccessor(&tmp); - if (user_comparator_->Compare(user_key, tmp) < 0) { - // User key has become larger. Tack on the earliest possible - // number to the shortened user key. - PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); - assert(this->Compare(*key, tmp) < 0); - key->swap(tmp); - } -} - -} diff --git a/leveldb/db/dbformat.h b/leveldb/db/dbformat.h deleted file mode 100644 index d583665..0000000 --- a/leveldb/db/dbformat.h +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ -#define STORAGE_LEVELDB_DB_FORMAT_H_ - -#include -#include "leveldb/comparator.h" -#include "leveldb/db.h" -#include "leveldb/slice.h" -#include "leveldb/table_builder.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -// Grouping of constants. We may want to make some of these -// parameters set via options. 
-namespace config { -static const int kNumLevels = 7; -} - -class InternalKey; - -// Value types encoded as the last component of internal keys. -// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk -// data structures. -enum ValueType { - kTypeDeletion = 0x0, - kTypeValue = 0x1, -}; -// kValueTypeForSeek defines the ValueType that should be passed when -// constructing a ParsedInternalKey object for seeking to a particular -// sequence number (since we sort sequence numbers in decreasing order -// and the value type is embedded as the low 8 bits in the sequence -// number in internal keys, we need to use the highest-numbered -// ValueType, not the lowest). -static const ValueType kValueTypeForSeek = kTypeValue; - -typedef uint64_t SequenceNumber; - -// We leave eight bits empty at the bottom so a type and sequence# -// can be packed together into 64-bits. -static const SequenceNumber kMaxSequenceNumber = - ((0x1ull << 56) - 1); - -struct ParsedInternalKey { - Slice user_key; - SequenceNumber sequence; - ValueType type; - - ParsedInternalKey() { } // Intentionally left uninitialized (for speed) - ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) - : user_key(u), sequence(seq), type(t) { } - std::string DebugString() const; -}; - -// Return the length of the encoding of "key". -inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { - return key.user_key.size() + 8; -} - -// Append the serialization of "key" to *result. -extern void AppendInternalKey(std::string* result, - const ParsedInternalKey& key); - -// Attempt to parse an internal key from "internal_key". On success, -// stores the parsed data in "*result", and returns true. -// -// On error, returns false, leaves "*result" in an undefined state. -extern bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result); - -// Returns the user key portion of an internal key. -inline Slice ExtractUserKey(const Slice& internal_key) { - assert(internal_key.size() >= 8); - return Slice(internal_key.data(), internal_key.size() - 8); -} - -inline ValueType ExtractValueType(const Slice& internal_key) { - assert(internal_key.size() >= 8); - const size_t n = internal_key.size(); - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); - unsigned char c = num & 0xff; - return static_cast(c); -} - -// A comparator for internal keys that uses a specified comparator for -// the user key portion and breaks ties by decreasing sequence number. -class InternalKeyComparator : public Comparator { - private: - const Comparator* user_comparator_; - public: - explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { } - virtual const char* Name() const; - virtual int Compare(const Slice& a, const Slice& b) const; - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const; - virtual void FindShortSuccessor(std::string* key) const; - - const Comparator* user_comparator() const { return user_comparator_; } - - int Compare(const InternalKey& a, const InternalKey& b) const; -}; - -// Modules in this directory should keep internal keys wrapped inside -// the following class instead of plain strings so that we do not -// incorrectly use string comparisons instead of an InternalKeyComparator. 
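Given the declarations above, two internal keys with the same user key order by decreasing sequence number, so the newest entry for a key is encountered first during a scan. A short usage sketch, assuming it is compiled and linked against the tree being deleted here:

    #include <assert.h>
    #include <string>
    #include "db/dbformat.h"
    #include "leveldb/comparator.h"

    int main() {
      using namespace leveldb;
      InternalKeyComparator cmp(BytewiseComparator());
      InternalKey newer("foo", 200, kTypeValue);
      InternalKey older("foo", 100, kTypeValue);
      assert(cmp.Compare(newer, older) < 0);            // newer sorts first
      assert(ExtractUserKey(newer.Encode()) == Slice("foo"));
      return 0;
    }

This newest-first ordering is exactly what DBIter relies on when it collapses the multiple entries for a user key down to the one visible at its snapshot.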
-class InternalKey { - private: - std::string rep_; - public: - InternalKey() { } // Leave rep_ as empty to indicate it is invalid - InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { - AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); - } - - void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } - Slice Encode() const { - assert(!rep_.empty()); - return rep_; - } - - Slice user_key() const { return ExtractUserKey(rep_); } - - void SetFrom(const ParsedInternalKey& p) { - rep_.clear(); - AppendInternalKey(&rep_, p); - } - - void Clear() { rep_.clear(); } -}; - -inline int InternalKeyComparator::Compare( - const InternalKey& a, const InternalKey& b) const { - return Compare(a.Encode(), b.Encode()); -} - -inline bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result) { - const size_t n = internal_key.size(); - if (n < 8) return false; - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); - unsigned char c = num & 0xff; - result->sequence = num >> 8; - result->type = static_cast<ValueType>(c); - result->user_key = Slice(internal_key.data(), n - 8); - return (c <= static_cast<unsigned char>(kTypeValue)); -} - -} - -#endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/leveldb/db/dbformat_test.cc b/leveldb/db/dbformat_test.cc deleted file mode 100644 index 57c5578..0000000 --- a/leveldb/db/dbformat_test.cc +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/dbformat.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -static std::string IKey(const std::string& user_key, - uint64_t seq, - ValueType vt) { - std::string encoded; - AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); - return encoded; -} - -static std::string Shorten(const std::string& s, const std::string& l) { - std::string result = s; - InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); - return result; -} - -static std::string ShortSuccessor(const std::string& s) { - std::string result = s; - InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); - return result; -} - -static void TestKey(const std::string& key, - uint64_t seq, - ValueType vt) { - std::string encoded = IKey(key, seq, vt); - - Slice in(encoded); - ParsedInternalKey decoded("", 0, kTypeValue); - - ASSERT_TRUE(ParseInternalKey(in, &decoded)); - ASSERT_EQ(key, decoded.user_key.ToString()); - ASSERT_EQ(seq, decoded.sequence); - ASSERT_EQ(vt, decoded.type); - - ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); -} - -class FormatTest { }; - -TEST(FormatTest, InternalKey_EncodeDecode) { - const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; - const uint64_t seq[] = { - 1, 2, 3, - (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, - (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, - (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 - }; - for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { - for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { - TestKey(keys[k], seq[s], kTypeValue); - TestKey("hello", 1, kTypeDeletion); - } - } -} - -TEST(FormatTest, InternalKeyShortSeparator) { - // When user keys are same - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 99, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100,
kTypeValue), - IKey("foo", 101, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeDeletion))); - - // When user keys are misordered - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("bar", 99, kTypeValue))); - - // When user keys are different, but correctly ordered - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - Shorten(IKey("foo", 100, kTypeValue), - IKey("hello", 200, kTypeValue))); - - // When start user key is prefix of limit user key - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foobar", 200, kTypeValue))); - - // When limit user key is prefix of start user key - ASSERT_EQ(IKey("foobar", 100, kTypeValue), - Shorten(IKey("foobar", 100, kTypeValue), - IKey("foo", 200, kTypeValue))); -} - -TEST(FormatTest, InternalKeyShortestSuccessor) { - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - ShortSuccessor(IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), - ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/filename.cc b/leveldb/db/filename.cc deleted file mode 100644 index b3a917c..0000000 --- a/leveldb/db/filename.cc +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include -#include "db/filename.h" -#include "db/dbformat.h" -#include "leveldb/env.h" -#include "util/logging.h" - -namespace leveldb { - -static std::string MakeFileName(const std::string& name, uint64_t number, - const char* suffix) { - char buf[100]; - snprintf(buf, sizeof(buf), "/%06llu.%s", - static_cast(number), - suffix); - return name + buf; -} - -std::string LogFileName(const std::string& name, uint64_t number) { - assert(number > 0); - return MakeFileName(name, number, "log"); -} - -std::string TableFileName(const std::string& name, uint64_t number) { - assert(number > 0); - return MakeFileName(name, number, "sst"); -} - -std::string DescriptorFileName(const std::string& dbname, uint64_t number) { - assert(number > 0); - char buf[100]; - snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", - static_cast(number)); - return dbname + buf; -} - -std::string CurrentFileName(const std::string& dbname) { - return dbname + "/CURRENT"; -} - -std::string LockFileName(const std::string& dbname) { - return dbname + "/LOCK"; -} - -std::string TempFileName(const std::string& dbname, uint64_t number) { - assert(number > 0); - return MakeFileName(dbname, number, "dbtmp"); -} - -std::string InfoLogFileName(const std::string& dbname) { - return dbname + "/LOG"; -} - -// Return the name of the old info log file for "dbname". 
-std::string OldInfoLogFileName(const std::string& dbname) { - return dbname + "/LOG.old"; -} - - -// Owned filenames have the form: -// dbname/CURRENT -// dbname/LOCK -// dbname/LOG -// dbname/LOG.old -// dbname/MANIFEST-[0-9]+ -// dbname/[0-9]+.(log|sst) -bool ParseFileName(const std::string& fname, - uint64_t* number, - FileType* type) { - Slice rest(fname); - if (rest == "CURRENT") { - *number = 0; - *type = kCurrentFile; - } else if (rest == "LOCK") { - *number = 0; - *type = kDBLockFile; - } else if (rest == "LOG" || rest == "LOG.old") { - *number = 0; - *type = kInfoLogFile; - } else if (rest.starts_with("MANIFEST-")) { - rest.remove_prefix(strlen("MANIFEST-")); - uint64_t num; - if (!ConsumeDecimalNumber(&rest, &num)) { - return false; - } - if (!rest.empty()) { - return false; - } - *type = kDescriptorFile; - *number = num; - } else { - // Avoid strtoull() to keep filename format independent of the - // current locale - uint64_t num; - if (!ConsumeDecimalNumber(&rest, &num)) { - return false; - } - Slice suffix = rest; - if (suffix == Slice(".log")) { - *type = kLogFile; - } else if (suffix == Slice(".sst")) { - *type = kTableFile; - } else if (suffix == Slice(".dbtmp")) { - *type = kTempFile; - } else { - return false; - } - *number = num; - } - return true; -} - -Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number) { - // Remove leading "dbname/" and add newline to manifest file name - std::string manifest = DescriptorFileName(dbname, descriptor_number); - Slice contents = manifest; - assert(contents.starts_with(dbname + "/")); - contents.remove_prefix(dbname.size() + 1); - std::string tmp = TempFileName(dbname, descriptor_number); - Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp); - if (s.ok()) { - s = env->RenameFile(tmp, CurrentFileName(dbname)); - } - if (!s.ok()) { - env->DeleteFile(tmp); - } - return s; -} - -} diff --git a/leveldb/db/filename.h b/leveldb/db/filename.h deleted file mode 100644 index 6a99744..0000000 --- a/leveldb/db/filename.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// File names used by DB code - -#ifndef STORAGE_LEVELDB_DB_FILENAME_H_ -#define STORAGE_LEVELDB_DB_FILENAME_H_ - -#include -#include -#include "leveldb/slice.h" -#include "leveldb/status.h" -#include "port/port.h" - -namespace leveldb { - -class Env; - -enum FileType { - kLogFile, - kDBLockFile, - kTableFile, - kDescriptorFile, - kCurrentFile, - kTempFile, - kInfoLogFile, // Either the current one, or an old one -}; - -// Return the name of the log file with the specified number -// in the db named by "dbname". The result will be prefixed with -// "dbname". -extern std::string LogFileName(const std::string& dbname, uint64_t number); - -// Return the name of the sstable with the specified number -// in the db named by "dbname". The result will be prefixed with -// "dbname". -extern std::string TableFileName(const std::string& dbname, uint64_t number); - -// Return the name of the descriptor file for the db named by -// "dbname" and the specified incarnation number. The result will be -// prefixed with "dbname". -extern std::string DescriptorFileName(const std::string& dbname, - uint64_t number); - -// Return the name of the current file. This file contains the name -// of the current manifest file. 
The result will be prefixed with -// "dbname". -extern std::string CurrentFileName(const std::string& dbname); - -// Return the name of the lock file for the db named by -// "dbname". The result will be prefixed with "dbname". -extern std::string LockFileName(const std::string& dbname); - -// Return the name of a temporary file owned by the db named "dbname". -// The result will be prefixed with "dbname". -extern std::string TempFileName(const std::string& dbname, uint64_t number); - -// Return the name of the info log file for "dbname". -extern std::string InfoLogFileName(const std::string& dbname); - -// Return the name of the old info log file for "dbname". -extern std::string OldInfoLogFileName(const std::string& dbname); - -// If filename is a leveldb file, store the type of the file in *type. -// The number encoded in the filename is stored in *number. If the -// filename was successfully parsed, returns true. Else return false. -extern bool ParseFileName(const std::string& filename, - uint64_t* number, - FileType* type); - -// Make the CURRENT file point to the descriptor file with the -// specified number. -extern Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number); - - -} - -#endif // STORAGE_LEVELDB_DB_FILENAME_H_ diff --git a/leveldb/db/filename_test.cc b/leveldb/db/filename_test.cc deleted file mode 100644 index 2f61e8d..0000000 --- a/leveldb/db/filename_test.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/filename.h" - -#include "db/dbformat.h" -#include "port/port.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -class FileNameTest { }; - -TEST(FileNameTest, Parse) { - Slice db; - FileType type; - uint64_t number; - - // Successful parses - static struct { - const char* fname; - uint64_t number; - FileType type; - } cases[] = { - { "100.log", 100, kLogFile }, - { "0.log", 0, kLogFile }, - { "0.sst", 0, kTableFile }, - { "CURRENT", 0, kCurrentFile }, - { "LOCK", 0, kDBLockFile }, - { "MANIFEST-2", 2, kDescriptorFile }, - { "MANIFEST-7", 7, kDescriptorFile }, - { "LOG", 0, kInfoLogFile }, - { "LOG.old", 0, kInfoLogFile }, - { "18446744073709551615.log", 18446744073709551615ull, kLogFile }, - }; - for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { - std::string f = cases[i].fname; - ASSERT_TRUE(ParseFileName(f, &number, &type)) << f; - ASSERT_EQ(cases[i].type, type) << f; - ASSERT_EQ(cases[i].number, number) << f; - } - - // Errors - static const char* errors[] = { - "", - "foo", - "foo-dx-100.log", - ".log", - "", - "manifest", - "CURREN", - "CURRENTX", - "MANIFES", - "MANIFEST", - "MANIFEST-", - "XMANIFEST-3", - "MANIFEST-3x", - "LOC", - "LOCKx", - "LO", - "LOGx", - "18446744073709551616.log", - "184467440737095516150.log", - "100", - "100.", - "100.lop" - }; - for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { - std::string f = errors[i]; - ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; - }; -} - -TEST(FileNameTest, Construction) { - uint64_t number; - FileType type; - std::string fname; - - fname = CurrentFileName("foo"); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(0, number); - ASSERT_EQ(kCurrentFile, type); - - fname = LockFileName("foo"); - ASSERT_EQ("foo/", 
std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(0, number); - ASSERT_EQ(kDBLockFile, type); - - fname = LogFileName("foo", 192); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(192, number); - ASSERT_EQ(kLogFile, type); - - fname = TableFileName("bar", 200); - ASSERT_EQ("bar/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(200, number); - ASSERT_EQ(kTableFile, type); - - fname = DescriptorFileName("bar", 100); - ASSERT_EQ("bar/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(100, number); - ASSERT_EQ(kDescriptorFile, type); - - fname = TempFileName("tmp", 999); - ASSERT_EQ("tmp/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(999, number); - ASSERT_EQ(kTempFile, type); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/log_format.h b/leveldb/db/log_format.h deleted file mode 100644 index 137cd4a..0000000 --- a/leveldb/db/log_format.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Log format information shared by reader and writer. -// See ../doc/log_format.txt for more detail. - -#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ -#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ - -namespace leveldb { -namespace log { - -enum RecordType { - // Zero is reserved for preallocated files - kZeroType = 0, - - kFullType = 1, - - // For fragments - kFirstType = 2, - kMiddleType = 3, - kLastType = 4, -}; -static const int kMaxRecordType = kLastType; - -static const int kBlockSize = 32768; - -// Header is checksum (4 bytes), type (1 byte), length (2 bytes). -static const int kHeaderSize = 4 + 1 + 2; - -} -} - -#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_ diff --git a/leveldb/db/log_reader.cc b/leveldb/db/log_reader.cc deleted file mode 100644 index 75e1d28..0000000 --- a/leveldb/db/log_reader.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
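A note before the reader code: the kHeaderSize comment in log_format.h above lists the fields as checksum, type, length, but the on-disk order the reader and writer actually use is checksum, then length, then type (the reader decodes length from bytes 4-5 and the type from byte 6, and the writer fills them the same way). A minimal standalone sketch of that layout, little-endian as in util/coding; the helper name here is illustrative, not part of the tree:

  #include <stdint.h>

  // Byte layout per db/log_format.h and the reader/writer code:
  //   bytes 0-3: masked crc32c of (type byte + payload), little-endian
  //   bytes 4-5: payload length, little-endian
  //   byte  6  : record type (kFullType, kFirstType, ...)
  void PackRecordHeader(char* buf, uint32_t masked_crc,
                        uint16_t length, uint8_t type) {
    buf[0] = static_cast<char>(masked_crc & 0xff);
    buf[1] = static_cast<char>((masked_crc >> 8) & 0xff);
    buf[2] = static_cast<char>((masked_crc >> 16) & 0xff);
    buf[3] = static_cast<char>((masked_crc >> 24) & 0xff);
    buf[4] = static_cast<char>(length & 0xff);
    buf[5] = static_cast<char>((length >> 8) & 0xff);
    buf[6] = static_cast<char>(type);
  }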
- -#include "db/log_reader.h" - -#include -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { -namespace log { - -Reader::Reporter::~Reporter() { -} - -Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum) - : file_(file), - reporter_(reporter), - checksum_(checksum), - backing_store_(new char[kBlockSize]), - buffer_(), - eof_(false) { -} - -Reader::~Reader() { - delete[] backing_store_; -} - -bool Reader::ReadRecord(Slice* record, std::string* scratch) { - scratch->clear(); - record->clear(); - bool in_fragmented_record = false; - - Slice fragment; - while (true) { - switch (ReadPhysicalRecord(&fragment)) { - case kFullType: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); - } - scratch->clear(); - *record = fragment; - return true; - - case kFirstType: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); - } - scratch->assign(fragment.data(), fragment.size()); - in_fragmented_record = true; - break; - - case kMiddleType: - if (!in_fragmented_record) { - ReportDrop(fragment.size(), "missing start of fragmented record"); - } else { - scratch->append(fragment.data(), fragment.size()); - } - break; - - case kLastType: - if (!in_fragmented_record) { - ReportDrop(fragment.size(), "missing start of fragmented record"); - } else { - scratch->append(fragment.data(), fragment.size()); - *record = Slice(*scratch); - return true; - } - break; - - case kEof: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); - scratch->clear(); - } - return false; - - case kBadRecord: - if (in_fragmented_record) { - ReportDrop(scratch->size(), "error in middle of record"); - in_fragmented_record = false; - scratch->clear(); - } - break; - - default: - ReportDrop( - (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), - "unknown record type"); - in_fragmented_record = false; - scratch->clear(); - break; - } - } - return false; -} - -void Reader::ReportDrop(size_t bytes, const char* reason) { - if (reporter_ != NULL) { - reporter_->Corruption(bytes, Status::Corruption(reason)); - } -} - -unsigned int Reader::ReadPhysicalRecord(Slice* result) { - while (true) { - if (buffer_.size() < kHeaderSize) { - if (!eof_) { - // Last read was a full read, so this is a trailer to skip - buffer_.clear(); - Status status = file_->Read(kBlockSize, &buffer_, backing_store_); - if (!status.ok()) { - if (reporter_ != NULL) { - reporter_->Corruption(kBlockSize, status); - } - buffer_.clear(); - eof_ = true; - return kEof; - } else if (buffer_.size() < kBlockSize) { - eof_ = true; - } - continue; - } else if (buffer_.size() == 0) { - // End of file - return kEof; - } else { - ReportDrop(buffer_.size(), "truncated record at end of file"); - buffer_.clear(); - return kEof; - } - } - - // Parse the header - const char* header = buffer_.data(); - const uint32_t a = static_cast(header[4]) & 0xff; - const uint32_t b = static_cast(header[5]) & 0xff; - const unsigned int type = header[6]; - const uint32_t length = a | (b << 8); - if (kHeaderSize + length > buffer_.size()) { - ReportDrop(buffer_.size(), "bad record length"); - buffer_.clear(); - return kBadRecord; - } - - // Check crc - if (checksum_) { - if (type == kZeroType && length == 0) { - // Skip zero length record without reporting any drops since - // such records are produced by the mmap based writing code in - // env_posix.cc that preallocates file regions. 
- buffer_.clear(); - return kBadRecord; - } - - uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); - uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); - if (actual_crc != expected_crc) { - // Drop the rest of the buffer since "length" itself may have - // been corrupted and if we trust it, we could find some - // fragment of a real log record that just happens to look - // like a valid log record. - ReportDrop(buffer_.size(), "checksum mismatch"); - buffer_.clear(); - return kBadRecord; - } - } - - buffer_.remove_prefix(kHeaderSize + length); - *result = Slice(header + kHeaderSize, length); - return type; - } -} - -} -} diff --git a/leveldb/db/log_reader.h b/leveldb/db/log_reader.h deleted file mode 100644 index baf1475..0000000 --- a/leveldb/db/log_reader.h +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ -#define STORAGE_LEVELDB_DB_LOG_READER_H_ - -#include "db/log_format.h" -#include "leveldb/slice.h" -#include "leveldb/status.h" - -namespace leveldb { - -class SequentialFile; - -namespace log { - -class Reader { - public: - // Interface for reporting errors. - class Reporter { - public: - virtual ~Reporter(); - - // Some corruption was detected. "size" is the approximate number - // of bytes dropped due to the corruption. - virtual void Corruption(size_t bytes, const Status& status) = 0; - }; - - // Create a reader that will return log records from "*file". - // "*file" must remain live while this Reader is in use. - // - // If "reporter" is non-NULL, it is notified whenever some data is - // dropped due to a detected corruption. "*reporter" must remain - // live while this Reader is in use. - // - // If "checksum" is true, verify checksums if available. - Reader(SequentialFile* file, Reporter* reporter, bool checksum); - - ~Reader(); - - // Read the next record into *record. Returns true if read - // successfully, false if we hit end of the input. May use - // "*scratch" as temporary storage. The contents filled in *record - // will only be valid until the next mutating operation on this - // reader or the next mutation to *scratch. - bool ReadRecord(Slice* record, std::string* scratch); - - private: - SequentialFile* const file_; - Reporter* const reporter_; - bool const checksum_; - char* const backing_store_; - Slice buffer_; - bool eof_; // Last Read() indicated EOF by returning < kBlockSize - - // Extend record types with the following special values - enum { - kEof = kMaxRecordType + 1, - kBadRecord = kMaxRecordType + 2 - }; - - // Return type, or one of the preceding special values - unsigned int ReadPhysicalRecord(Slice* result); - void ReportDrop(size_t bytes, const char* reason); - - // No copying allowed - Reader(const Reader&); - void operator=(const Reader&); -}; - -} -} - -#endif // STORAGE_LEVELDB_DB_LOG_READER_H_ diff --git a/leveldb/db/log_test.cc b/leveldb/db/log_test.cc deleted file mode 100644 index 025a5ff..0000000 --- a/leveldb/db/log_test.cc +++ /dev/null @@ -1,361 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
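One checksum detail the Reader above depends on: the stored CRC is not the raw crc32c of the record but a masked form (the reader calls crc32c::Unmask on the decoded header word, and the CRC it checks covers the type byte plus the payload). Masking exists because it is problematic to compute a CRC over bytes that themselves embed CRCs. A sketch of the pair, assuming the rotate-and-add scheme used by util/crc32c:

  #include <stdint.h>

  // Masking constant and rotation as in util/crc32c (assumed here).
  static const uint32_t kMaskDelta = 0xa282ead8ul;

  // Rotate right by 15 bits and add a delta.
  uint32_t Mask(uint32_t crc) {
    return ((crc >> 15) | (crc << 17)) + kMaskDelta;
  }

  // Invert Mask(): subtract the delta, rotate left by 15 bits.
  uint32_t Unmask(uint32_t masked_crc) {
    uint32_t rot = masked_crc - kMaskDelta;
    return ((rot >> 17) | (rot << 15));
  }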
- -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { -namespace log { - -// Construct a string of the specified length made out of the supplied -// partial string. -static std::string BigString(const std::string& partial_string, size_t n) { - std::string result; - while (result.size() < n) { - result.append(partial_string); - } - result.resize(n); - return result; -} - -// Construct a string from a number -static std::string NumberString(int n) { - char buf[50]; - snprintf(buf, sizeof(buf), "%d.", n); - return std::string(buf); -} - -// Return a skewed potentially long string -static std::string RandomSkewedString(int i, Random* rnd) { - return BigString(NumberString(i), rnd->Skewed(17)); -} - -class LogTest { - private: - class StringDest : public WritableFile { - public: - std::string contents_; - - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } - virtual Status Append(const Slice& slice) { - contents_.append(slice.data(), slice.size()); - return Status::OK(); - } - }; - - class StringSource : public SequentialFile { - public: - Slice contents_; - bool force_error_; - bool returned_partial_; - StringSource() : force_error_(false), returned_partial_(false) { } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; - ASSERT_EQ(kBlockSize, n); - - if (force_error_) { - force_error_ = false; - returned_partial_ = true; - return Status::Corruption("read error"); - } - - if (contents_.size() < n) { - n = contents_.size(); - returned_partial_ = true; - } - *result = Slice(contents_.data(), n); - contents_.remove_prefix(n); - return Status::OK(); - } - }; - - class ReportCollector : public Reader::Reporter { - public: - size_t dropped_bytes_; - std::string message_; - - ReportCollector() : dropped_bytes_(0) { } - virtual void Corruption(size_t bytes, const Status& status) { - dropped_bytes_ += bytes; - message_.append(status.ToString()); - } - }; - - StringDest dest_; - StringSource source_; - ReportCollector report_; - bool reading_; - Writer writer_; - Reader reader_; - - public: - LogTest() : reading_(false), - writer_(&dest_), - reader_(&source_, &report_, true/*checksum*/) { - } - - void Write(const std::string& msg) { - ASSERT_TRUE(!reading_) << "Write() after starting to read"; - writer_.AddRecord(Slice(msg)); - } - - size_t WrittenBytes() const { - return dest_.contents_.size(); - } - - std::string Read() { - if (!reading_) { - reading_ = true; - source_.contents_ = Slice(dest_.contents_); - } - std::string scratch; - Slice record; - if (reader_.ReadRecord(&record, &scratch)) { - return record.ToString(); - } else { - return "EOF"; - } - } - - void IncrementByte(int offset, int delta) { - dest_.contents_[offset] += delta; - } - - void SetByte(int offset, char new_byte) { - dest_.contents_[offset] = new_byte; - } - - void ShrinkSize(int bytes) { - dest_.contents_.resize(dest_.contents_.size() - bytes); - } - - void FixChecksum(int header_offset, int len) { - // Compute crc of type/len/data - uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len); - crc = crc32c::Mask(crc); - EncodeFixed32(&dest_.contents_[header_offset], crc); - } - - void ForceError() { - source_.force_error_ = true; - } - - size_t DroppedBytes() const { - return 
report_.dropped_bytes_; - } - - // Returns OK iff recorded error message contains "msg" - std::string MatchError(const std::string& msg) const { - if (report_.message_.find(msg) == std::string::npos) { - return report_.message_; - } else { - return "OK"; - } - } -}; - -TEST(LogTest, Empty) { - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, ReadWrite) { - Write("foo"); - Write("bar"); - Write(""); - Write("xxxx"); - ASSERT_EQ("foo", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("xxxx", Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ("EOF", Read()); // Make sure reads at eof work -} - -TEST(LogTest, ManyBlocks) { - for (int i = 0; i < 100000; i++) { - Write(NumberString(i)); - } - for (int i = 0; i < 100000; i++) { - ASSERT_EQ(NumberString(i), Read()); - } - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, Fragmentation) { - Write("small"); - Write(BigString("medium", 50000)); - Write(BigString("large", 100000)); - ASSERT_EQ("small", Read()); - ASSERT_EQ(BigString("medium", 50000), Read()); - ASSERT_EQ(BigString("large", 100000), Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, MarginalTrailer) { - // Make a trailer that is exactly the same length as an empty record. - const int n = kBlockSize - 2*kHeaderSize; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes()); - Write(""); - Write("bar"); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, ShortTrailer) { - const int n = kBlockSize - 2*kHeaderSize + 4; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); - Write(""); - Write("bar"); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, AlignedEof) { - const int n = kBlockSize - 2*kHeaderSize + 4; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, RandomRead) { - const int N = 500; - Random write_rnd(301); - for (int i = 0; i < N; i++) { - Write(RandomSkewedString(i, &write_rnd)); - } - Random read_rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); - } - ASSERT_EQ("EOF", Read()); -} - -// Tests of all the error paths in log_reader.cc follow: - -TEST(LogTest, ReadError) { - Write("foo"); - ForceError(); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kBlockSize, DroppedBytes()); - ASSERT_EQ("OK", MatchError("read error")); -} - -TEST(LogTest, BadRecordType) { - Write("foo"); - // Type is stored in header[6] - IncrementByte(6, 100); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("unknown record type")); -} - -TEST(LogTest, TruncatedTrailingRecord) { - Write("foo"); - ShrinkSize(4); // Drop all payload as well as a header byte - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kHeaderSize - 1, DroppedBytes()); - ASSERT_EQ("OK", MatchError("truncated record at end of file")); -} - -TEST(LogTest, BadLength) { - Write("foo"); - ShrinkSize(1); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kHeaderSize + 2, DroppedBytes()); - ASSERT_EQ("OK", MatchError("bad record length")); -} - -TEST(LogTest, ChecksumMismatch) { - Write("foo"); - IncrementByte(0, 10); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(10, DroppedBytes()); - ASSERT_EQ("OK", MatchError("checksum mismatch")); -} - -TEST(LogTest, UnexpectedMiddleType) { - 
Write("foo"); - SetByte(6, kMiddleType); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("missing start")); -} - -TEST(LogTest, UnexpectedLastType) { - Write("foo"); - SetByte(6, kLastType); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("missing start")); -} - -TEST(LogTest, UnexpectedFullType) { - Write("foo"); - Write("bar"); - SetByte(6, kFirstType); - FixChecksum(0, 3); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("partial record without end")); -} - -TEST(LogTest, UnexpectedFirstType) { - Write("foo"); - Write(BigString("bar", 100000)); - SetByte(6, kFirstType); - FixChecksum(0, 3); - ASSERT_EQ(BigString("bar", 100000), Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("partial record without end")); -} - -TEST(LogTest, ErrorJoinsRecords) { - // Consider two fragmented records: - // first(R1) last(R1) first(R2) last(R2) - // where the middle two fragments disappear. We do not want - // first(R1),last(R2) to get joined and returned as a valid record. - - // Write records that span two blocks - Write(BigString("foo", kBlockSize)); - Write(BigString("bar", kBlockSize)); - Write("correct"); - - // Wipe the middle block - for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) { - SetByte(offset, 'x'); - } - - ASSERT_EQ("correct", Read()); - ASSERT_EQ("EOF", Read()); - const int dropped = DroppedBytes(); - ASSERT_LE(dropped, 2*kBlockSize + 100); - ASSERT_GE(dropped, 2*kBlockSize); -} - -} -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/log_writer.cc b/leveldb/db/log_writer.cc deleted file mode 100644 index 1696851..0000000 --- a/leveldb/db/log_writer.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/log_writer.h" - -#include -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { -namespace log { - -Writer::Writer(WritableFile* dest) - : dest_(dest), - block_offset_(0) { - for (int i = 0; i <= kMaxRecordType; i++) { - char t = static_cast(i); - type_crc_[i] = crc32c::Value(&t, 1); - } -} - -Writer::~Writer() { -} - -Status Writer::AddRecord(const Slice& slice) { - const char* ptr = slice.data(); - size_t left = slice.size(); - - // Fragment the record if necessary and emit it. Note that if slice - // is empty, we still want to iterate once to emit a single - // zero-length record - Status s; - do { - const int leftover = kBlockSize - block_offset_; - assert(leftover >= 0); - if (leftover < kHeaderSize) { - // Switch to a new block - if (leftover > 0) { - // Fill the trailer (literal below relies on kHeaderSize being 7) - assert(kHeaderSize == 7); - dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); - } - block_offset_ = 0; - } - - // Invariant: we never leave < kHeaderSize bytes in a block. - assert(kBlockSize - block_offset_ - kHeaderSize >= 0); - - const size_t avail = kBlockSize - block_offset_ - kHeaderSize; - const size_t fragment_length = (left < avail) ? 
left : avail; - - RecordType type; - const bool begin = (ptr == slice.data()); - const bool end = (left == fragment_length); - if (begin && end) { - type = kFullType; - } else if (begin) { - type = kFirstType; - } else if (end) { - type = kLastType; - } else { - type = kMiddleType; - } - - s = EmitPhysicalRecord(type, ptr, fragment_length); - ptr += fragment_length; - left -= fragment_length; - } while (s.ok() && left > 0); - return s; -} - -Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { - assert(n <= 0xffff); // Must fit in two bytes - assert(block_offset_ + kHeaderSize + n <= kBlockSize); - - // Format the header - char buf[kHeaderSize]; - buf[4] = static_cast(n & 0xff); - buf[5] = static_cast(n >> 8); - buf[6] = static_cast(t); - - // Compute the crc of the record type and the payload. - uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); - crc = crc32c::Mask(crc); // Adjust for storage - EncodeFixed32(buf, crc); - - // Write the header and the payload - Status s = dest_->Append(Slice(buf, kHeaderSize)); - if (s.ok()) { - s = dest_->Append(Slice(ptr, n)); - if (s.ok()) { - s = dest_->Flush(); - } - } - block_offset_ += kHeaderSize + n; - return s; -} - -} -} diff --git a/leveldb/db/log_writer.h b/leveldb/db/log_writer.h deleted file mode 100644 index d3cf27d..0000000 --- a/leveldb/db/log_writer.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_ -#define STORAGE_LEVELDB_DB_LOG_WRITER_H_ - -#include -#include "db/log_format.h" -#include "leveldb/slice.h" -#include "leveldb/status.h" - -namespace leveldb { - -class WritableFile; - -namespace log { - -class Writer { - public: - // Create a writer that will append data to "*dest". - // "*dest" must be initially empty. - // "*dest" must remain live while this Writer is in use. - explicit Writer(WritableFile* dest); - ~Writer(); - - Status AddRecord(const Slice& slice); - - private: - WritableFile* dest_; - int block_offset_; // Current offset in block - - // crc32c values for all supported record types. These are - // pre-computed to reduce the overhead of computing the crc of the - // record type stored in the header. - uint32_t type_crc_[kMaxRecordType + 1]; - - Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); - - // No copying allowed - Writer(const Writer&); - void operator=(const Writer&); -}; - -} -} - -#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_ diff --git a/leveldb/db/memtable.cc b/leveldb/db/memtable.cc deleted file mode 100644 index a3b618a..0000000 --- a/leveldb/db/memtable.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
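The fragmentation rule in Writer::AddRecord above is compact enough to be worth tracing by hand. A standalone sketch (illustrative only; it mirrors the loop rather than reusing it) that prints how one record splits given a starting block offset:

  #include <stdio.h>

  // Constants as in db/log_format.h.
  enum { kBlockSize = 32768, kHeaderSize = 7 };

  // Print the fragment types AddRecord would emit for "left" payload
  // bytes starting at block offset "off". A zero-length record still
  // yields one FULL fragment, matching the do/while in AddRecord.
  void ShowFragments(int off, int left) {
    bool begin = true;
    do {
      if (kBlockSize - off < kHeaderSize) off = 0;  // writer zero-pads the trailer
      const int avail = kBlockSize - off - kHeaderSize;
      const int frag = (left < avail) ? left : avail;
      const bool end = (left == frag);
      printf("%s %d bytes\n",
             begin && end ? "FULL" : begin ? "FIRST" : end ? "LAST" : "MIDDLE",
             frag);
      off += kHeaderSize + frag;
      left -= frag;
      begin = false;
    } while (left > 0);
  }

For example, ShowFragments(0, 100000) prints FIRST 32761, MIDDLE 32761, MIDDLE 32761, LAST 1717, which is the shape exercised by the Fragmentation and ManyBlocks tests in log_test.cc above.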
- -#include "db/memtable.h" -#include "db/dbformat.h" -#include "leveldb/comparator.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "util/coding.h" - -namespace leveldb { - -static Slice GetLengthPrefixedSlice(const char* data) { - uint32_t len; - const char* p = data; - p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted - return Slice(p, len); -} - -MemTable::MemTable(const InternalKeyComparator& cmp) - : comparator_(cmp), - table_(comparator_, &arena_) { -} - -MemTable::~MemTable() { -} - -size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); } - -int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) - const { - // Internal keys are encoded as length-prefixed strings. - Slice a = GetLengthPrefixedSlice(aptr); - Slice b = GetLengthPrefixedSlice(bptr); - return comparator.Compare(a, b); -} - -// Encode a suitable internal key target for "target" and return it. -// Uses *scratch as scratch space, and the returned pointer will point -// into this scratch space. -static const char* EncodeKey(std::string* scratch, const Slice& target) { - scratch->clear(); - PutVarint32(scratch, target.size()); - scratch->append(target.data(), target.size()); - return scratch->data(); -} - -class MemTableIterator: public Iterator { - public: - explicit MemTableIterator(MemTable::Table* table) { - iter_ = new MemTable::Table::Iterator(table); - } - virtual ~MemTableIterator() { delete iter_; } - - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } - virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); } - virtual Slice value() const { - Slice key_slice = GetLengthPrefixedSlice(iter_->key()); - return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); - } - - virtual Status status() const { return Status::OK(); } - - private: - MemTable::Table::Iterator* iter_; - std::string tmp_; // For passing to EncodeKey - - // No copying allowed - MemTableIterator(const MemTableIterator&); - void operator=(const MemTableIterator&); -}; - -Iterator* MemTable::NewIterator() { - return new MemTableIterator(&table_); -} - -void MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, - const Slice& value) { - // Format of an entry is concatenation of: - // key_size : varint32 of internal_key.size() - // key bytes : char[internal_key.size()] - // value_size : varint32 of value.size() - // value bytes : char[value.size()] - size_t key_size = key.size(); - size_t val_size = value.size(); - size_t internal_key_size = key_size + 8; - const size_t encoded_len = - VarintLength(internal_key_size) + internal_key_size + - VarintLength(val_size) + val_size; - char* buf = arena_.Allocate(encoded_len); - char* p = EncodeVarint32(buf, internal_key_size); - memcpy(p, key.data(), key_size); - p += key_size; - EncodeFixed64(p, (s << 8) | type); - p += 8; - p = EncodeVarint32(p, val_size); - memcpy(p, value.data(), val_size); - assert((p + val_size) - buf == encoded_len); - table_.Insert(buf); -} - -} diff --git a/leveldb/db/memtable.h b/leveldb/db/memtable.h deleted file mode 100644 index 45b3342..0000000 --- a/leveldb/db/memtable.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
-#define STORAGE_LEVELDB_DB_MEMTABLE_H_
-
-#include <string>
-#include "leveldb/db.h"
-#include "db/dbformat.h"
-#include "db/skiplist.h"
-#include "util/arena.h"
-
-namespace leveldb {
-
-class InternalKeyComparator;
-class Mutex;
-class MemTableIterator;
-
-class MemTable {
- public:
-  explicit MemTable(const InternalKeyComparator& comparator);
-  ~MemTable();
-
-  // Returns an estimate of the number of bytes of data in use by this
-  // data structure.
-  //
-  // REQUIRES: external synchronization to prevent simultaneous
-  // operations on the same MemTable.
-  size_t ApproximateMemoryUsage();
-
-  // Return an iterator that yields the contents of the memtable.
-  //
-  // The caller must ensure that the underlying MemTable remains live
-  // while the returned iterator is live.  The keys returned by this
-  // iterator are internal keys encoded by AppendInternalKey in the
-  // db/format.{h,cc} module.
-  Iterator* NewIterator();
-
-  // Add an entry into memtable that maps key to value at the
-  // specified sequence number and with the specified type.
-  // Typically value will be empty if type==kTypeDeletion.
-  void Add(SequenceNumber seq, ValueType type,
-           const Slice& key,
-           const Slice& value);
-
- private:
-  struct KeyComparator {
-    const InternalKeyComparator comparator;
-    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
-    int operator()(const char* a, const char* b) const;
-  };
-  friend class MemTableIterator;
-  friend class MemTableBackwardIterator;
-
-  typedef SkipList<const char*, KeyComparator> Table;
-
-  KeyComparator comparator_;
-  Arena arena_;
-  Table table_;
-
-  // No copying allowed
-  MemTable(const MemTable&);
-  void operator=(const MemTable&);
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_DB_MEMTABLE_H_
diff --git a/leveldb/db/repair.cc b/leveldb/db/repair.cc
deleted file mode 100644
index c8e7b9e..0000000
--- a/leveldb/db/repair.cc
+++ /dev/null
@@ -1,380 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// We recover the contents of the descriptor from the other files we find.
-// (1) Any log files are first converted to tables
-// (2) We scan every table to compute
-//     (a) smallest/largest for the table
-//     (b) largest sequence number in the table
-// (3) We generate descriptor contents:
-//      - log number is set to zero
-//      - next-file-number is set to 1 + largest file number we found
-//      - last-sequence-number is set to largest sequence# found across
-//        all tables (see 2b)
-//      - compaction pointers are cleared
-//      - every table file is added at level 0
-//
-// Possible optimization 1:
-//   (a) Compute total size and use to pick appropriate max-level M
-//   (b) Sort tables by largest sequence# in the table
-//   (c) For each table: if it overlaps earlier table, place in level-0,
-//       else place in level-M.
-// Possible optimization 2:
-//   Store per-table metadata (smallest, largest, largest-seq#, ...)
-//   in the table's meta section to speed up ScanTable.
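Step (3) above maps almost one-for-one onto VersionEdit calls; WriteDescriptor later in this file is the real version, with error handling and manifest installation. In outline (a sketch only; the variable names stand in for Repairer members):

  // Assemble the recovered state into a single VersionEdit and append
  // it as one record of a fresh MANIFEST.
  VersionEdit edit;
  edit.SetComparatorName(icmp.user_comparator()->Name());
  edit.SetLogNumber(0);                 // all logs were converted to tables
  edit.SetNextFile(next_file_number);   // 1 + largest file number found
  edit.SetLastSequence(max_sequence);   // largest sequence# across tables
  for (size_t i = 0; i < tables.size(); i++) {
    const TableInfo& t = tables[i];
    edit.AddFile(0, t.meta.number, t.meta.file_size,  // everything at level 0
                 t.meta.smallest, t.meta.largest);
  }
  std::string record;
  edit.EncodeTo(&record);
  log::Writer writer(file);
  writer.AddRecord(record);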
- -#include "db/builder.h" -#include "db/db_impl.h" -#include "db/dbformat.h" -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "db/version_edit.h" -#include "db/write_batch_internal.h" -#include "leveldb/comparator.h" -#include "leveldb/db.h" -#include "leveldb/env.h" - -namespace leveldb { - -namespace { - -class Repairer { - public: - Repairer(const std::string& dbname, const Options& options) - : dbname_(dbname), - env_(options.env), - icmp_(options.comparator), - options_(SanitizeOptions(dbname, &icmp_, options)), - owns_info_log_(options_.info_log != options.info_log), - next_file_number_(1) { - // TableCache can be small since we expect each table to be opened once. - table_cache_ = new TableCache(dbname_, &options_, 10); - } - - ~Repairer() { - delete table_cache_; - if (owns_info_log_) { - delete options_.info_log; - } - } - - Status Run() { - Status status = FindFiles(); - if (status.ok()) { - ConvertLogFilesToTables(); - ExtractMetaData(); - status = WriteDescriptor(); - } - if (status.ok()) { - unsigned long long bytes = 0; - for (size_t i = 0; i < tables_.size(); i++) { - bytes += tables_[i].meta.file_size; - } - Log(env_, options_.info_log, - "**** Repaired leveldb %s; " - "recovered %d files; %llu bytes. " - "Some data may have been lost. " - "****", - dbname_.c_str(), - static_cast(tables_.size()), - bytes); - } - return status; - } - - private: - struct TableInfo { - FileMetaData meta; - SequenceNumber max_sequence; - }; - - std::string const dbname_; - Env* const env_; - InternalKeyComparator const icmp_; - Options const options_; - bool owns_info_log_; - TableCache* table_cache_; - VersionEdit edit_; - - std::vector manifests_; - std::vector table_numbers_; - std::vector logs_; - std::vector tables_; - uint64_t next_file_number_; - - Status FindFiles() { - std::vector filenames; - Status status = env_->GetChildren(dbname_, &filenames); - if (!status.ok()) { - return status; - } - if (filenames.empty()) { - return Status::IOError(dbname_, "repair found no files"); - } - - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { - if (type == kDescriptorFile) { - manifests_.push_back(filenames[i]); - } else { - if (number + 1 > next_file_number_) { - next_file_number_ = number + 1; - } - if (type == kLogFile) { - logs_.push_back(number); - } else if (type == kTableFile) { - table_numbers_.push_back(number); - } else { - // Ignore other files - } - } - } - } - return status; - } - - void ConvertLogFilesToTables() { - for (size_t i = 0; i < logs_.size(); i++) { - std::string logname = LogFileName(dbname_, logs_[i]); - Status status = ConvertLogToTable(logs_[i]); - if (!status.ok()) { - Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s", - (unsigned long long) logs_[i], - status.ToString().c_str()); - } - ArchiveFile(logname); - } - } - - Status ConvertLogToTable(uint64_t log) { - struct LogReporter : public log::Reader::Reporter { - Env* env; - WritableFile* info_log; - uint64_t lognum; - virtual void Corruption(size_t bytes, const Status& s) { - // We print error messages for corruption, but continue repairing. 
- Log(env, info_log, "Log #%llu: dropping %d bytes; %s", - (unsigned long long) lognum, - static_cast(bytes), - s.ToString().c_str()); - } - }; - - // Open the log file - std::string logname = LogFileName(dbname_, log); - SequentialFile* lfile; - Status status = env_->NewSequentialFile(logname, &lfile); - if (!status.ok()) { - return status; - } - - // Create the log reader. - LogReporter reporter; - reporter.env = env_; - reporter.info_log = options_.info_log; - reporter.lognum = log; - // We intentially make log::Reader do checksumming so that - // corruptions cause entire commits to be skipped instead of - // propagating bad information (like overly large sequence - // numbers). - log::Reader reader(lfile, &reporter, false/*do not checksum*/); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - MemTable mem(icmp_); - int counter = 0; - while (reader.ReadRecord(&record, &scratch)) { - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - continue; - } - WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, &mem); - if (status.ok()) { - counter += WriteBatchInternal::Count(&batch); - } else { - Log(env_, options_.info_log, "Log #%llu: ignoring %s", - (unsigned long long) log, - status.ToString().c_str()); - status = Status::OK(); // Keep going with rest of file - } - } - delete lfile; - - // We ignore any version edits generated by the conversion to a Table - // since ExtractMetaData() will also generate edits. - VersionEdit skipped; - FileMetaData meta; - meta.number = next_file_number_++; - Iterator* iter = mem.NewIterator(); - status = BuildTable(dbname_, env_, options_, table_cache_, iter, - &meta, &skipped); - delete iter; - if (status.ok()) { - if (meta.file_size > 0) { - table_numbers_.push_back(meta.number); - } - } - Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", - (unsigned long long) log, - counter, - (unsigned long long) meta.number, - status.ToString().c_str()); - return status; - } - - void ExtractMetaData() { - std::vector kept; - for (size_t i = 0; i < table_numbers_.size(); i++) { - TableInfo t; - t.meta.number = table_numbers_[i]; - Status status = ScanTable(&t); - if (!status.ok()) { - std::string fname = TableFileName(dbname_, table_numbers_[i]); - Log(env_, options_.info_log, "Table #%llu: ignoring %s", - (unsigned long long) table_numbers_[i], - status.ToString().c_str()); - ArchiveFile(fname); - } else { - tables_.push_back(t); - } - } - } - - Status ScanTable(TableInfo* t) { - std::string fname = TableFileName(dbname_, t->meta.number); - int counter = 0; - Status status = env_->GetFileSize(fname, &t->meta.file_size); - if (status.ok()) { - Iterator* iter = table_cache_->NewIterator( - ReadOptions(), t->meta.number, t->meta.file_size); - bool empty = true; - ParsedInternalKey parsed; - t->max_sequence = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - Slice key = iter->key(); - if (!ParseInternalKey(key, &parsed)) { - Log(env_, options_.info_log, "Table #%llu: unparsable key %s", - (unsigned long long) t->meta.number, - EscapeString(key).c_str()); - continue; - } - - counter++; - if (empty) { - empty = false; - t->meta.smallest.DecodeFrom(key); - } - t->meta.largest.DecodeFrom(key); - if (parsed.sequence > t->max_sequence) { - t->max_sequence = parsed.sequence; - } - } - if (!iter->status().ok()) { - status = iter->status(); - } - delete iter; - } - Log(env_, 
options_.info_log, "Table #%llu: %d entries %s", - (unsigned long long) t->meta.number, - counter, - status.ToString().c_str()); - return status; - } - - Status WriteDescriptor() { - std::string tmp = TempFileName(dbname_, 1); - WritableFile* file; - Status status = env_->NewWritableFile(tmp, &file); - if (!status.ok()) { - return status; - } - - SequenceNumber max_sequence = 0; - for (size_t i = 0; i < tables_.size(); i++) { - if (max_sequence < tables_[i].max_sequence) { - max_sequence = tables_[i].max_sequence; - } - } - - edit_.SetComparatorName(icmp_.user_comparator()->Name()); - edit_.SetLogNumber(0); - edit_.SetNextFile(next_file_number_); - edit_.SetLastSequence(max_sequence); - - for (size_t i = 0; i < tables_.size(); i++) { - // TODO(opt): separate out into multiple levels - const TableInfo& t = tables_[i]; - edit_.AddFile(0, t.meta.number, t.meta.file_size, - t.meta.smallest, t.meta.largest); - } - - //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); - { - log::Writer log(file); - std::string record; - edit_.EncodeTo(&record); - status = log.AddRecord(record); - } - if (status.ok()) { - status = file->Close(); - } - delete file; - file = NULL; - - if (!status.ok()) { - env_->DeleteFile(tmp); - } else { - // Discard older manifests - for (size_t i = 0; i < manifests_.size(); i++) { - ArchiveFile(dbname_ + "/" + manifests_[i]); - } - - // Install new manifest - status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); - if (status.ok()) { - status = SetCurrentFile(env_, dbname_, 1); - } else { - env_->DeleteFile(tmp); - } - } - return status; - } - - void ArchiveFile(const std::string& fname) { - // Move into another directory. E.g., for - // dir/foo - // rename to - // dir/lost/foo - const char* slash = strrchr(fname.c_str(), '/'); - std::string new_dir; - if (slash != NULL) { - new_dir.assign(fname.data(), slash - fname.data()); - } - new_dir.append("/lost"); - env_->CreateDir(new_dir); // Ignore error - std::string new_file = new_dir; - new_file.append("/"); - new_file.append((slash == NULL) ? fname.c_str() : slash + 1); - Status s = env_->RenameFile(fname, new_file); - Log(env_, options_.info_log, "Archiving %s: %s\n", - fname.c_str(), s.ToString().c_str()); - } -}; -} - -Status RepairDB(const std::string& dbname, const Options& options) { - Repairer repairer(dbname, options); - return repairer.Run(); -} - -} diff --git a/leveldb/db/skiplist.h b/leveldb/db/skiplist.h deleted file mode 100644 index be39354..0000000 --- a/leveldb/db/skiplist.h +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Thread safety -// ------------- -// -// Writes require external synchronization, most likely a mutex. -// Reads require a guarantee that the SkipList will not be destroyed -// while the read is in progress. Apart from that, reads progress -// without any internal locking or synchronization. -// -// Invariants: -// -// (1) Allocated nodes are never deleted until the SkipList is -// destroyed. This is trivially guaranteed by the code since we -// never delete any skip list nodes. -// -// (2) The contents of a Node except for the next/prev pointers are -// immutable after the Node has been linked into the SkipList. -// Only Insert() modifies the list, and it is careful to initialize -// a node and use release-stores to publish the nodes in one or -// more lists. 
-//
-// ... prev vs. next pointer ordering ...
-
-#include <assert.h>
-#include <stdlib.h>
-#include "port/port.h"
-#include "util/arena.h"
-#include "util/random.h"
-
-namespace leveldb {
-
-class Arena;
-
-template <typename Key, class Comparator>
-class SkipList {
- private:
-  struct Node;
-
- public:
-  // Create a new SkipList object that will use "cmp" for comparing keys,
-  // and will allocate memory using "*arena".  Objects allocated in the arena
-  // must remain allocated for the lifetime of the skiplist object.
-  explicit SkipList(Comparator cmp, Arena* arena);
-
-  // Insert key into the list.
-  // REQUIRES: nothing that compares equal to key is currently in the list.
-  void Insert(const Key& key);
-
-  // Returns true iff an entry that compares equal to key is in the list.
-  bool Contains(const Key& key) const;
-
-  // Iteration over the contents of a skip list
-  class Iterator {
-   public:
-    // Initialize an iterator over the specified list.
-    // The returned iterator is not valid.
-    explicit Iterator(const SkipList* list);
-
-    // Returns true iff the iterator is positioned at a valid node.
-    bool Valid() const;
-
-    // Returns the key at the current position.
-    // REQUIRES: Valid()
-    const Key& key() const;
-
-    // Advances to the next position.
-    // REQUIRES: Valid()
-    void Next();
-
-    // Advances to the previous position.
-    // REQUIRES: Valid()
-    void Prev();
-
-    // Advance to the first entry with a key >= target
-    void Seek(const Key& target);
-
-    // Position at the first entry in list.
-    // Final state of iterator is Valid() iff list is not empty.
-    void SeekToFirst();
-
-    // Position at the last entry in list.
-    // Final state of iterator is Valid() iff list is not empty.
-    void SeekToLast();
-
-   private:
-    const SkipList* list_;
-    Node* node_;
-    // Intentionally copyable
-  };
-
- private:
-  enum { kMaxHeight = 12 };
-
-  // Immutable after construction
-  Comparator const compare_;
-  Arena* const arena_;    // Arena used for allocations of nodes
-
-  Node* const head_;
-
-  // Modified only by Insert().  Read racily by readers, but stale
-  // values are ok.
-  port::AtomicPointer max_height_;   // Height of the entire list
-
-  inline int GetMaxHeight() const {
-    return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load());
-  }
-
-  // Read/written only by Insert().
-  Random rnd_;
-
-  Node* NewNode(const Key& key, int height);
-  int RandomHeight();
-  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
-
-  // Return true if key is greater than the data stored in "n"
-  bool KeyIsAfterNode(const Key& key, Node* n) const;
-
-  // Return the earliest node that comes at or after key.
-  // Return NULL if there is no such node.
-  //
-  // If prev is non-NULL, fills prev[level] with pointer to previous
-  // node at "level" for every level in [0..max_height_-1].
-  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
-
-  // Return the latest node with a key < key.
-  // Return head_ if there is no such node.
-  Node* FindLessThan(const Key& key) const;
-
-  // Return the last node in the list.
-  // Return head_ if list is empty.
-  Node* FindLast() const;
-
-  // No copying allowed
-  SkipList(const SkipList&);
-  void operator=(const SkipList&);
-};
-
-// Implementation details follow
-template <typename Key, class Comparator>
-struct SkipList<Key,Comparator>::Node {
-  explicit Node(const Key& k) : key(k) { }
-
-  Key const key;
-
-  // Accessors/mutators for links.  Wrapped in methods so we can
-  // add the appropriate barriers as necessary.
-  Node* Next(int n) {
-    assert(n >= 0);
-    // Use an 'acquire load' so that we observe a fully initialized
-    // version of the returned Node.
- return reinterpret_cast(next_[n].Acquire_Load()); - } - void SetNext(int n, Node* x) { - assert(n >= 0); - // Use a 'release store' so that anybody who reads through this - // pointer observes a fully initialized version of the inserted node. - next_[n].Release_Store(x); - } - - // No-barrier variants that can be safely used in a few locations. - Node* NoBarrier_Next(int n) { - assert(n >= 0); - return reinterpret_cast(next_[n].NoBarrier_Load()); - } - void NoBarrier_SetNext(int n, Node* x) { - assert(n >= 0); - next_[n].NoBarrier_Store(x); - } - - private: - // Array of length equal to the node height. next_[0] is lowest level link. - port::AtomicPointer next_[1]; -}; - -template -typename SkipList::Node* -SkipList::NewNode(const Key& key, int height) { - char* mem = arena_->AllocateAligned( - sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1)); - return new (mem) Node(key); -} - -template -inline SkipList::Iterator::Iterator(const SkipList* list) { - list_ = list; - node_ = NULL; -} - -template -inline bool SkipList::Iterator::Valid() const { - return node_ != NULL; -} - -template -inline const Key& SkipList::Iterator::key() const { - assert(Valid()); - return node_->key; -} - -template -inline void SkipList::Iterator::Next() { - assert(Valid()); - node_ = node_->Next(0); -} - -template -inline void SkipList::Iterator::Prev() { - // Instead of using explicit "prev" links, we just search for the - // last node that falls before key. - assert(Valid()); - node_ = list_->FindLessThan(node_->key); - if (node_ == list_->head_) { - node_ = NULL; - } -} - -template -inline void SkipList::Iterator::Seek(const Key& target) { - node_ = list_->FindGreaterOrEqual(target, NULL); -} - -template -inline void SkipList::Iterator::SeekToFirst() { - node_ = list_->head_->Next(0); -} - -template -inline void SkipList::Iterator::SeekToLast() { - node_ = list_->FindLast(); - if (node_ == list_->head_) { - node_ = NULL; - } -} - -template -int SkipList::RandomHeight() { - // Increase height with probability 1 in kBranching - static const unsigned int kBranching = 4; - int height = 1; - while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { - height++; - } - assert(height > 0); - assert(height <= kMaxHeight); - return height; -} - -template -bool SkipList::KeyIsAfterNode(const Key& key, Node* n) const { - // NULL n is considered infinite - return (n != NULL) && (compare_(n->key, key) < 0); -} - -template -typename SkipList::Node* SkipList::FindGreaterOrEqual(const Key& key, Node** prev) - const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - Node* next = x->Next(level); - if (KeyIsAfterNode(key, next)) { - // Keep searching in this list - x = next; - } else { - if (prev != NULL) prev[level] = x; - if (level == 0) { - return next; - } else { - // Switch to next list - level--; - } - } - } -} - -template -typename SkipList::Node* -SkipList::FindLessThan(const Key& key) const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - assert(x == head_ || compare_(x->key, key) < 0); - Node* next = x->Next(level); - if (next == NULL || compare_(next->key, key) >= 0) { - if (level == 0) { - return x; - } else { - // Switch to next list - level--; - } - } else { - x = next; - } - } -} - -template -typename SkipList::Node* SkipList::FindLast() - const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - Node* next = x->Next(level); - if (next == NULL) { - if (level == 0) { - return x; - } else { - // Switch to next list - level--; - } - } 
else { - x = next; - } - } -} - -template -SkipList::SkipList(Comparator cmp, Arena* arena) - : compare_(cmp), - arena_(arena), - head_(NewNode(0 /* any key will do */, kMaxHeight)), - max_height_(reinterpret_cast(1)), - rnd_(0xdeadbeef) { - for (int i = 0; i < kMaxHeight; i++) { - head_->SetNext(i, NULL); - } -} - -template -void SkipList::Insert(const Key& key) { - // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() - // here since Insert() is externally synchronized. - Node* prev[kMaxHeight]; - Node* x = FindGreaterOrEqual(key, prev); - - // Our data structure does not allow duplicate insertion - assert(x == NULL || !Equal(key, x->key)); - - int height = RandomHeight(); - if (height > GetMaxHeight()) { - for (int i = GetMaxHeight(); i < height; i++) { - prev[i] = head_; - } - //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); - - // It is ok to mutate max_height_ without any synchronization - // with concurrent readers. A concurrent reader that observes - // the new value of max_height_ will see either the old value of - // new level pointers from head_ (NULL), or a new value set in - // the loop below. In the former case the reader will - // immediately drop to the next level since NULL sorts after all - // keys. In the latter case the reader will use the new node. - max_height_.NoBarrier_Store(reinterpret_cast(height)); - } - - x = NewNode(key, height); - for (int i = 0; i < height; i++) { - // NoBarrier_SetNext() suffices since we will add a barrier when - // we publish a pointer to "x" in prev[i]. - x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i)); - prev[i]->SetNext(i, x); - } -} - -template -bool SkipList::Contains(const Key& key) const { - Node* x = FindGreaterOrEqual(key, NULL); - if (x != NULL && Equal(key, x->key)) { - return true; - } else { - return false; - } -} - -} diff --git a/leveldb/db/skiplist_test.cc b/leveldb/db/skiplist_test.cc deleted file mode 100644 index 5f9ec0d..0000000 --- a/leveldb/db/skiplist_test.cc +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
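A note on the RandomHeight scheme above before the tests: with kBranching == 4, a node reaches height h with probability 4^-(h-1), so the expected number of link pointers per node is 1/(1 - 1/4), about 1.33, and kMaxHeight == 12 is ample for any memtable-sized list. A quick standalone illustration of how the levels thin out:

  #include <stdio.h>

  // Expected node count per level for one million keys, given the
  // geometric height distribution produced by RandomHeight().
  int main() {
    double expected = 1e6;
    for (int h = 1; h <= 12; h++) {   // kMaxHeight == 12
      printf("level %2d: ~%.0f nodes\n", h, expected);
      expected /= 4;
    }
    return 0;
  }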
- -#include "db/skiplist.h" -#include -#include "leveldb/env.h" -#include "util/arena.h" -#include "util/hash.h" -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { - -typedef uint64_t Key; - -struct Comparator { - int operator()(const Key& a, const Key& b) const { - if (a < b) { - return -1; - } else if (a > b) { - return +1; - } else { - return 0; - } - } -}; - -class SkipTest { }; - -TEST(SkipTest, Empty) { - Arena arena; - Comparator cmp; - SkipList list(cmp, &arena); - ASSERT_TRUE(!list.Contains(10)); - - SkipList::Iterator iter(&list); - ASSERT_TRUE(!iter.Valid()); - iter.SeekToFirst(); - ASSERT_TRUE(!iter.Valid()); - iter.Seek(100); - ASSERT_TRUE(!iter.Valid()); - iter.SeekToLast(); - ASSERT_TRUE(!iter.Valid()); -} - -TEST(SkipTest, InsertAndLookup) { - const int N = 2000; - const int R = 5000; - Random rnd(1000); - std::set keys; - Arena arena; - Comparator cmp; - SkipList list(cmp, &arena); - for (int i = 0; i < N; i++) { - Key key = rnd.Next() % R; - if (keys.insert(key).second) { - list.Insert(key); - } - } - - for (int i = 0; i < R; i++) { - if (list.Contains(i)) { - ASSERT_EQ(keys.count(i), 1); - } else { - ASSERT_EQ(keys.count(i), 0); - } - } - - // Simple iterator tests - { - SkipList::Iterator iter(&list); - ASSERT_TRUE(!iter.Valid()); - - iter.Seek(0); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.begin()), iter.key()); - - iter.SeekToFirst(); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.begin()), iter.key()); - - iter.SeekToLast(); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.rbegin()), iter.key()); - } - - // Forward iteration test - for (int i = 0; i < R; i++) { - SkipList::Iterator iter(&list); - iter.Seek(i); - - // Compare against model iterator - std::set::iterator model_iter = keys.lower_bound(i); - for (int j = 0; j < 3; j++) { - if (model_iter == keys.end()) { - ASSERT_TRUE(!iter.Valid()); - break; - } else { - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*model_iter, iter.key()); - ++model_iter; - iter.Next(); - } - } - } - - // Backward iteration test - { - SkipList::Iterator iter(&list); - iter.SeekToLast(); - - // Compare against model iterator - for (std::set::reverse_iterator model_iter = keys.rbegin(); - model_iter != keys.rend(); - ++model_iter) { - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*model_iter, iter.key()); - iter.Prev(); - } - ASSERT_TRUE(!iter.Valid()); - } -} - -// We want to make sure that with a single writer and multiple -// concurrent readers (with no synchronization other than when a -// reader's iterator is created), the reader always observes all the -// data that was present in the skip list when the iterator was -// constructor. Because insertions are happening concurrently, we may -// also observe new values that were inserted since the iterator was -// constructed, but we should never miss any values that were present -// at iterator construction time. -// -// We generate multi-part keys: -// -// where: -// key is in range [0..K-1] -// gen is a generation number for key -// hash is hash(key,gen) -// -// The insertion code picks a random key, sets gen to be 1 + the last -// generation number inserted for that key, and sets hash to Hash(key,gen). -// -// At the beginning of a read, we snapshot the last inserted -// generation number for each key. We then iterate, including random -// calls to Next() and Seek(). For every key we encounter, we -// check that it is either expected given the initial snapshot or has -// been concurrently added since the iterator started. 
-class ConcurrentTest { - private: - static const uint32_t K = 4; - - static uint64_t key(Key key) { return (key >> 40); } - static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; } - static uint64_t hash(Key key) { return key & 0xff; } - - static uint64_t HashNumbers(uint64_t k, uint64_t g) { - uint64_t data[2] = { k, g }; - return Hash(reinterpret_cast(data), sizeof(data), 0); - } - - static Key MakeKey(uint64_t k, uint64_t g) { - assert(sizeof(Key) == sizeof(uint64_t)); - assert(k <= K); // We sometimes pass K to seek to the end of the skiplist - assert(g <= 0xffffffffu); - return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff)); - } - - static bool IsValidKey(Key k) { - return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff); - } - - static Key RandomTarget(Random* rnd) { - switch (rnd->Next() % 10) { - case 0: - // Seek to beginning - return MakeKey(0, 0); - case 1: - // Seek to end - return MakeKey(K, 0); - default: - // Seek to middle - return MakeKey(rnd->Next() % K, 0); - } - } - - // Per-key generation - struct State { - port::AtomicPointer generation[K]; - void Set(int k, intptr_t v) { - generation[k].Release_Store(reinterpret_cast(v)); - } - intptr_t Get(int k) { - return reinterpret_cast(generation[k].Acquire_Load()); - } - - State() { - for (int k = 0; k < K; k++) { - Set(k, 0); - } - } - }; - - // Current state of the test - State current_; - - Arena arena_; - - // SkipList is not protected by mu_. We just use a single writer - // thread to modify it. - SkipList list_; - - public: - ConcurrentTest() : list_(Comparator(), &arena_) { } - - // REQUIRES: External synchronization - void WriteStep(Random* rnd) { - const uint32_t k = rnd->Next() % K; - const intptr_t g = current_.Get(k) + 1; - const Key key = MakeKey(k, g); - list_.Insert(key); - current_.Set(k, g); - } - - void ReadStep(Random* rnd) { - // Remember the initial committed state of the skiplist. - State initial_state; - for (int k = 0; k < K; k++) { - initial_state.Set(k, current_.Get(k)); - } - - Key pos = RandomTarget(rnd); - SkipList::Iterator iter(&list_); - iter.Seek(pos); - while (true) { - Key current; - if (!iter.Valid()) { - current = MakeKey(K, 0); - } else { - current = iter.key(); - ASSERT_TRUE(IsValidKey(current)) << std::hex << current; - } - ASSERT_LE(pos, current) << "should not go backwards"; - - // Verify that everything in [pos,current) was not present in - // initial_state. - while (pos < current) { - ASSERT_LT(key(pos), K) << std::hex << pos; - - // Note that generation 0 is never inserted, so it is ok if - // <*,0,*> is missing. - ASSERT_TRUE((gen(pos) == 0) || - (gen(pos) > initial_state.Get(key(pos))) - ) << "key: " << key(pos) - << "; gen: " << gen(pos) - << "; initgen: " - << initial_state.Get(key(pos)); - - // Advance to next key in the valid key space - if (key(pos) < key(current)) { - pos = MakeKey(key(pos) + 1, 0); - } else { - pos = MakeKey(key(pos), gen(pos) + 1); - } - } - - if (!iter.Valid()) { - break; - } - - if (rnd->Next() % 2) { - iter.Next(); - pos = MakeKey(key(pos), gen(pos) + 1); - } else { - Key new_target = RandomTarget(rnd); - if (new_target > pos) { - pos = new_target; - iter.Seek(new_target); - } - } - } - } -}; -const uint32_t ConcurrentTest::K; - -// Simple test that does single-threaded testing of the ConcurrentTest -// scaffolding. 
-TEST(SkipTest, ConcurrentWithoutThreads) { - ConcurrentTest test; - Random rnd(test::RandomSeed()); - for (int i = 0; i < 10000; i++) { - test.ReadStep(&rnd); - test.WriteStep(&rnd); - } -} - -class TestState { - public: - ConcurrentTest t_; - int seed_; - port::AtomicPointer quit_flag_; - - enum ReaderState { - STARTING, - RUNNING, - DONE - }; - - explicit TestState(int s) - : seed_(s), - quit_flag_(NULL), - state_(STARTING), - state_cv_(&mu_) {} - - void Wait(ReaderState s) { - mu_.Lock(); - while (state_ != s) { - state_cv_.Wait(); - } - mu_.Unlock(); - } - - void Change(ReaderState s) { - mu_.Lock(); - state_ = s; - state_cv_.Signal(); - mu_.Unlock(); - } - - private: - port::Mutex mu_; - ReaderState state_; - port::CondVar state_cv_; -}; - -static void ConcurrentReader(void* arg) { - TestState* state = reinterpret_cast(arg); - Random rnd(state->seed_); - int64_t reads = 0; - state->Change(TestState::RUNNING); - while (!state->quit_flag_.Acquire_Load()) { - state->t_.ReadStep(&rnd); - ++reads; - } - state->Change(TestState::DONE); -} - -static void RunConcurrent(int run) { - const int seed = test::RandomSeed() + (run * 100); - Random rnd(seed); - const int N = 1000; - const int kSize = 1000; - for (int i = 0; i < N; i++) { - if ((i % 100) == 0) { - fprintf(stderr, "Run %d of %d\n", i, N); - } - TestState state(seed + 1); - Env::Default()->Schedule(ConcurrentReader, &state); - state.Wait(TestState::RUNNING); - for (int i = 0; i < kSize; i++) { - state.t_.WriteStep(&rnd); - } - state.quit_flag_.Release_Store(&state); // Any non-NULL arg will do - state.Wait(TestState::DONE); - } -} - -TEST(SkipTest, Concurrent1) { RunConcurrent(1); } -TEST(SkipTest, Concurrent2) { RunConcurrent(2); } -TEST(SkipTest, Concurrent3) { RunConcurrent(3); } -TEST(SkipTest, Concurrent4) { RunConcurrent(4); } -TEST(SkipTest, Concurrent5) { RunConcurrent(5); } - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/snapshot.h b/leveldb/db/snapshot.h deleted file mode 100644 index 9a90756..0000000 --- a/leveldb/db/snapshot.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_ -#define STORAGE_LEVELDB_DB_SNAPSHOT_H_ - -#include "leveldb/db.h" - -namespace leveldb { - -class SnapshotList; - -// Snapshots are kept in a doubly-linked list in the DB. -// Each Snapshot corresponds to a particular sequence number. 
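Because sequence numbers only grow, New() always links at the tail, so the list stays ordered oldest-to-newest without any searching, and the dummy head spares every operation a NULL check. A usage sketch against the class below:

  // Snapshots enter in sequence order and can be released in any order.
  SnapshotList snapshots;
  const Snapshot* s1 = snapshots.New(100);
  const Snapshot* s2 = snapshots.New(200);
  assert(snapshots.oldest()->number_ == 100);
  assert(snapshots.newest()->number_ == 200);
  snapshots.Delete(s1);             // unlink and free
  assert(snapshots.oldest() == s2);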
-class Snapshot { - public: - SequenceNumber number_; // const after creation - - private: - friend class SnapshotList; - - // Snapshot is kept in a doubly-linked circular list - Snapshot* prev_; - Snapshot* next_; - - SnapshotList* list_; // just for sanity checks -}; - -class SnapshotList { - public: - SnapshotList() { - list_.prev_ = &list_; - list_.next_ = &list_; - } - - bool empty() const { return list_.next_ == &list_; } - Snapshot* oldest() const { assert(!empty()); return list_.next_; } - Snapshot* newest() const { assert(!empty()); return list_.prev_; } - - const Snapshot* New(SequenceNumber seq) { - Snapshot* s = new Snapshot; - s->number_ = seq; - s->list_ = this; - s->next_ = &list_; - s->prev_ = list_.prev_; - s->prev_->next_ = s; - s->next_->prev_ = s; - return s; - } - - void Delete(const Snapshot* s) { - assert(s->list_ == this); - s->prev_->next_ = s->next_; - s->next_->prev_ = s->prev_; - delete s; - } - - private: - // Dummy head of doubly-linked list of snapshots - Snapshot list_; -}; - -} - -#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_ diff --git a/leveldb/db/table_cache.cc b/leveldb/db/table_cache.cc deleted file mode 100644 index 325d707..0000000 --- a/leveldb/db/table_cache.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/table_cache.h" - -#include "db/filename.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "util/coding.h" - -namespace leveldb { - -struct TableAndFile { - RandomAccessFile* file; - Table* table; -}; - -static void DeleteEntry(const Slice& key, void* value) { - TableAndFile* tf = reinterpret_cast(value); - delete tf->table; - delete tf->file; - delete tf; -} - -static void UnrefEntry(void* arg1, void* arg2) { - Cache* cache = reinterpret_cast(arg1); - Cache::Handle* h = reinterpret_cast(arg2); - cache->Release(h); -} - -TableCache::TableCache(const std::string& dbname, - const Options* options, - int entries) - : env_(options->env), - dbname_(dbname), - options_(options), - cache_(NewLRUCache(entries)) { -} - -TableCache::~TableCache() { - delete cache_; -} - -Iterator* TableCache::NewIterator(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - Table** tableptr) { - if (tableptr != NULL) { - *tableptr = NULL; - } - - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - Slice key(buf, sizeof(buf)); - Cache::Handle* handle = cache_->Lookup(key); - if (handle == NULL) { - std::string fname = TableFileName(dbname_, file_number); - RandomAccessFile* file = NULL; - Table* table = NULL; - Status s = env_->NewRandomAccessFile(fname, &file); - if (s.ok()) { - s = Table::Open(*options_, file, file_size, &table); - } - - if (!s.ok()) { - assert(table == NULL); - delete file; - // We do not cache error results so that if the error is transient, - // or somebody repairs the file, we recover automatically. 
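(NewIterator's error path resumes below.) The lookup above keys the LRU cache on the file number serialized as eight little-endian bytes. A simplified, portable sketch of the EncodeFixed64 behavior assumed here; util/coding's real version may take a faster path on little-endian targets:

#include <stdint.h>

inline void EncodeFixed64(char* dst, uint64_t value) {
  for (int i = 0; i < 8; i++) {
    dst[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}

Keying on the fixed-width encoding rather than a formatted string keeps every cache key exactly eight bytes, so hashing and comparison stay cheap.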
- return NewErrorIterator(s); - } - - TableAndFile* tf = new TableAndFile; - tf->file = file; - tf->table = table; - handle = cache_->Insert(key, tf, 1, &DeleteEntry); - } - - Table* table = reinterpret_cast(cache_->Value(handle))->table; - Iterator* result = table->NewIterator(options); - result->RegisterCleanup(&UnrefEntry, cache_, handle); - if (tableptr != NULL) { - *tableptr = table; - } - return result; -} - -void TableCache::Evict(uint64_t file_number) { - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - cache_->Erase(Slice(buf, sizeof(buf))); -} - -} diff --git a/leveldb/db/table_cache.h b/leveldb/db/table_cache.h deleted file mode 100644 index 5376194..0000000 --- a/leveldb/db/table_cache.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Thread-safe (provides internal synchronization) - -#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_ -#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_ - -#include -#include -#include "db/dbformat.h" -#include "leveldb/cache.h" -#include "leveldb/table.h" -#include "port/port.h" - -namespace leveldb { - -class Env; - -class TableCache { - public: - TableCache(const std::string& dbname, const Options* options, int entries); - ~TableCache(); - - // Return an iterator for the specified file number (the corresponding - // file length must be exactly "file_size" bytes). If "tableptr" is - // non-NULL, also sets "*tableptr" to point to the Table object - // underlying the returned iterator, or NULL if no Table object underlies - // the returned iterator. The returned "*tableptr" object is owned by - // the cache and should not be deleted, and is valid for as long as the - // returned iterator is live. - Iterator* NewIterator(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - Table** tableptr = NULL); - - // Evict any entry for the specified file number - void Evict(uint64_t file_number); - - private: - Env* const env_; - const std::string dbname_; - const Options* options_; - Cache* cache_; -}; - -} - -#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_ diff --git a/leveldb/db/version_edit.cc b/leveldb/db/version_edit.cc deleted file mode 100644 index 3941271..0000000 --- a/leveldb/db/version_edit.cc +++ /dev/null @@ -1,268 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_edit.h" - -#include "db/version_set.h" -#include "util/coding.h" - -namespace leveldb { - -// Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. 
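(The tag list follows.) EncodeTo and DecodeFrom below serialize each field as a varint32 tag followed by its payload, either another varint or a length-prefixed slice. A sketch of the base-128 varint writer this format relies on, mirroring the assumed behavior of util/coding's PutVarint32:

#include <stdint.h>
#include <string>

void PutVarint32(std::string* dst, uint32_t v) {
  // 7 payload bits per byte; the high bit marks "more bytes follow".
  while (v >= 128) {
    dst->push_back(static_cast<char>((v & 127) | 128));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

An unrecognized tag aborts decoding with the "unknown tag" corruption below, which is why the comment above insists these numbers are frozen once written to disk.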
-enum Tag { - kComparator = 1, - kLogNumber = 2, - kNextFileNumber = 3, - kLastSequence = 4, - kCompactPointer = 5, - kDeletedFile = 6, - kNewFile = 7, - // 8 was used for large value refs - kPrevLogNumber = 9, -}; - -void VersionEdit::Clear() { - comparator_.clear(); - log_number_ = 0; - prev_log_number_ = 0; - last_sequence_ = 0; - next_file_number_ = 0; - has_comparator_ = false; - has_log_number_ = false; - has_prev_log_number_ = false; - has_next_file_number_ = false; - has_last_sequence_ = false; - deleted_files_.clear(); - new_files_.clear(); -} - -void VersionEdit::EncodeTo(std::string* dst) const { - if (has_comparator_) { - PutVarint32(dst, kComparator); - PutLengthPrefixedSlice(dst, comparator_); - } - if (has_log_number_) { - PutVarint32(dst, kLogNumber); - PutVarint64(dst, log_number_); - } - if (has_prev_log_number_) { - PutVarint32(dst, kPrevLogNumber); - PutVarint64(dst, prev_log_number_); - } - if (has_next_file_number_) { - PutVarint32(dst, kNextFileNumber); - PutVarint64(dst, next_file_number_); - } - if (has_last_sequence_) { - PutVarint32(dst, kLastSequence); - PutVarint64(dst, last_sequence_); - } - - for (size_t i = 0; i < compact_pointers_.size(); i++) { - PutVarint32(dst, kCompactPointer); - PutVarint32(dst, compact_pointers_[i].first); // level - PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); - } - - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { - PutVarint32(dst, kDeletedFile); - PutVarint32(dst, iter->first); // level - PutVarint64(dst, iter->second); // file number - } - - for (size_t i = 0; i < new_files_.size(); i++) { - const FileMetaData& f = new_files_[i].second; - PutVarint32(dst, kNewFile); - PutVarint32(dst, new_files_[i].first); // level - PutVarint64(dst, f.number); - PutVarint64(dst, f.file_size); - PutLengthPrefixedSlice(dst, f.smallest.Encode()); - PutLengthPrefixedSlice(dst, f.largest.Encode()); - } -} - -static bool GetInternalKey(Slice* input, InternalKey* dst) { - Slice str; - if (GetLengthPrefixedSlice(input, &str)) { - dst->DecodeFrom(str); - return true; - } else { - return false; - } -} - -static bool GetLevel(Slice* input, int* level) { - uint32_t v; - if (GetVarint32(input, &v) && - v < config::kNumLevels) { - *level = v; - return true; - } else { - return false; - } -} - -Status VersionEdit::DecodeFrom(const Slice& src) { - Clear(); - Slice input = src; - const char* msg = NULL; - uint32_t tag; - - // Temporary storage for parsing - int level; - uint64_t number; - FileMetaData f; - Slice str; - InternalKey key; - - while (msg == NULL && GetVarint32(&input, &tag)) { - switch (tag) { - case kComparator: - if (GetLengthPrefixedSlice(&input, &str)) { - comparator_ = str.ToString(); - has_comparator_ = true; - } else { - msg = "comparator name"; - } - break; - - case kLogNumber: - if (GetVarint64(&input, &log_number_)) { - has_log_number_ = true; - } else { - msg = "log number"; - } - break; - - case kPrevLogNumber: - if (GetVarint64(&input, &prev_log_number_)) { - has_prev_log_number_ = true; - } else { - msg = "previous log number"; - } - break; - - case kNextFileNumber: - if (GetVarint64(&input, &next_file_number_)) { - has_next_file_number_ = true; - } else { - msg = "next file number"; - } - break; - - case kLastSequence: - if (GetVarint64(&input, &last_sequence_)) { - has_last_sequence_ = true; - } else { - msg = "last sequence number"; - } - break; - - case kCompactPointer: - if (GetLevel(&input, &level) && - GetInternalKey(&input, &key)) { - 
compact_pointers_.push_back(std::make_pair(level, key)); - } else { - msg = "compaction pointer"; - } - break; - - case kDeletedFile: - if (GetLevel(&input, &level) && - GetVarint64(&input, &number)) { - deleted_files_.insert(std::make_pair(level, number)); - } else { - msg = "deleted file"; - } - break; - - case kNewFile: - if (GetLevel(&input, &level) && - GetVarint64(&input, &f.number) && - GetVarint64(&input, &f.file_size) && - GetInternalKey(&input, &f.smallest) && - GetInternalKey(&input, &f.largest)) { - new_files_.push_back(std::make_pair(level, f)); - } else { - msg = "new-file entry"; - } - break; - - default: - msg = "unknown tag"; - break; - } - } - - if (msg == NULL && !input.empty()) { - msg = "invalid tag"; - } - - Status result; - if (msg != NULL) { - result = Status::Corruption("VersionEdit", msg); - } - return result; -} - -std::string VersionEdit::DebugString() const { - std::string r; - r.append("VersionEdit {"); - if (has_comparator_) { - r.append("\n Comparator: "); - r.append(comparator_); - } - if (has_log_number_) { - r.append("\n LogNumber: "); - AppendNumberTo(&r, log_number_); - } - if (has_prev_log_number_) { - r.append("\n PrevLogNumber: "); - AppendNumberTo(&r, prev_log_number_); - } - if (has_next_file_number_) { - r.append("\n NextFile: "); - AppendNumberTo(&r, next_file_number_); - } - if (has_last_sequence_) { - r.append("\n LastSeq: "); - AppendNumberTo(&r, last_sequence_); - } - for (size_t i = 0; i < compact_pointers_.size(); i++) { - r.append("\n CompactPointer: "); - AppendNumberTo(&r, compact_pointers_[i].first); - r.append(" '"); - AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode()); - r.append("'"); - } - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { - r.append("\n DeleteFile: "); - AppendNumberTo(&r, iter->first); - r.append(" "); - AppendNumberTo(&r, iter->second); - } - for (size_t i = 0; i < new_files_.size(); i++) { - const FileMetaData& f = new_files_[i].second; - r.append("\n AddFile: "); - AppendNumberTo(&r, new_files_[i].first); - r.append(" "); - AppendNumberTo(&r, f.number); - r.append(" "); - AppendNumberTo(&r, f.file_size); - r.append(" '"); - AppendEscapedStringTo(&r, f.smallest.Encode()); - r.append("' .. '"); - AppendEscapedStringTo(&r, f.largest.Encode()); - r.append("'"); - } - r.append("\n}\n"); - return r; -} - -} diff --git a/leveldb/db/version_edit.h b/leveldb/db/version_edit.h deleted file mode 100644 index ab874da..0000000 --- a/leveldb/db/version_edit.h +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_ -#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_ - -#include <set> -#include <utility> -#include <vector> -#include "db/dbformat.h" - -namespace leveldb { - -class VersionSet; - -struct FileMetaData { - int refs; - uint64_t number; - uint64_t file_size; // File size in bytes - InternalKey smallest; // Smallest internal key served by table - InternalKey largest; // Largest internal key served by table - - FileMetaData() : refs(0), file_size(0) { } -}; - -class VersionEdit { - public: - VersionEdit() { Clear(); } - ~VersionEdit() { } - - void Clear(); - - void SetComparatorName(const Slice& name) { - has_comparator_ = true; - comparator_ = name.ToString(); - } - void SetLogNumber(uint64_t num) { - has_log_number_ = true; - log_number_ = num; - } - void SetPrevLogNumber(uint64_t num) { - has_prev_log_number_ = true; - prev_log_number_ = num; - } - void SetNextFile(uint64_t num) { - has_next_file_number_ = true; - next_file_number_ = num; - } - void SetLastSequence(SequenceNumber seq) { - has_last_sequence_ = true; - last_sequence_ = seq; - } - void SetCompactPointer(int level, const InternalKey& key) { - compact_pointers_.push_back(std::make_pair(level, key)); - } - - // Add the specified file at the specified number. - // REQUIRES: This version has not been saved (see VersionSet::SaveTo) - // REQUIRES: "smallest" and "largest" are smallest and largest keys in file - void AddFile(int level, uint64_t file, - uint64_t file_size, - const InternalKey& smallest, - const InternalKey& largest) { - FileMetaData f; - f.number = file; - f.file_size = file_size; - f.smallest = smallest; - f.largest = largest; - new_files_.push_back(std::make_pair(level, f)); - } - - // Delete the specified "file" from the specified "level". - void DeleteFile(int level, uint64_t file) { - deleted_files_.insert(std::make_pair(level, file)); - } - - void EncodeTo(std::string* dst) const; - Status DecodeFrom(const Slice& src); - - std::string DebugString() const; - - private: - friend class VersionSet; - - typedef std::set< std::pair<int, uint64_t> > DeletedFileSet; - - std::string comparator_; - uint64_t log_number_; - uint64_t prev_log_number_; - uint64_t next_file_number_; - SequenceNumber last_sequence_; - bool has_comparator_; - bool has_log_number_; - bool has_prev_log_number_; - bool has_next_file_number_; - bool has_last_sequence_; - - std::vector< std::pair<int, InternalKey> > compact_pointers_; - DeletedFileSet deleted_files_; - std::vector< std::pair<int, FileMetaData> > new_files_; -}; - -} - -#endif  // STORAGE_LEVELDB_DB_VERSION_EDIT_H_ diff --git a/leveldb/db/version_edit_test.cc b/leveldb/db/version_edit_test.cc deleted file mode 100644 index 67959f7..0000000 --- a/leveldb/db/version_edit_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors.
- -#include "db/version_edit.h" -#include "util/testharness.h" - -namespace leveldb { - -static void TestEncodeDecode(const VersionEdit& edit) { - std::string encoded, encoded2; - edit.EncodeTo(&encoded); - VersionEdit parsed; - Status s = parsed.DecodeFrom(encoded); - ASSERT_TRUE(s.ok()) << s.ToString(); - parsed.EncodeTo(&encoded2); - ASSERT_EQ(encoded, encoded2); -} - -class VersionEditTest { }; - -TEST(VersionEditTest, EncodeDecode) { - static const uint64_t kBig = 1ull << 50; - - VersionEdit edit; - for (int i = 0; i < 4; i++) { - TestEncodeDecode(edit); - edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, - InternalKey("foo", kBig + 500 + i, kTypeValue), - InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); - edit.DeleteFile(4, kBig + 700 + i); - edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); - } - - edit.SetComparatorName("foo"); - edit.SetLogNumber(kBig + 100); - edit.SetNextFile(kBig + 200); - edit.SetLastSequence(kBig + 1000); - TestEncodeDecode(edit); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/db/version_set.cc b/leveldb/db/version_set.cc deleted file mode 100644 index c439f49..0000000 --- a/leveldb/db/version_set.cc +++ /dev/null @@ -1,1027 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_set.h" - -#include -#include -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "leveldb/env.h" -#include "leveldb/table_builder.h" -#include "table/merger.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -static const int kTargetFileSize = 2 * 1048576; - -// Maximum bytes of overlaps in grandparent (i.e., level+2) before we -// stop building a single file in a level->level+1 compaction. -static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize; - -static double MaxBytesForLevel(int level) { - // Note: the result for level zero is not really used since we set - // the level-0 compaction threshold based on number of files. - double result = 10 * 1048576.0; // Result for both level-0 and level-1 - while (level > 1) { - result *= 10; - level--; - } - return result; -} - -static uint64_t MaxFileSizeForLevel(int level) { - return kTargetFileSize; // We could vary per level to reduce number of files? -} - -namespace { -std::string IntSetToString(const std::set& s) { - std::string result = "{"; - for (std::set::const_iterator it = s.begin(); - it != s.end(); - ++it) { - result += (result.size() > 1) ? "," : ""; - result += NumberToString(*it); - } - result += "}"; - return result; -} -} - -Version::~Version() { - assert(refs_ == 0); - for (int level = 0; level < config::kNumLevels; level++) { - for (size_t i = 0; i < files_[level].size(); i++) { - FileMetaData* f = files_[level][i]; - assert(f->refs >= 0); - f->refs--; - if (f->refs <= 0) { - delete f; - } - } - } - delete cleanup_mem_; -} - -// An internal iterator. For a given version/level pair, yields -// information about the files in the level. For a given entry, key() -// is the largest key that occurs in the file, and value() is an -// 16-byte value containing the file number and file size, both -// encoded using EncodeFixed64. 
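(The iterator class follows.) Its value() format, a 16-byte buffer holding the file number and then the file size, is what GetFileIterator later unpacks. A sketch of that decoding under the same little-endian fixed64 assumption:

#include <stdint.h>

inline uint64_t DecodeFixed64(const char* p) {
  uint64_t v = 0;
  for (int i = 7; i >= 0; i--) {
    v = (v << 8) | static_cast<uint8_t>(p[i]);
  }
  return v;
}

struct FileHandle {
  uint64_t number;  // bytes 0-7
  uint64_t size;    // bytes 8-15
};

inline FileHandle DecodeFileValue(const char* value16) {
  FileHandle h;
  h.number = DecodeFixed64(value16);
  h.size = DecodeFixed64(value16 + 8);
  return h;
}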
-class Version::LevelFileNumIterator : public Iterator { - public: - LevelFileNumIterator(const Version* version, - const std::vector* flist) - : icmp_(version->vset_->icmp_.user_comparator()), - flist_(flist), - index_(flist->size()) { // Marks as invalid - } - virtual bool Valid() const { - return index_ < flist_->size(); - } - virtual void Seek(const Slice& target) { - uint32_t left = 0; - uint32_t right = flist_->size() - 1; - while (left < right) { - uint32_t mid = (left + right) / 2; - int cmp = icmp_.Compare((*flist_)[mid]->largest.Encode(), target); - if (cmp < 0) { - // Key at "mid.largest" is < than "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - index_ = left; - } - virtual void SeekToFirst() { index_ = 0; } - virtual void SeekToLast() { - index_ = flist_->empty() ? 0 : flist_->size() - 1; - } - virtual void Next() { - assert(Valid()); - index_++; - } - virtual void Prev() { - assert(Valid()); - if (index_ == 0) { - index_ = flist_->size(); // Marks as invalid - } else { - index_--; - } - } - Slice key() const { - assert(Valid()); - return (*flist_)[index_]->largest.Encode(); - } - Slice value() const { - assert(Valid()); - EncodeFixed64(value_buf_, (*flist_)[index_]->number); - EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size); - return Slice(value_buf_, sizeof(value_buf_)); - } - virtual Status status() const { return Status::OK(); } - private: - const InternalKeyComparator icmp_; - const std::vector* const flist_; - uint32_t index_; - - // Backing store for value(). Holds the file number and size. - mutable char value_buf_[16]; -}; - -static Iterator* GetFileIterator(void* arg, - const ReadOptions& options, - const Slice& file_value) { - TableCache* cache = reinterpret_cast(arg); - if (file_value.size() != 16) { - return NewErrorIterator( - Status::Corruption("FileReader invoked with unexpected value")); - } else { - return cache->NewIterator(options, - DecodeFixed64(file_value.data()), - DecodeFixed64(file_value.data() + 8)); - } -} - -Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, - int level) const { - return NewTwoLevelIterator( - new LevelFileNumIterator(this, &files_[level]), - &GetFileIterator, vset_->table_cache_, options); -} - -void Version::AddIterators(const ReadOptions& options, - std::vector* iters) { - // Merge all level zero files together since they may overlap - for (size_t i = 0; i < files_[0].size(); i++) { - iters->push_back( - vset_->table_cache_->NewIterator( - options, files_[0][i]->number, files_[0][i]->file_size)); - } - - // For levels > 0, we can use a concatenating iterator that sequentially - // walks through the non-overlapping files in the level, opening them - // lazily. - for (int level = 1; level < config::kNumLevels; level++) { - if (!files_[level].empty()) { - iters->push_back(NewConcatenatingIterator(options, level)); - } - } -} - -void Version::Ref() { - ++refs_; -} - -void Version::Unref() { - assert(refs_ >= 1); - --refs_; - if (refs_ == 0) { - vset_->MaybeDeleteOldVersions(); - // TODO: try to delete obsolete files - } -} - -std::string Version::DebugString() const { - std::string r; - for (int level = 0; level < config::kNumLevels; level++) { - // E.g., level 1: 17:123['a' .. 'd'] 20:43['e' .. 
'g'] - r.append("level "); - AppendNumberTo(&r, level); - r.push_back(':'); - const std::vector& files = files_[level]; - for (size_t i = 0; i < files.size(); i++) { - r.push_back(' '); - AppendNumberTo(&r, files[i]->number); - r.push_back(':'); - AppendNumberTo(&r, files[i]->file_size); - r.append("['"); - AppendEscapedStringTo(&r, files[i]->smallest.Encode()); - r.append("' .. '"); - AppendEscapedStringTo(&r, files[i]->largest.Encode()); - r.append("']"); - } - r.push_back('\n'); - } - return r; -} - -// A helper class so we can efficiently apply a whole sequence -// of edits to a particular state without creating intermediate -// Versions that contain full copies of the intermediate state. -class VersionSet::Builder { - private: - typedef std::map FileMap; - VersionSet* vset_; - FileMap files_[config::kNumLevels]; - - public: - // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) - : vset_(vset) { - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = base->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - FileMetaData* f = files[i]; - f->refs++; - files_[level].insert(std::make_pair(f->number, f)); - } - } - } - - ~Builder() { - for (int level = 0; level < config::kNumLevels; level++) { - const FileMap& fmap = files_[level]; - for (FileMap::const_iterator iter = fmap.begin(); - iter != fmap.end(); - ++iter) { - FileMetaData* f = iter->second; - f->refs--; - if (f->refs <= 0) { - delete f; - } - } - } - } - - // Apply all of the edits in *edit to the current state. - void Apply(VersionEdit* edit) { - // Update compaction pointers - for (size_t i = 0; i < edit->compact_pointers_.size(); i++) { - const int level = edit->compact_pointers_[i].first; - vset_->compact_pointer_[level] = - edit->compact_pointers_[i].second.Encode().ToString(); - } - - // Delete files - const VersionEdit::DeletedFileSet& del = edit->deleted_files_; - for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); - iter != del.end(); - ++iter) { - const int level = iter->first; - const uint64_t number = iter->second; - FileMap::iterator fiter = files_[level].find(number); - assert(fiter != files_[level].end()); // Sanity check for debug mode - if (fiter != files_[level].end()) { - FileMetaData* f = fiter->second; - f->refs--; - if (f->refs <= 0) { - delete f; - } - files_[level].erase(fiter); - } - } - - // Add new files - for (size_t i = 0; i < edit->new_files_.size(); i++) { - const int level = edit->new_files_[i].first; - FileMetaData* f = new FileMetaData(edit->new_files_[i].second); - f->refs = 1; - assert(files_[level].count(f->number) == 0); - files_[level].insert(std::make_pair(f->number, f)); - } - } - - // Save the current state in *v. 
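(SaveTo follows.) The Builder above represents each level as a map from file number to metadata, so an arbitrary run of edits collapses into one pass of erases and inserts with no intermediate Version materialized. A single-level sketch of the apply step, with refcounting omitted and simplified types assumed:

#include <stdint.h>
#include <map>
#include <set>
#include <vector>

struct Meta { uint64_t number; };

void ApplyEdit(std::map<uint64_t, Meta>* files,
               const std::set<uint64_t>& deleted,
               const std::vector<Meta>& added) {
  // Deletions first, then additions, matching Builder::Apply's order.
  for (std::set<uint64_t>::const_iterator it = deleted.begin();
       it != deleted.end(); ++it) {
    files->erase(*it);
  }
  for (size_t i = 0; i < added.size(); i++) {
    (*files)[added[i].number] = added[i];
  }
}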
- void SaveTo(Version* v) { - for (int level = 0; level < config::kNumLevels; level++) { - const FileMap& fmap = files_[level]; - for (FileMap::const_iterator iter = fmap.begin(); - iter != fmap.end(); - ++iter) { - FileMetaData* f = iter->second; - f->refs++; - v->files_[level].push_back(f); - } - } - } -}; - -VersionSet::VersionSet(const std::string& dbname, - const Options* options, - TableCache* table_cache, - const InternalKeyComparator* cmp) - : env_(options->env), - dbname_(dbname), - options_(options), - table_cache_(table_cache), - icmp_(*cmp), - next_file_number_(2), - manifest_file_number_(0), // Filled by Recover() - last_sequence_(0), - log_number_(0), - prev_log_number_(0), - descriptor_file_(NULL), - descriptor_log_(NULL), - current_(new Version(this)), - oldest_(current_) { -} - -VersionSet::~VersionSet() { - for (Version* v = oldest_; v != NULL; ) { - Version* next = v->next_; - assert(v->refs_ == 0); - delete v; - v = next; - } - delete descriptor_log_; - delete descriptor_file_; -} - -Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { - if (edit->has_log_number_) { - assert(edit->log_number_ >= log_number_); - assert(edit->log_number_ < next_file_number_); - } else { - edit->SetLogNumber(log_number_); - } - - if (!edit->has_prev_log_number_) { - edit->SetPrevLogNumber(prev_log_number_); - } - - edit->SetNextFile(next_file_number_); - edit->SetLastSequence(last_sequence_); - - Version* v = new Version(this); - { - Builder builder(this, current_); - builder.Apply(edit); - builder.SaveTo(v); - } - - std::string new_manifest_file; - Status s = Finalize(v); - - // Initialize new descriptor log file if necessary by creating - // a temporary file that contains a snapshot of the current version. - if (s.ok()) { - if (descriptor_log_ == NULL) { - assert(descriptor_file_ == NULL); - new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); - edit->SetNextFile(next_file_number_); - s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); - if (s.ok()) { - descriptor_log_ = new log::Writer(descriptor_file_); - s = WriteSnapshot(descriptor_log_); - } - } - } - - // Write new record to MANIFEST log - if (s.ok()) { - std::string record; - edit->EncodeTo(&record); - s = descriptor_log_->AddRecord(record); - if (s.ok()) { - s = descriptor_file_->Sync(); - } - } - - // If we just created a new descriptor file, install it by writing a - // new CURRENT file that points to it. 
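(The install and rollback logic follows.) SetCurrentFile's assumed contract, consistent with the newline check in Recover() above: CURRENT holds the manifest file's name plus a trailing newline, and is replaced by writing a temporary file and renaming it, so a crash never leaves a half-written pointer. A POSIX-flavored sketch; the temporary-file name here is hypothetical:

#include <cstdio>
#include <string>

bool WriteCurrentFile(const std::string& dbname,
                      const std::string& manifest_name) {
  const std::string tmp = dbname + "/CURRENT.dbtmp";  // hypothetical name
  std::FILE* f = std::fopen(tmp.c_str(), "w");
  if (f == NULL) return false;
  const std::string contents = manifest_name + "\n";
  bool ok = std::fwrite(contents.data(), 1, contents.size(), f) ==
            contents.size();
  if (std::fclose(f) != 0) ok = false;
  if (!ok) {
    std::remove(tmp.c_str());
    return false;
  }
  // rename() replaces the target atomically on POSIX filesystems.
  return std::rename(tmp.c_str(), (dbname + "/CURRENT").c_str()) == 0;
}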
- if (s.ok() && !new_manifest_file.empty()) { - s = SetCurrentFile(env_, dbname_, manifest_file_number_); - } - - // Install the new version - if (s.ok()) { - assert(current_->next_ == NULL); - assert(current_->cleanup_mem_ == NULL); - current_->cleanup_mem_ = cleanup_mem; - v->next_ = NULL; - current_->next_ = v; - current_ = v; - log_number_ = edit->log_number_; - prev_log_number_ = edit->prev_log_number_; - } else { - delete v; - if (!new_manifest_file.empty()) { - delete descriptor_log_; - delete descriptor_file_; - descriptor_log_ = NULL; - descriptor_file_ = NULL; - env_->DeleteFile(new_manifest_file); - } - } - - return s; -} - -Status VersionSet::Recover() { - struct LogReporter : public log::Reader::Reporter { - Status* status; - virtual void Corruption(size_t bytes, const Status& s) { - if (this->status->ok()) *this->status = s; - } - }; - - // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); - if (!s.ok()) { - return s; - } - if (current.empty() || current[current.size()-1] != '\n') { - return Status::Corruption("CURRENT file does not end with newline"); - } - current.resize(current.size() - 1); - - std::string dscname = dbname_ + "/" + current; - SequentialFile* file; - s = env_->NewSequentialFile(dscname, &file); - if (!s.ok()) { - return s; - } - - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t prev_log_number = 0; - Builder builder(this, current_); - - { - LogReporter reporter; - reporter.status = &s; - log::Reader reader(file, &reporter, true/*checksum*/); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument( - edit.comparator_ + "does not match existing comparator ", - icmp_.user_comparator()->Name()); - } - } - - if (s.ok()) { - builder.Apply(&edit); - } - - if (edit.has_log_number_) { - log_number = edit.log_number_; - have_log_number = true; - } - - if (edit.has_prev_log_number_) { - prev_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; - } - } - } - delete file; - file = NULL; - - if (s.ok()) { - if (!have_next_file) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - } else if (!have_log_number) { - s = Status::Corruption("no meta-lognumber entry in descriptor"); - } else if (!have_last_sequence) { - s = Status::Corruption("no last-sequence-number entry in descriptor"); - } - - if (!have_prev_log_number) { - prev_log_number = 0; - } - } - - if (s.ok()) { - Version* v = new Version(this); - builder.SaveTo(v); - s = Finalize(v); - if (!s.ok()) { - delete v; - } else { - // Install recovered version - v->next_ = NULL; - current_->next_ = v; - current_ = v; - manifest_file_number_ = next_file; - next_file_number_ = next_file + 1; - last_sequence_ = last_sequence; - log_number_ = log_number; - prev_log_number_ = prev_log_number; - } - } - - return s; -} - -static int64_t TotalFileSize(const std::vector& files) { - 
int64_t sum = 0; - for (size_t i = 0; i < files.size(); i++) { - sum += files[i]->file_size; - } - return sum; -} - -Status VersionSet::Finalize(Version* v) { - // Precomputed best level for next compaction - int best_level = -1; - double best_score = -1; - - Status s; - for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) { - s = SortLevel(v, level); - - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). - score = v->files_[level].size() / 4.0; - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]); - score = static_cast(level_bytes) / MaxBytesForLevel(level); - } - - if (score > best_score) { - best_level = level; - best_score = score; - } - } - - v->compaction_level_ = best_level; - v->compaction_score_ = best_score; - return s; -} - -Status VersionSet::WriteSnapshot(log::Writer* log) { - // TODO: Break up into multiple records to reduce memory usage on recovery? - - // Save metadata - VersionEdit edit; - edit.SetComparatorName(icmp_.user_comparator()->Name()); - - // Save compaction pointers - for (int level = 0; level < config::kNumLevels; level++) { - if (!compact_pointer_[level].empty()) { - InternalKey key; - key.DecodeFrom(compact_pointer_[level]); - edit.SetCompactPointer(level, key); - } - } - - // Save files - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = current_->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - const FileMetaData* f = files[i]; - edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest); - } - } - - std::string record; - edit.EncodeTo(&record); - return log->AddRecord(record); -} - -// Helper to sort by tables_[file_number].smallest -struct VersionSet::BySmallestKey { - const InternalKeyComparator* internal_comparator; - - bool operator()(FileMetaData* f1, FileMetaData* f2) const { - return internal_comparator->Compare(f1->smallest, f2->smallest) < 0; - } -}; - -Status VersionSet::SortLevel(Version* v, uint64_t level) { - Status result; - BySmallestKey cmp; - cmp.internal_comparator = &icmp_; - std::sort(v->files_[level].begin(), v->files_[level].end(), cmp); - - if (result.ok() && level > 0) { - // There should be no overlap - for (size_t i = 1; i < v->files_[level].size(); i++) { - const InternalKey& prev_end = v->files_[level][i-1]->largest; - const InternalKey& this_begin = v->files_[level][i]->smallest; - if (icmp_.Compare(prev_end, this_begin) >= 0) { - result = Status::Corruption( - "overlapping ranges in same level", - (EscapeString(prev_end.Encode()) + " vs. 
" + - EscapeString(this_begin.Encode()))); - break; - } - } - } - return result; -} - -int VersionSet::NumLevelFiles(int level) const { - assert(level >= 0); - assert(level < config::kNumLevels); - return current_->files_[level].size(); -} - -uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { - uint64_t result = 0; - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - if (icmp_.Compare(files[i]->largest, ikey) <= 0) { - // Entire file is before "ikey", so just add the file size - result += files[i]->file_size; - } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { - // Entire file is after "ikey", so ignore - if (level > 0) { - // Files other than level 0 are sorted by meta->smallest, so - // no further files in this level will contain data for - // "ikey". - break; - } - } else { - // "ikey" falls in the range for this table. Add the - // approximate offset of "ikey" within the table. - Table* tableptr; - Iterator* iter = table_cache_->NewIterator( - ReadOptions(), files[i]->number, files[i]->file_size, &tableptr); - if (tableptr != NULL) { - result += tableptr->ApproximateOffsetOf(ikey.Encode()); - } - delete iter; - } - } - } - return result; -} - -void VersionSet::MaybeDeleteOldVersions() { - // Note: it is important to delete versions in order since a newer - // version with zero refs may be holding a pointer to a memtable - // that is used by somebody who has a ref on an older version. - while (oldest_ != current_ && oldest_->refs_ == 0) { - Version* next = oldest_->next_; - delete oldest_; - oldest_ = next; - } -} - -void VersionSet::AddLiveFiles(std::set* live) { - for (Version* v = oldest_; v != NULL; v = v->next_) { - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - live->insert(files[i]->number); - } - } - } -} - -int64_t VersionSet::NumLevelBytes(int level) const { - assert(level >= 0); - assert(level < config::kNumLevels); - return TotalFileSize(current_->files_[level]); -} - -int64_t VersionSet::MaxNextLevelOverlappingBytes() { - int64_t result = 0; - std::vector overlaps; - for (int level = 0; level < config::kNumLevels - 1; level++) { - for (size_t i = 0; i < current_->files_[level].size(); i++) { - const FileMetaData* f = current_->files_[level][i]; - GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); - const int64_t sum = TotalFileSize(overlaps); - if (sum > result) { - result = sum; - } - } - } - return result; -} - -// Store in "*inputs" all files in "level" that overlap [begin,end] -void VersionSet::GetOverlappingInputs( - int level, - const InternalKey& begin, - const InternalKey& end, - std::vector* inputs) { - inputs->clear(); - Slice user_begin = begin.user_key(); - Slice user_end = end.user_key(); - const Comparator* user_cmp = icmp_.user_comparator(); - for (size_t i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 || - user_cmp->Compare(f->smallest.user_key(), user_end) > 0) { - // Either completely before or after range; skip it - } else { - inputs->push_back(f); - } - } -} - -// Stores the minimal range that covers all entries in inputs in -// *smallest, *largest. 
-// REQUIRES: inputs is not empty -void VersionSet::GetRange(const std::vector& inputs, - InternalKey* smallest, - InternalKey* largest) { - assert(!inputs.empty()); - smallest->Clear(); - largest->Clear(); - for (size_t i = 0; i < inputs.size(); i++) { - FileMetaData* f = inputs[i]; - if (i == 0) { - *smallest = f->smallest; - *largest = f->largest; - } else { - if (icmp_.Compare(f->smallest, *smallest) < 0) { - *smallest = f->smallest; - } - if (icmp_.Compare(f->largest, *largest) > 0) { - *largest = f->largest; - } - } - } -} - -// Stores the minimal range that covers all entries in inputs1 and inputs2 -// in *smallest, *largest. -// REQUIRES: inputs is not empty -void VersionSet::GetRange2(const std::vector& inputs1, - const std::vector& inputs2, - InternalKey* smallest, - InternalKey* largest) { - std::vector all = inputs1; - all.insert(all.end(), inputs2.begin(), inputs2.end()); - GetRange(all, smallest, largest); -} - -Iterator* VersionSet::MakeInputIterator(Compaction* c) { - ReadOptions options; - options.verify_checksums = options_->paranoid_checks; - options.fill_cache = false; - - // Level-0 files have to be merged together. For other levels, - // we will make a concatenating iterator per level. - // TODO(opt): use concatenating iterator for level-0 if there is no overlap - const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2); - Iterator** list = new Iterator*[space]; - int num = 0; - for (int which = 0; which < 2; which++) { - if (!c->inputs_[which].empty()) { - if (c->level() + which == 0) { - const std::vector& files = c->inputs_[which]; - for (size_t i = 0; i < files.size(); i++) { - list[num++] = table_cache_->NewIterator( - options, files[i]->number, files[i]->file_size); - } - } else { - // Create concatenating iterator for the files from this level - list[num++] = NewTwoLevelIterator( - new Version::LevelFileNumIterator( - c->input_version_, &c->inputs_[which]), - &GetFileIterator, table_cache_, options); - } - } - } - assert(num <= space); - Iterator* result = NewMergingIterator(&icmp_, list, num); - delete[] list; - return result; -} - -Compaction* VersionSet::PickCompaction() { - if (!NeedsCompaction()) { - return NULL; - } - const int level = current_->compaction_level_; - assert(level >= 0); - assert(level+1 < config::kNumLevels); - - Compaction* c = new Compaction(level); - c->input_version_ = current_; - c->input_version_->Ref(); - - // Pick the first file that comes after compact_pointer_[level] - for (size_t i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (compact_pointer_[level].empty() || - icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { - c->inputs_[0].push_back(f); - break; - } - } - if (c->inputs_[0].empty()) { - // Wrap-around to the beginning of the key space - c->inputs_[0].push_back(current_->files_[level][0]); - } - - // Files in level 0 may overlap each other, so pick up all overlapping ones - if (level == 0) { - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Note that the next call will discard the file we placed in - // c->inputs_[0] earlier and replace it with an overlapping set - // which will include the picked file. 
- GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); - assert(!c->inputs_[0].empty()); - } - - SetupOtherInputs(c); - - return c; -} - -void VersionSet::SetupOtherInputs(Compaction* c) { - const int level = c->level(); - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - - GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); - - // Get entire range covered by compaction - InternalKey all_start, all_limit; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - - // See if we can grow the number of inputs in "level" without - // changing the number of "level+1" files we pick up. - if (!c->inputs_[1].empty()) { - std::vector expanded0; - GetOverlappingInputs(level, all_start, all_limit, &expanded0); - if (expanded0.size() > c->inputs_[0].size()) { - InternalKey new_start, new_limit; - GetRange(expanded0, &new_start, &new_limit); - std::vector expanded1; - GetOverlappingInputs(level+1, new_start, new_limit, &expanded1); - if (expanded1.size() == c->inputs_[1].size()) { - Log(env_, options_->info_log, - "Expanding@%d %d+%d to %d+%d\n", - level, - int(c->inputs_[0].size()), - int(c->inputs_[1].size()), - int(expanded0.size()), - int(expanded1.size())); - smallest = new_start; - largest = new_limit; - c->inputs_[0] = expanded0; - c->inputs_[1] = expanded1; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - } - } - } - - // Compute the set of grandparent files that overlap this compaction - // (parent == level+1; grandparent == level+2) - if (level + 2 < config::kNumLevels) { - GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); - } - - if (false) { - Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", - level, - EscapeString(smallest.Encode()).c_str(), - EscapeString(largest.Encode()).c_str()); - } - - // Update the place where we will do the next compaction for this level. - // We update this immediately instead of waiting for the VersionEdit - // to be applied so that if the compaction fails, we will try a different - // key range next time. - compact_pointer_[level] = largest.Encode().ToString(); - c->edit_.SetCompactPointer(level, largest); -} - -Compaction* VersionSet::CompactRange( - int level, - const InternalKey& begin, - const InternalKey& end) { - std::vector inputs; - GetOverlappingInputs(level, begin, end, &inputs); - if (inputs.empty()) { - return NULL; - } - - Compaction* c = new Compaction(level); - c->input_version_ = current_; - c->input_version_->Ref(); - c->inputs_[0] = inputs; - SetupOtherInputs(c); - return c; -} - -Compaction::Compaction(int level) - : level_(level), - max_output_file_size_(MaxFileSizeForLevel(level)), - input_version_(NULL), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0) { - for (int i = 0; i < config::kNumLevels; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - if (input_version_ != NULL) { - input_version_->Unref(); - } -} - -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. 
- return (num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - // Maybe use binary search to find right entry instead of linear search? - const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const InternalKey& key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != NULL) { - input_version_->Unref(); - input_version_ = NULL; - } -} - -} diff --git a/leveldb/db/version_set.h b/leveldb/db/version_set.h deleted file mode 100644 index e377513..0000000 --- a/leveldb/db/version_set.h +++ /dev/null @@ -1,308 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// The representation of a DBImpl consists of a set of Versions. The -// newest version is called "current". Older versions may be kept -// around to provide a consistent view to live iterators. -// -// Each Version keeps track of a set of Table files per level. The -// entire set of versions is maintained in a VersionSet. -// -// Version,VersionSet are thread-compatible, but require external -// synchronization on all accesses. - -#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ -#define STORAGE_LEVELDB_DB_VERSION_SET_H_ - -#include -#include -#include -#include "db/dbformat.h" -#include "db/version_edit.h" -#include "port/port.h" - -namespace leveldb { - -namespace log { class Writer; } - -class Compaction; -class Iterator; -class MemTable; -class TableBuilder; -class TableCache; -class Version; -class VersionSet; -class WritableFile; - -class Version { - public: - // Append to *iters a sequence of iterators that will - // yield the contents of this Version when merged together. 
- // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, std::vector* iters); - - // Reference count management (so Versions do not disappear out from - // under live iterators) - void Ref(); - void Unref(); - - // Return a human readable string that describes this version's contents. - std::string DebugString() const; - - private: - friend class Compaction; - friend class VersionSet; - - class LevelFileNumIterator; - Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; - - VersionSet* vset_; // VersionSet to which this Version belongs - Version* next_; // Next version in linked list - int refs_; // Number of live refs to this version - MemTable* cleanup_mem_; // NULL, or table to delete when version dropped - - // List of files per level - std::vector files_[config::kNumLevels]; - - // Level that should be compacted next and its compaction score. - // Score < 1 means compaction is not strictly needed. These fields - // are initialized by Finalize(). - double compaction_score_; - int compaction_level_; - - explicit Version(VersionSet* vset) - : vset_(vset), next_(NULL), refs_(0), - cleanup_mem_(NULL), - compaction_score_(-1), - compaction_level_(-1) { - } - - ~Version(); - - // No copying allowed - Version(const Version&); - void operator=(const Version&); -}; - -class VersionSet { - public: - VersionSet(const std::string& dbname, - const Options* options, - TableCache* table_cache, - const InternalKeyComparator*); - ~VersionSet(); - - // Apply *edit to the current version to form a new descriptor that - // is both saved to persistent state and installed as the new - // current version. Iff Apply() returns OK, arrange to delete - // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed - // by older versions. - Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem); - - // Recover the last saved descriptor from persistent storage. - Status Recover(); - - // Save current contents to *log - Status WriteSnapshot(log::Writer* log); - - // Return the current version. - Version* current() const { return current_; } - - // Return the current manifest file number - uint64_t ManifestFileNumber() const { return manifest_file_number_; } - - // Allocate and return a new file number - uint64_t NewFileNumber() { return next_file_number_++; } - - // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; - - // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; - - // Return the last sequence number. - uint64_t LastSequence() const { return last_sequence_; } - - // Set the last sequence number to s. - void SetLastSequence(uint64_t s) { - assert(s >= last_sequence_); - last_sequence_ = s; - } - - // Return the current log file number. - uint64_t LogNumber() const { return log_number_; } - - // Return the log file number for the log file that is currently - // being compacted, or zero if there is no such log file. - uint64_t PrevLogNumber() const { return prev_log_number_; } - - // Pick level and inputs for a new compaction. - // Returns NULL if there is no compaction to be done. - // Otherwise returns a pointer to a heap-allocated object that - // describes the compaction. Caller should delete the result. - Compaction* PickCompaction(); - - // Return a compaction object for compacting the range [begin,end] in - // the specified level. Returns NULL if there is nothing in that - // level that overlaps the specified range. 
Caller should delete - // the result. - Compaction* CompactRange( - int level, - const InternalKey& begin, - const InternalKey& end); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t MaxNextLevelOverlappingBytes(); - - // Create an iterator that reads over the compaction inputs for "*c". - // The caller should delete the iterator when no longer needed. - Iterator* MakeInputIterator(Compaction* c); - - // Returns true iff some level needs a compaction. - bool NeedsCompaction() const { return current_->compaction_score_ >= 1; } - - // Add all files listed in any live version to *live. - // May also mutate some internal state. - void AddLiveFiles(std::set* live); - - // Return the approximate offset in the database of the data for - // "key" as of version "v". - uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); - - private: - class Builder; - - friend class Compaction; - friend class Version; - - Status Finalize(Version* v); - - // Delete any old versions that are no longer needed. - void MaybeDeleteOldVersions(); - - struct BySmallestKey; - Status SortLevel(Version* v, uint64_t level); - - void GetOverlappingInputs( - int level, - const InternalKey& begin, - const InternalKey& end, - std::vector* inputs); - - void GetRange(const std::vector& inputs, - InternalKey* smallest, - InternalKey* largest); - - void GetRange2(const std::vector& inputs1, - const std::vector& inputs2, - InternalKey* smallest, - InternalKey* largest); - - void SetupOtherInputs(Compaction* c); - - Env* const env_; - const std::string dbname_; - const Options* const options_; - TableCache* const table_cache_; - const InternalKeyComparator icmp_; - uint64_t next_file_number_; - uint64_t manifest_file_number_; - uint64_t last_sequence_; - uint64_t log_number_; - uint64_t prev_log_number_; // 0 or backing store for memtable being compacted - - // Opened lazily - WritableFile* descriptor_file_; - log::Writer* descriptor_log_; - - // Versions are kept in a singly linked list that is never empty - Version* current_; // Pointer to the last (newest) list entry - Version* oldest_; // Pointer to the first (oldest) list entry - - // Per-level key at which the next compaction at that level should start. - // Either an empty string, or a valid InternalKey. - std::string compact_pointer_[config::kNumLevels]; - - // No copying allowed - VersionSet(const VersionSet&); - void operator=(const VersionSet&); -}; - -// A Compaction encapsulates information about a compaction. -class Compaction { - public: - ~Compaction(); - - // Return the level that is being compacted. Inputs from "level" - // and "level+1" will be merged to produce a set of "level+1" files. - int level() const { return level_; } - - // Return the object that holds the edits to the descriptor done - // by this compaction. - VersionEdit* edit() { return &edit_; } - - // "which" must be either 0 or 1 - int num_input_files(int which) const { return inputs_[which].size(); } - - // Return the ith input file at "level()+which" ("which" must be 0 or 1). - FileMetaData* input(int which, int i) const { return inputs_[which][i]; } - - // Maximum size of files to build during this compaction. - uint64_t MaxOutputFileSize() const { return max_output_file_size_; } - - // Is this a trivial compaction that can be implemented by just - // moving a single input file to the next level (no merging or splitting) - bool IsTrivialMove() const; - - // Add all inputs to this compaction as delete operations to *edit. 
- void AddInputDeletions(VersionEdit* edit); - - // Returns true if the information we have available guarantees that - // the compaction is producing data in "level+1" for which no data exists - // in levels greater than "level+1". - bool IsBaseLevelForKey(const Slice& user_key); - - // Returns true iff we should stop building the current output - // before processing "key". - bool ShouldStopBefore(const InternalKey& key); - - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - - private: - friend class Version; - friend class VersionSet; - - explicit Compaction(int level); - - int level_; - uint64_t max_output_file_size_; - Version* input_version_; - VersionEdit edit_; - - // Each compaction reads inputs from "level_" and "level_+1" - std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs - - // State used to check for number of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) - std::vector<FileMetaData*> grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - int64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files - - // State for implementing IsBaseLevelForKey - - // level_ptrs_ holds indices into input_version_->levels_: our state - // is that we are positioned at one of the file ranges for each - // higher level than the ones involved in this compaction (i.e. for - // all L >= level_ + 2). - size_t level_ptrs_[config::kNumLevels]; -}; - -} - -#endif  // STORAGE_LEVELDB_DB_VERSION_SET_H_ diff --git a/leveldb/db/write_batch.cc b/leveldb/db/write_batch.cc deleted file mode 100644 index d561528..0000000 --- a/leveldb/db/write_batch.cc +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors.
-// -// WriteBatch::rep_ := -// sequence: fixed64 -// count: fixed32 -// data: record[count] -// record := -// kTypeValue varstring varstring | -// kTypeDeletion varstring -// varstring := -// len: varint32 -// data: uint8[len] - -#include "leveldb/write_batch.h" - -#include "leveldb/db.h" -#include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "util/coding.h" - -namespace leveldb { - -WriteBatch::WriteBatch() { - Clear(); -} - -WriteBatch::~WriteBatch() { } - -void WriteBatch::Clear() { - rep_.clear(); - rep_.resize(12); -} - -int WriteBatchInternal::Count(const WriteBatch* b) { - return DecodeFixed32(b->rep_.data() + 8); -} - -void WriteBatchInternal::SetCount(WriteBatch* b, int n) { - EncodeFixed32(&b->rep_[8], n); -} - -SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { - return SequenceNumber(DecodeFixed64(b->rep_.data())); -} - -void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { - EncodeFixed64(&b->rep_[0], seq); -} - -void WriteBatch::Put(const Slice& key, const Slice& value) { - WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeValue)); - PutLengthPrefixedSlice(&rep_, key); - PutLengthPrefixedSlice(&rep_, value); -} - -void WriteBatch::Delete(const Slice& key) { - WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeDeletion)); - PutLengthPrefixedSlice(&rep_, key); -} - -Status WriteBatchInternal::InsertInto(const WriteBatch* b, - MemTable* memtable) { - const int count = WriteBatchInternal::Count(b); - int found = 0; - Iterator it(*b); - for (; !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeDeletion: - memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice()); - break; - case kTypeValue: - memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); - break; - } - found++; - } - if (!it.status().ok()) { - return it.status(); - } else if (found != count) { - return Status::Corruption("wrong count in WriteBatch"); - } - return Status::OK(); -} - -void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { - assert(contents.size() >= 12); - b->rep_.assign(contents.data(), contents.size()); -} - -WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch) - : input_(WriteBatchInternal::Contents(&batch)), - done_(false) { - if (input_.size() < 12) { - done_ = true; - } else { - seq_ = WriteBatchInternal::Sequence(&batch), - input_.remove_prefix(12); - GetNextEntry(); - } -} - -void WriteBatchInternal::Iterator::Next() { - assert(!done_); - seq_++; - GetNextEntry(); -} - -void WriteBatchInternal::Iterator::GetNextEntry() { - if (input_.empty()) { - done_ = true; - return; - } - char tag = input_[0]; - input_.remove_prefix(1); - switch (tag) { - case kTypeValue: - if (GetLengthPrefixedSlice(&input_, &key_) && - GetLengthPrefixedSlice(&input_, &value_)) { - op_ = static_cast(tag); - } else { - status_ = Status::Corruption("bad WriteBatch Put"); - done_ = true; - input_.clear(); - } - break; - case kTypeDeletion: - if (GetLengthPrefixedSlice(&input_, &key_)) { - op_ = kTypeDeletion; - } else { - status_ = Status::Corruption("bad WriteBatch Delete"); - done_ = true; - input_.clear(); - } - break; - default: - status_ = Status::Corruption("unknown WriteBatch tag"); - done_ = true; - input_.clear(); - break; - } -} - -} diff --git a/leveldb/db/write_batch_internal.h b/leveldb/db/write_batch_internal.h deleted file mode 100644 index ab0a823..0000000 
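(The write_batch_internal.h hunk follows.) The rep_ layout documented at the top of write_batch.cc above is a 12-byte header, a fixed64 sequence number then a fixed32 count, followed by the records; Clear() resizing to 12 bytes is exactly that empty header. A sketch that builds the header by hand, assuming a little-endian host so memcpy matches the fixed-width encoding:

#include <cassert>
#include <cstring>
#include <stdint.h>
#include <string>

int main() {
  std::string rep(12, '\0');  // empty batch: header only, zero records
  uint64_t seq = 100;
  uint32_t count = 0;
  std::memcpy(&rep[0], &seq, 8);    // fixed64 sequence
  std::memcpy(&rep[8], &count, 4);  // fixed32 count
  assert(rep.size() == 12);
  return 0;
}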
--- a/leveldb/db/write_batch_internal.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
-#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
-
-#include "leveldb/write_batch.h"
-
-namespace leveldb {
-
-// WriteBatchInternal provides static methods for manipulating a
-// WriteBatch that we don't want in the public WriteBatch interface.
-class WriteBatchInternal {
- public:
-  // Return the number of entries in the batch.
-  static int Count(const WriteBatch* batch);
-
-  // Set the count for the number of entries in the batch.
-  static void SetCount(WriteBatch* batch, int n);
-
-  // Return the sequence number for the start of this batch.
-  static SequenceNumber Sequence(const WriteBatch* batch);
-
-  // Store the specified number as the sequence number for the start of
-  // this batch.
-  static void SetSequence(WriteBatch* batch, SequenceNumber seq);
-
-  static Slice Contents(const WriteBatch* batch) {
-    return Slice(batch->rep_);
-  }
-
-  static size_t ByteSize(const WriteBatch* batch) {
-    return batch->rep_.size();
-  }
-
-  static void SetContents(WriteBatch* batch, const Slice& contents);
-
-  static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
-
-  // Iterate over the contents of a write batch.
-  class Iterator {
-   public:
-    explicit Iterator(const WriteBatch& batch);
-    bool Done() const { return done_; }
-    void Next();
-    ValueType op() const { return op_; }
-    const Slice& key() const { return key_; }
-    const Slice& value() const { return value_; }
-    SequenceNumber sequence_number() const { return seq_; }
-    Status status() const { return status_; }
-
-   private:
-    void GetNextEntry();
-
-    Slice input_;
-    bool done_;
-    ValueType op_;
-    Slice key_;
-    Slice value_;
-    SequenceNumber seq_;
-    Status status_;
-  };
-};
-
-}
-
-
-#endif  // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
diff --git a/leveldb/db/write_batch_test.cc b/leveldb/db/write_batch_test.cc
deleted file mode 100644
index 2bf1134..0000000
--- a/leveldb/db/write_batch_test.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
- -#include "leveldb/db.h" - -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "leveldb/env.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -static std::string PrintContents(WriteBatch* b) { - InternalKeyComparator cmp(BytewiseComparator()); - MemTable mem(cmp); - std::string state; - Status s = WriteBatchInternal::InsertInto(b, &mem); - Iterator* iter = mem.NewIterator(); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ParsedInternalKey ikey; - ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); - switch (ikey.type) { - case kTypeValue: - state.append("Put("); - state.append(ikey.user_key.ToString()); - state.append(", "); - state.append(iter->value().ToString()); - state.append(")"); - break; - case kTypeDeletion: - state.append("Delete("); - state.append(ikey.user_key.ToString()); - state.append(")"); - break; - } - state.append("@"); - state.append(NumberToString(ikey.sequence)); - } - delete iter; - if (!s.ok()) { - state.append("ParseError()"); - } - return state; -} - -class WriteBatchTest { }; - -TEST(WriteBatchTest, Empty) { - WriteBatch batch; - ASSERT_EQ("", PrintContents(&batch)); - ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); -} - -TEST(WriteBatchTest, Multiple) { - WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - batch.Put(Slice("baz"), Slice("boo")); - WriteBatchInternal::SetSequence(&batch, 100); - ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); - ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); - ASSERT_EQ("Put(baz, boo)@102" - "Delete(box)@101" - "Put(foo, bar)@100", - PrintContents(&batch)); -} - -TEST(WriteBatchTest, Corruption) { - WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - WriteBatchInternal::SetSequence(&batch, 200); - Slice contents = WriteBatchInternal::Contents(&batch); - WriteBatchInternal::SetContents(&batch, - Slice(contents.data(),contents.size()-1)); - ASSERT_EQ("Put(foo, bar)@200" - "ParseError()", - PrintContents(&batch)); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/doc/doc.css b/leveldb/doc/doc.css deleted file mode 100644 index 700c564..0000000 --- a/leveldb/doc/doc.css +++ /dev/null @@ -1,89 +0,0 @@ -body { - margin-left: 0.5in; - margin-right: 0.5in; - background: white; - color: black; -} - -h1 { - margin-left: -0.2in; - font-size: 14pt; -} -h2 { - margin-left: -0in; - font-size: 12pt; -} -h3 { - margin-left: -0in; -} -h4 { - margin-left: -0in; -} -hr { - margin-left: -0in; -} - -/* Definition lists: definition term bold */ -dt { - font-weight: bold; -} - -address { - text-align: center; -} -code,samp,var { - color: blue; -} -kbd { - color: #600000; -} -div.note p { - float: right; - width: 3in; - margin-right: 0%; - padding: 1px; - border: 2px solid #6060a0; - background-color: #fffff0; -} - -ul { - margin-top: -0em; - margin-bottom: -0em; -} - -ol { - margin-top: -0em; - margin-bottom: -0em; -} - -UL.nobullets { - list-style-type: none; - list-style-image: none; - margin-left: -1em; -} - -p { - margin: 1em 0 1em 0; - padding: 0 0 0 0; -} - -pre { - line-height: 1.3em; - padding: 0.4em 0 0.8em 0; - margin: 0 0 0 0; - border: 0 0 0 0; - color: blue; -} - -.datatable { - margin-left: auto; - margin-right: auto; - margin-top: 2em; - margin-bottom: 2em; - border: 1px solid; -} - -.datatable td,th { - padding: 0 0.5em 0 0.5em; - text-align: right; -} diff --git a/leveldb/doc/impl.html b/leveldb/doc/impl.html deleted file 
mode 100644 index dd09fea..0000000 --- a/leveldb/doc/impl.html +++ /dev/null @@ -1,217 +0,0 @@ - - - - -Leveldb file layout and compactions - - - - -

Files

-
-The implementation of leveldb is similar in spirit to the
-representation of a single
-Bigtable tablet (section 5.3).
-However, the organization of the files that make up the representation
-is somewhat different and is explained below.
-

-Each database is represented by a set of files stored in a directory.
-There are several different types of files as documented below:
-

-

Log files

-

-A log file (*.log) stores a sequence of recent updates. Each update -is appended to the current log file. When the log file reaches a -pre-determined size (approximately 1MB by default), it is converted -to a sorted table (see below) and a new log file is created for future -updates. -

-A copy of the current log file is kept in an in-memory structure (the -memtable). This copy is consulted on every read so that read -operations reflect all logged updates. -

-

Sorted tables

-

-A sorted table (*.sst) stores a sequence of entries sorted by key. -Each entry is either a value for the key, or a deletion marker for the -key. (Deletion markers are kept around to hide obsolete values -present in older sorted tables). -

-The set of sorted tables is organized into a sequence of levels.  The
-sorted table generated from a log file is placed in a special young
-level (also called level-0).  When the number of young files exceeds a
-certain threshold (currently four), all of the young files are merged
-together with all of the overlapping level-1 files to produce a
-sequence of new level-1 files (we create a new level-1 file for every
-2MB of data).
-

-Files in the young level may contain overlapping keys.  However, files
-in other levels have distinct non-overlapping key ranges.  Consider
-level number L where L >= 1.  When the combined size of files in
-level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
-...), one file in level-L and all of the overlapping files in
-level-(L+1) are merged to form a set of new files for level-(L+1).
-These merges have the effect of gradually migrating new updates from
-the young level to the largest level using only bulk reads and writes
-(i.e., minimizing expensive seeks).
-
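As a concrete illustration of the (10^L) MB rule above, the following throwaway C++ sketch prints the size threshold for the first few levels. MaxBytesForLevel is a hypothetical helper written for this document, not leveldb's implementation:

  #include <cstdio>

  // Level-L may hold roughly 10^L MB before a compaction is triggered (L >= 1).
  static double MaxBytesForLevel(int level) {
    double result = 10.0 * 1048576.0;  // level-1 is allowed 10MB
    while (level > 1) {
      result *= 10.0;
      level--;
    }
    return result;
  }

  int main() {
    for (int level = 1; level <= 4; level++) {
      std::printf("level-%d limit: %.0f MB\n",
                  level, MaxBytesForLevel(level) / 1048576.0);
    }
    return 0;
  }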

Manifest

-

-A MANIFEST file lists the set of sorted tables that make up each -level, the corresponding key ranges, and other important metadata. -A new MANIFEST file (with a new number embedded in the file name) -is created whenever the database is reopened. The MANIFEST file is -formatted as a log, and changes made to the serving state (as files -are added or removed) are appended to this log. -

-

Current

-

-CURRENT is a simple text file that contains the name of the latest -MANIFEST file. -

-

Info logs

-

-Informational messages are printed to files named LOG and LOG.old. -

-

Others

-

-Other files used for miscellaneous purposes may also be present -(LOCK, *.dbtmp). - -

Level 0

-When the log file grows above a certain size (1MB by default): -
    -
  • Write the contents of the current memtable to an sstable -
  • Replace the current memtable by a brand new empty memtable -
  • Switch to a new log file -
  • Delete the old log file and the old memtable -
-Experimental measurements show that generating an sstable from a 1MB -log file takes ~12ms, which seems like an acceptable latency hiccup to -add infrequently to a log write. - -

-The new sstable is added to a special level-0 level. level-0 contains -a set of files (up to 4 by default). However unlike other levels, -these files do not cover disjoint ranges, but may overlap each other. - -

Compactions

- -

-When the size of level L exceeds its limit, we compact it in a -background thread. The compaction picks a file from level L and all -overlapping files from the next level L+1. Note that if a level-L -file overlaps only part of a level-(L+1) file, the entire file at -level-(L+1) is used as an input to the compaction and will be -discarded after the compaction. Aside: because level-0 is special -(files in it may overlap each other), we treat compactions from -level-0 to level-1 specially: a level-0 compaction may pick more than -one level-0 file in case some of these files overlap each other. - -

-A compaction merges the contents of the picked files to produce a
-sequence of level-(L+1) files.  We switch to producing a new
-level-(L+1) file after the current output file has reached the target
-file size (2MB).  We also switch to a new output file when the key
-range of the current output file has grown enough to overlap more than
-ten level-(L+2) files.  This last rule ensures that a later compaction
-of a level-(L+1) file will not pick up too much data from level-(L+2).
-

-The old files are discarded and the new files are added to the serving -state. - -

-Compactions for a particular level rotate through the key space. In -more detail, for each level L, we remember the ending key of the last -compaction at level L. The next compaction for level L will pick the -first file that starts after this key (wrapping around to the -beginning of the key space if there is no such file). - -
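The rotation can be sketched in a few lines of C++ (FileRange and PickCompaction are illustrative stand-ins made up for this document, not leveldb types): given the level's files sorted by smallest key, pick the first file that begins after the remembered key, wrapping to the front of the key space.

  #include <string>
  #include <vector>

  // Hypothetical stand-in for a table file's key range.
  struct FileRange {
    std::string smallest;
    std::string largest;
  };

  // "files" is sorted by smallest key and assumed non-empty; compact_pointer
  // is the ending key of the previous compaction at this level.
  const FileRange* PickCompaction(const std::vector<FileRange>& files,
                                  const std::string& compact_pointer) {
    for (size_t i = 0; i < files.size(); i++) {
      if (files[i].smallest > compact_pointer) return &files[i];
    }
    return &files[0];  // no file starts after the pointer: wrap around
  }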

-Compactions drop overwritten values. They also drop deletion markers -if there are no higher numbered levels that contain a file whose range -overlaps the current key. - -

Timing

- -Level-0 compactions will read up to four 1MB files from level-0, and -at worst all the level-1 files (10MB). I.e., we will read 14MB and -write 14MB. - -

-Other than the special level-0 compactions, we will pick one 2MB file -from level L. In the worst case, this will overlap ~ 12 files from -level L+1 (10 because level-(L+1) is ten times the size of level-L, -and another two at the boundaries since the file ranges at level-L -will usually not be aligned with the file ranges at level-L+1). The -compaction will therefore read 26MB and write 26MB. Assuming a disk -IO rate of 100MB/s (ballpark range for modern drives), the worst -compaction cost will be approximately 0.5 second. - -
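Spelling out the arithmetic behind those figures (all the numbers are the assumptions stated in the text above; this is a back-of-the-envelope check, not measured code):

  #include <cstdio>

  int main() {
    const double file_mb = 2.0;     // target output file size
    const double overlaps = 12.0;   // ~10 by size ratio + 2 boundary files
    const double read_mb = file_mb + overlaps * file_mb;  // 26MB in
    const double write_mb = read_mb;                      // ~26MB back out
    const double disk_mb_s = 100.0; // ballpark drive throughput
    std::printf("%.0fMB read + %.0fMB write => %.2fs\n",
                read_mb, write_mb, (read_mb + write_mb) / disk_mb_s);
    return 0;
  }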

-If we throttle the background writing to something small, say 10% of
-the full 100MB/s speed, a compaction may take up to 5 seconds.  If the
-user is writing at 10MB/s, we might build up lots of level-0 files
-(~50 to hold the 5*10MB).  This may significantly increase the cost of
-reads due to the overhead of merging more files together on every
-read.
-

-Solution 1: To reduce this problem, we might want to increase the log -switching threshold when the number of level-0 files is large. Though -the downside is that the larger this threshold, the larger the delay -that we will add to write latency when a write triggers a log switch. - -

-Solution 2: We might want to decrease write rate artificially when the -number of level-0 files goes up. - -

-Solution 3: We work on reducing the cost of very wide merges. -Perhaps most of the level-0 files will have their blocks sitting -uncompressed in the cache and we will only need to worry about the -O(N) complexity in the merging iterator. - -

Number of files

- -Instead of always making 2MB files, we could make larger files for -larger levels to reduce the total file count, though at the expense of -more bursty compactions. Alternatively, we could shard the set of -files into multiple directories. - -

-An experiment on an ext3 filesystem on Feb 04, 2011 shows
-the following timings to do 100K file opens in directories with
-varying number of files:
-
-    Files in directory   Microseconds to open a file
-                  1000                             9
-                 10000                            10
-                100000                            16
-So maybe even the sharding is not necessary on modern filesystems? - -

Recovery

- -
    -
  • Read CURRENT to find name of the latest committed MANIFEST -
  • Read the named MANIFEST file -
  • Clean up stale files -
  • We could open all sstables here, but it is probably better to be lazy... -
  • Convert log chunk to a new level-0 sstable -
  • Start directing new writes to a new log file with recovered sequence# -
- -

Garbage collection of files

- -DeleteObsoleteFiles() is called at the end of every -compaction and at the end of recovery. It finds the names of all -files in the database. It deletes all log files that are not the -current log file. It deletes all table files that are not referenced -from some level and are not the output of an active compaction. - - - diff --git a/leveldb/doc/index.html b/leveldb/doc/index.html deleted file mode 100644 index c2312b7..0000000 --- a/leveldb/doc/index.html +++ /dev/null @@ -1,498 +0,0 @@ - - - - -Leveldb - - - -

Leveldb

-
Jeff Dean, Sanjay Ghemawat
-

-The leveldb library provides a persistent key value store. Keys and -values are arbitrary byte arrays. The keys are ordered within the key -value store according to a user-specified comparator function. - -

-

Opening A Database

-

-A leveldb database has a name which corresponds to a file system
-directory.  All of the contents of the database are stored in this
-directory.  The following example shows how to open a database,
-creating it if necessary:
-

-

-  #include <cassert>
-  #include "leveldb/include/db.h"
-
-  leveldb::DB* db;
-  leveldb::Options options;
-  options.create_if_missing = true;
-  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
-  assert(status.ok());
-  ...
-
-If you want to raise an error if the database already exists, add -the following line before the leveldb::DB::Open call: -
-  options.error_if_exists = true;
-
-

Status

-

-You may have noticed the leveldb::Status type above. Values of this -type are returned by most functions in leveldb that may encounter an -error. You can check if such a result is ok, and also print an -associated error message: -

-

-   leveldb::Status s = ...;
-   if (!s.ok()) cerr << s.ToString() << endl;
-
-

Closing A Database

-

-When you are done with a database, just delete the database object. -Example: -

-

-  ... open the db as described above ...
-  ... do something with db ...
-  delete db;
-
-

Reads And Writes

-

-The database provides Put, Delete, and Get methods to -modify/query the database. For example, the following code -moves the value stored under key1 to key2. -

-  std::string value;
-  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
-  if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
-  if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
-
- -

Atomic Updates

-

-Note that if the process dies after the Put of key2 but before the -delete of key1, the same value may be left stored under multiple keys. -Such problems can be avoided by using the WriteBatch class to -atomically apply a set of updates: -

-

-  #include "leveldb/include/write_batch.h"
-  ...
-  std::string value;
-  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
-  if (s.ok()) {
-    leveldb::WriteBatch batch;
-    batch.Delete(key1);
-    batch.Put(key2, value);
-    s = db->Write(leveldb::WriteOptions(), &batch);
-  }
-
-The WriteBatch holds a sequence of edits to be made to the database, -and these edits within the batch are applied in order. Note that we -called Delete before Put so that if key1 is identical to key2, -we do not end up erroneously dropping the value entirely. -

-Apart from its atomicity benefits, WriteBatch may also be used to -speed up bulk updates by placing lots of individual mutations into the -same batch. - -

Synchronous Writes

-By default, each write to leveldb is asynchronous: it -returns after pushing the write from the process into the operating -system. The transfer from operating system memory to the underlying -persistent storage happens asynchronously. The sync flag -can be turned on for a particular write to make the write operation -not return until the data being written has been pushed all the way to -persistent storage. (On Posix systems, this is implemented by calling -either fsync(...) or fdatasync(...) or -msync(..., MS_SYNC) before the write operation returns.) -
-  leveldb::WriteOptions write_options;
-  write_options.sync = true;
-  db->Put(write_options, ...);
-
-Asynchronous writes are often more than a thousand times as fast as -synchronous writes. The downside of asynchronous writes is that a -crash of the machine may cause the last few updates to be lost. Note -that a crash of just the writing process (i.e., not a reboot) will not -cause any loss since even when sync is false, an update -is pushed from the process memory into the operating system before it -is considered done. - -

-Asynchronous writes can often be used safely.  For example, when
-loading a large amount of data into the database you can handle lost
-updates by restarting the bulk load after a crash.  A hybrid scheme is
-also possible where every Nth write is synchronous, and in the event
-of a crash, the bulk load is restarted just after the point where the
-last synchronous write of the previous run finished.  (The synchronous
-write can update a marker that describes where to restart on a crash.)
-
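A minimal sketch of that hybrid scheme follows. The key format and the "!progress" marker key are assumptions made up for this example, and error handling is elided:

  #include <cstdio>
  #include "leveldb/db.h"

  // Make every kSyncEvery-th write synchronous and record it as the
  // restart point for a future crash recovery.
  void BulkLoad(leveldb::DB* db, int start, int total) {
    leveldb::WriteOptions async_opts;  // sync is false by default
    leveldb::WriteOptions sync_opts;
    sync_opts.sync = true;
    const int kSyncEvery = 1000;
    for (int i = start; i < total; i++) {
      char key[32];
      char val[32];
      std::snprintf(key, sizeof(key), "key%010d", i);
      std::snprintf(val, sizeof(val), "value%d", i);
      bool checkpoint = ((i + 1) % kSyncEvery == 0);
      leveldb::Status s = db->Put(checkpoint ? sync_opts : async_opts, key, val);
      if (!s.ok()) return;
      if (checkpoint) {
        db->Put(sync_opts, "!progress", key);  // where to restart after a crash
      }
    }
  }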

-WriteBatch provides an alternative to asynchronous writes. -Multiple updates may be placed in the same WriteBatch and -applied together using a synchronous write (i.e., -write_options.sync is set to true). The extra cost of -the synchronous write will be amortized across all of the writes in -the batch. - -

-

Concurrency

-

-A database may only be opened by one process at a time. The leveldb -implementation acquires a lock from the operating system to prevent -misuse. Within a single process, the same leveldb::DB object may -be safely used by multiple concurrent threads. -

-

Iteration

-

-The following example demonstrates how to print all key,value pairs -in a database. -

-

-  leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
-  for (it->SeekToFirst(); it->Valid(); it->Next()) {
-    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
-  }
-  assert(it->status().ok());  // Check for any errors found during the scan
-  delete it;
-
-The following variation shows how to process just the keys in the -range [start,limit): -

-

-  for (it->Seek(start);
-       it->Valid() && it->key().ToString() < limit;
-       it->Next()) {
-    ...
-  }
-
-You can also process entries in reverse order. (Caveat: reverse -iteration may be somewhat slower than forward iteration.) -

-

-  for (it->SeekToLast(); it->Valid(); it->Prev()) {
-    ...
-  }
-
-

Snapshots

-

-Snapshots provide consistent read-only views over the entire state of -the key-value store. ReadOptions::snapshot may be non-NULL to indicate -that a read should operate on a particular version of the DB state. -If ReadOptions::snapshot is NULL, the read will operate on an -implicit snapshot of the current state. -

-Snapshots typically are created by the DB::GetSnapshot() method: -

-

-  leveldb::ReadOptions options;
-  options.snapshot = db->GetSnapshot();
-  ... apply some updates to db ...
-  leveldb::Iterator* iter = db->NewIterator(options);
-  ... read using iter to view the state when the snapshot was created ...
-  delete iter;
-  db->ReleaseSnapshot(options.snapshot);
-
-Note that when a snapshot is no longer needed, it should be released -using the DB::ReleaseSnapshot interface. This allows the -implementation to get rid of state that was being maintained just to -support reading as of that snapshot. -

-A Write operation can also return a snapshot that -represents the state of the database just after applying a particular -set of updates: -

-

-  leveldb::Snapshot* snapshot;
-  leveldb::WriteOptions write_options;
-  write_options.post_write_snapshot = &snapshot;
-  leveldb::Status status = db->Write(write_options, ...);
-  ... perform other mutations to db ...
-
-  leveldb::ReadOptions read_options;
-  read_options.snapshot = snapshot;
-  leveldb::Iterator* iter = db->NewIterator(read_options);
-  ... read as of the state just after the Write call returned ...
-  delete iter;
-
-  db->ReleaseSnapshot(snapshot);
-
-

Slice

-

-The return values of the it->key() and it->value() calls above
-are instances of the leveldb::Slice type.  Slice is a simple
-structure that contains a length and a pointer to an external byte
-array.  Returning a Slice is a cheaper alternative to returning a
-std::string since we do not need to copy potentially large keys and
-values.  In addition, leveldb methods do not return null-terminated
-C-style strings since leveldb keys and values are allowed to
-contain '\0' bytes.
-

-C++ strings and null-terminated C-style strings can be easily converted -to a Slice: -

-

-   leveldb::Slice s1 = "hello";
-
-   std::string str("world");
-   leveldb::Slice s2 = str;
-
-A Slice can be easily converted back to a C++ string: -
-   std::string str = s1.ToString();
-   assert(str == std::string("hello"));
-
-Be careful when using Slices since it is up to the caller to ensure that -the external byte array into which the Slice points remains live while -the Slice is in use. For example, the following is buggy: -

-

-   leveldb::Slice slice;
-   if (...) {
-     std::string str = ...;
-     slice = str;
-   }
-   Use(slice);
-
-When control leaves the if block, str will be destroyed and the
-backing storage for slice will disappear.
-

-

Comparators

-

-The preceding examples used the default ordering function for keys,
-which orders bytes lexicographically.  You can, however, supply a custom
-comparator when opening a database.  For example, suppose each
-database key consists of two numbers and we want to sort by the first
-number, breaking ties by the second number.  First, define a proper
-subclass of leveldb::Comparator that expresses these rules:
-

-

-  class TwoPartComparator : public leveldb::Comparator {
-   public:
-    // Three-way comparison function:
-    //   if a < b: negative result
-    //   if a > b: positive result
-    //   else: zero result
-    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
-      int a1, a2, b1, b2;
-      ParseKey(a, &a1, &a2);
-      ParseKey(b, &b1, &b2);
-      if (a1 < b1) return -1;
-      if (a1 > b1) return +1;
-      if (a2 < b2) return -1;
-      if (a2 > b2) return +1;
-      return 0;
-    }
-
-    // Ignore the following methods for now:
-    const char* Name() const { return "TwoPartComparator"; }
-    void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
-    void FindShortSuccessor(std::string*) const { }
-  };
-
-Now create a database using this custom comparator: -

-

-  TwoPartComparator cmp;
-  leveldb::DB* db;
-  leveldb::Options options;
-  options.create_if_missing = true;
-  options.comparator = &cmp;
-  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
-  ...
-
-

Backwards compatibility

-

-The result of the comparator's Name method is attached to the -database when it is created, and is checked on every subsequent -database open. If the name changes, the leveldb::DB::Open call will -fail. Therefore, change the name if and only if the new key format -and comparison function are incompatible with existing databases, and -it is ok to discard the contents of all existing databases. -

-You can, however, still gradually evolve your key format over time with
-a little bit of pre-planning.  For example, you could store a version
-number at the end of each key (one byte should suffice for most uses).
-When you wish to switch to a new key format (e.g., adding an optional
-third part to the keys processed by TwoPartComparator),
-(a) keep the same comparator name, (b) increment the version number
-for new keys, and (c) change the comparator function so it uses the
-version numbers found in the keys to decide how to interpret them.
-
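A comparator following that recipe might look like the sketch below. The version-byte layout and the Parse helpers are hypothetical; only the shape matters:

  class TwoPartComparator : public leveldb::Comparator {
   public:
    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
      // Last byte of every key is its format version: 1 == two numbers,
      // 2 == three numbers (ParseV1/ParseV2 are hypothetical helpers).
      int version_a = a[a.size() - 1];
      int version_b = b[b.size() - 1];
      ... parse each key according to its own version, then compare ...
    }

    // Keep the old name so existing databases still open.
    const char* Name() const { return "TwoPartComparator"; }
    ...
  };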

-

Performance

-

-Performance can be tuned by changing the default values of the -types defined in leveldb/include/options.h. - -

-

Block size

-

-leveldb groups adjacent keys together into the same block and such a -block is the unit of transfer to and from persistent storage. The -default block size is approximately 4096 uncompressed bytes. -Applications that mostly do bulk scans over the contents of the -database may wish to increase this size. Applications that do a lot -of point reads of small values may wish to switch to a smaller block -size if performance measurements indicate an improvement. There isn't -much benefit in using blocks smaller than one kilobyte, or larger than -a few megabytes. Also note that compression will be more effective -with larger block sizes. -
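For example, a scan-heavy application might plausibly open its database with a larger block size via the block_size option (the 64KB figure here is arbitrary; measure before committing to it):

  leveldb::Options options;
  options.create_if_missing = true;
  options.block_size = 64 * 1024;   // default is ~4KB
  leveldb::DB* db;
  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);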

-

Compression

-

-Each block is individually compressed before being written to -persistent storage. Compression is on by default since the default -compression method is very fast, and is automatically disabled for -uncompressible data. In rare cases, applications may want to disable -compression entirely, but should only do so if benchmarks show a -performance improvement: -

-

-  leveldb::Options options;
-  options.compression = leveldb::kNoCompression;
-  ... leveldb::DB::Open(options, name, ...) ....
-
-

Cache

-

-The contents of the database are stored in a set of files in the -filesystem and each file stores a sequence of compressed blocks. If -options.cache is non-NULL, it is used to cache frequently used -uncompressed block contents. -

-

-  #include "leveldb/include/cache.h"
-
-  leveldb::Options options;
-  options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
-  leveldb::DB* db;
-  leveldb::DB::Open(options, name, &db);
-  ... use the db ...
-  delete db;
-  delete options.cache;
-
-Note that the cache holds uncompressed data, and therefore it should -be sized according to application level data sizes, without any -reduction from compression. (Caching of compressed blocks is left to -the operating system buffer cache, or any custom Env -implementation provided by the client.) -

-When performing a bulk read, the application may wish to disable -caching so that the data processed by the bulk read does not end up -displacing most of the cached contents. A per-iterator option can be -used to achieve this: -

-

-  leveldb::ReadOptions options;
-  options.fill_cache = false;
-  leveldb::Iterator* it = db->NewIterator(options);
-  for (it->SeekToFirst(); it->Valid(); it->Next()) {
-    ...
-  }
-
-

Key Layout

-

-Note that the unit of disk transfer and caching is a block. Adjacent -keys (according to the database sort order) will usually be placed in -the same block. Therefore the application can improve its performance -by placing keys that are accessed together near each other and placing -infrequently used keys in a separate region of the key space. -

-For example, suppose we are implementing a simple file system on top -of leveldb. The types of entries we might wish to store are: -

-

-   filename -> permission-bits, length, list of file_block_ids
-   file_block_id -> data
-
-We might want to prefix filename keys with one letter (say '/') and the -file_block_id keys with a different letter (say '0') so that scans -over just the metadata do not force us to fetch and cache bulky file -contents. -
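A sketch of such a key scheme (the helper functions are made up for illustration; any unambiguous prefixing works):

  #include <cstdio>
  #include <stdint.h>
  #include <string>

  // All metadata keys share the '/' prefix, so they sort together and a
  // metadata-only scan never touches file contents.
  std::string MetadataKey(const std::string& filename) {
    return "/" + filename;
  }

  // File contents live under the '0' prefix, after all metadata.
  std::string BlockKey(uint64_t file_block_id) {
    char buf[32];  // '0' prefix + 16 hex digits + terminator
    std::snprintf(buf, sizeof(buf), "0%016llx",
                  (unsigned long long)file_block_id);
    return std::string(buf);
  }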

-

Checksums

-

-leveldb associates checksums with all data it stores in the file system. -There are two separate controls provided over how aggressively these -checksums are verified: -

-

    -
  • ReadOptions::verify_checksums may be set to true to force - checksum verification of all data that is read from the file system on - behalf of a particular read. By default, no such verification is - done. -

    -

  • Options::paranoid_checks may be set to true before opening a - database to make the database implementation raise an error as soon as - it detects an internal corruption. Depending on which portion of the - database has been corrupted, the error may be raised when the database - is opened, or later by another database operation. By default, - paranoid checking is off so that the database can be used even if - parts of its persistent storage have been corrupted. -

    - If a database is corrupted (perhaps it cannot be opened when - paranoid checking is turned on), the leveldb::RepairDB function - may be used to recover as much of the data as possible -

    -

-
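Both knobs in code (the values shown are the non-default, more paranoid settings):

  leveldb::Options options;
  options.paranoid_checks = true;        // surface corruption as early as possible
  leveldb::DB* db;
  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);

  leveldb::ReadOptions read_options;
  read_options.verify_checksums = true;  // verify everything this read touches
  std::string value;
  status = db->Get(read_options, "some-key", &value);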

Approximate Sizes

-

-The GetApproximateSizes method can be used to get the approximate
-number of bytes of file system space used by one or more key ranges.
-

-

-   leveldb::Range ranges[2];
-   ranges[0] = leveldb::Range("a", "c");
-   ranges[1] = leveldb::Range("x", "z");
-   uint64_t sizes[2];
-   leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
-
-The preceding call will set sizes[0] to the approximate number of -bytes of file system space used by the key range [a..c) and -sizes[1] to the approximate number of bytes used by the key range -[x..z). -

-

Environment

-

-All file operations (and other operating system calls) issued by the -leveldb implementation are routed through a leveldb::Env object. -Sophisticated clients may wish to provide their own Env -implementation to get better control. For example, an application may -introduce artificial delays in the file IO paths to limit the impact -of leveldb on other activities in the system. -

-

-  class SlowEnv : public leveldb::Env {
-    .. implementation of the Env interface ...
-  };
-
-  SlowEnv env;
-  leveldb::Options options;
-  options.env = &env;
-  Status s = leveldb::DB::Open(options, ...);
-
-

Porting

-

-leveldb may be ported to a new platform by providing platform -specific implementations of the types/methods/functions exported by -leveldb/port/port.h. See leveldb/port/port_example.h for more -details. -

-In addition, the new platform may need a new default leveldb::Env
-implementation.  See leveldb/util/env_posix.cc for an example.
-

Other Information

- -

-Details about the leveldb implementation may be found in -the following documents: -

-  Implementation notes: impl.html
-  Format of an immutable Table file: table_format.txt
-  Format of a log file: log_format.txt
diff --git a/leveldb/doc/log_format.txt b/leveldb/doc/log_format.txt
deleted file mode 100644
index 3a0414b..0000000
--- a/leveldb/doc/log_format.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-The log file contents are a sequence of 32KB blocks.  The only
-exception is that the tail of the file may contain a partial block.
-
-Each block consists of a sequence of records:
-   block := record* trailer?
-   record :=
-      checksum: uint32    // crc32c of type and data[]
-      length: uint16
-      type: uint8         // One of FULL, FIRST, MIDDLE, LAST
-      data: uint8[length]
-
-A record never starts within the last six bytes of a block (since it
-won't fit).  Any leftover bytes here form the trailer, which must
-consist entirely of zero bytes and must be skipped by readers.
-
-Aside: if exactly seven bytes are left in the current block, and a new
-non-zero length record is added, the writer must emit a FIRST record
-(which contains zero bytes of user data) to fill up the trailing seven
-bytes of the block and then emit all of the user data in subsequent
-blocks.
-
-More types may be added in the future.  Some readers may skip record
-types they do not understand, others may report that some data was
-skipped.
-
-FULL == 1
-FIRST == 2
-MIDDLE == 3
-LAST == 4
-
-The FULL record contains the contents of an entire user record.
-
-FIRST, MIDDLE, LAST are types used for user records that have been
-split into multiple fragments (typically because of block boundaries).
-FIRST is the type of the first fragment of a user record, LAST is the
-type of the last fragment of a user record, and MIDDLE is the type of
-all interior fragments of a user record.
-
-Example: consider a sequence of user records:
-   A: length 1000
-   B: length 97270
-   C: length 8000
-A will be stored as a FULL record in the first block.
-
-B will be split into three fragments: first fragment occupies the rest
-of the first block, second fragment occupies the entirety of the
-second block, and the third fragment occupies a prefix of the third
-block.  This will leave six bytes free in the third block, which will
-be left empty as the trailer.
-
-C will be stored as a FULL record in the fourth block.
-
-===================
-
-Some benefits over the recordio format:
-
-(1) We do not need any heuristics for resyncing - just go to next
-block boundary and scan.  If there is a corruption, skip to the next
-block.  As a side-benefit, we do not get confused when part of the
-contents of one log file are embedded as a record inside another log
-file.
-
-(2) Splitting at approximate boundaries (e.g., for mapreduce) is
-simple: find the next block boundary and skip records until we
-hit a FULL or FIRST record.
-
-(3) We do not need extra buffering for large records.
-
-Some downsides compared to recordio format:
-
-(1) No packing of tiny records.  This could be fixed by adding a new
-record type, so it is a shortcoming of the current implementation,
-not necessarily the format.
-
-(2) No compression.  Again, this could be fixed by adding new record types.
diff --git a/leveldb/doc/table_format.txt b/leveldb/doc/table_format.txt
deleted file mode 100644
index ad5aa4b..0000000
--- a/leveldb/doc/table_format.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-File format
-===========
-
-  <beginning_of_file>
-  [data block 1]
-  [data block 2]
-  ...
-  [data block N]
-  [meta block 1]
-  ...
-  [meta block K]
-  [metaindex block]
-  [index block]
-  [Footer]        (fixed size; starts at file_size - sizeof(Footer))
-  <end_of_file>
-
-The file contains internal pointers.  Each such pointer is called
-a BlockHandle and contains the following information:
-  offset:   varint64
-  size:     varint64
-
-(1) The sequence of key/value pairs in the file is stored in sorted
-order and partitioned into a sequence of data blocks.  These blocks
-come one after another at the beginning of the file.  Each data block
-is formatted according to the code in block_builder.cc, and then
-optionally compressed.
-
-(2) After the data blocks we store a bunch of meta blocks.  The
-supported meta block types are described below.  More meta block types
-may be added in the future.  Each meta block is again formatted using
-block_builder.cc and then optionally compressed.
-
-(3) A "metaindex" block.  It contains one entry for every other meta
-block where the key is the name of the meta block and the value is a
-BlockHandle pointing to that meta block.
-
-(4) An "index" block.  This block contains one entry per data block,
-where the key is a string >= last key in that data block and before
-the first key in the successive data block.  The value is the
-BlockHandle for the data block.
-
-(5) At the very end of the file is a fixed length footer that contains
-the BlockHandle of the metaindex and index blocks as well as a magic number.
-    metaindex_handle: char[p];      // Block handle for metaindex
-    index_handle:     char[q];      // Block handle for index
-    padding:          char[40-p-q]; // 0 bytes to make fixed length
-                                    // (40==2*BlockHandle::kMaxEncodedLength)
-    magic:            fixed64;      // == 0xdb4775248b80fb57
-
-"stats" Meta Block
-------------------
-
-This meta block contains a bunch of stats.  The key is the name
-of the statistic.  The value contains the statistic.
-TODO(postrelease): record following stats.
-  data size
-  index size
-  key size (uncompressed)
-  value size (uncompressed)
-  number of entries
-  number of data blocks
diff --git a/leveldb/include/leveldb/cache.h b/leveldb/include/leveldb/cache.h
deleted file mode 100644
index 79196d1..0000000
--- a/leveldb/include/leveldb/cache.h
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// A Cache is an interface that maps keys to values.  It has internal
-// synchronization and may be safely accessed concurrently from
-// multiple threads.  It may automatically evict entries to make room
-// for new entries.  Values have a specified charge against the cache
-// capacity.  For example, a cache where the values are variable
-// length strings may use the length of the string as the charge for
-// the string.
-//
-// A builtin cache implementation with a least-recently-used eviction
-// policy is provided.  Clients may use their own implementations if
-// they want something more sophisticated (like scan-resistance, a
-// custom eviction policy, variable cache sizing, etc.)
-
-#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_
-#define STORAGE_LEVELDB_INCLUDE_CACHE_H_
-
-#include <stdint.h>
-#include "leveldb/slice.h"
-
-namespace leveldb {
-
-class Cache;
-
-// Create a new cache with a fixed size capacity.  This implementation
-// of Cache uses a least-recently-used eviction policy.
-extern Cache* NewLRUCache(size_t capacity);
-
-class Cache {
- public:
-  Cache() { }
-
-  // Destroys all existing entries by calling the "deleter"
-  // function that was passed to the constructor.
-  virtual ~Cache();
-
-  // Opaque handle to an entry stored in the cache.
-  struct Handle { };
-
-  // Insert a mapping from key->value into the cache and assign it
-  // the specified charge against the total cache capacity.
-  //
-  // Returns a handle that corresponds to the mapping.  The caller
-  // must call this->Release(handle) when the returned mapping is no
-  // longer needed.
-  //
-  // When the inserted entry is no longer needed, the key and
-  // value will be passed to "deleter".
-  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
-                         void (*deleter)(const Slice& key, void* value)) = 0;
-
-  // If the cache has no mapping for "key", returns NULL.
-  //
-  // Else return a handle that corresponds to the mapping.  The caller
-  // must call this->Release(handle) when the returned mapping is no
-  // longer needed.
-  virtual Handle* Lookup(const Slice& key) = 0;
-
-  // Release a mapping returned by a previous Lookup().
-  // REQUIRES: handle must not have been released yet.
-  // REQUIRES: handle must have been returned by a method on *this.
-  virtual void Release(Handle* handle) = 0;
-
-  // Return the value encapsulated in a handle returned by a
-  // successful Lookup().
-  // REQUIRES: handle must not have been released yet.
-  // REQUIRES: handle must have been returned by a method on *this.
-  virtual void* Value(Handle* handle) = 0;
-
-  // If the cache contains entry for key, erase it.  Note that the
-  // underlying entry will be kept around until all existing handles
-  // to it have been released.
-  virtual void Erase(const Slice& key) = 0;
-
-  // Return a new numeric id.  May be used by multiple clients who are
-  // sharing the same cache to partition the key space.  Typically the
-  // client will allocate a new id at startup and prepend the id to
-  // its cache keys.
-  virtual uint64_t NewId() = 0;
-
- private:
-  void LRU_Remove(Handle* e);
-  void LRU_Append(Handle* e);
-  void Unref(Handle* e);
-
-  struct Rep;
-  Rep* rep_;
-
-  // No copying allowed
-  Cache(const Cache&);
-  void operator=(const Cache&);
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_CACHE_H_
diff --git a/leveldb/include/leveldb/comparator.h b/leveldb/include/leveldb/comparator.h
deleted file mode 100644
index 4e00e4d..0000000
--- a/leveldb/include/leveldb/comparator.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
-#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
-
-#include <string>
-
-namespace leveldb {
-
-class Slice;
-
-// A Comparator object provides a total order across slices that are
-// used as keys in an sstable or a database.
-class Comparator {
- public:
-  virtual ~Comparator();
-
-  // Three-way comparison.  Returns value:
-  //   < 0 iff "a" < "b",
-  //   == 0 iff "a" == "b",
-  //   > 0 iff "a" > "b"
-  virtual int Compare(const Slice& a, const Slice& b) const = 0;
-
-  // The name of the comparator.  Used to check for comparator
-  // mismatches (i.e., a DB created with one comparator is
-  // accessed using a different comparator).
-  //
-  // The client of this package should switch to a new name whenever
-  // the comparator implementation changes in a way that will cause
-  // the relative ordering of any two keys to change.
-  //
-  // Names starting with "leveldb." are reserved and should not be used
-  // by any clients of this package.
-  virtual const char* Name() const = 0;
-
-  // Advanced functions: these are used to reduce the space requirements
-  // for internal data structures like index blocks.
-
-  // If *start < limit, changes *start to a short string in [start,limit).
-  // Simple comparator implementations may return with *start unchanged,
-  // i.e., an implementation of this method that does nothing is correct.
-  virtual void FindShortestSeparator(
-      std::string* start,
-      const Slice& limit) const = 0;
-
-  // Changes *key to a short string >= *key.
-  // Simple comparator implementations may return with *key unchanged,
-  // i.e., an implementation of this method that does nothing is correct.
-  virtual void FindShortSuccessor(std::string* key) const = 0;
-};
-
-// Return a builtin comparator that uses lexicographic byte-wise
-// ordering.  The result remains the property of this module and
-// must not be deleted.
-extern const Comparator* BytewiseComparator();
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
diff --git a/leveldb/include/leveldb/db.h b/leveldb/include/leveldb/db.h
deleted file mode 100644
index f18ded3..0000000
--- a/leveldb/include/leveldb/db.h
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_
-#define STORAGE_LEVELDB_INCLUDE_DB_H_
-
-#include <stdint.h>
-#include <stdio.h>
-#include "leveldb/iterator.h"
-#include "leveldb/options.h"
-
-namespace leveldb {
-
-static const int kMajorVersion = 1;
-static const int kMinorVersion = 1;
-
-struct Options;
-struct ReadOptions;
-struct WriteOptions;
-
-class Snapshot;
-class WriteBatch;
-
-// Some internal types.  Clients should ignore.
-class WriteBatchInternal;
-
-struct Range {
-  Slice start;
-  Slice limit;
-
-  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
-};
-
-// A DB is a persistent ordered map from keys to values.
-class DB {
- public:
-  // Open the database with the specified "name".
-  // Stores a pointer to a heap-allocated database in *dbptr and returns
-  // OK on success.
-  // Stores NULL in *dbptr and returns a non-OK status on error.
-  // Caller should delete *dbptr when it is no longer needed.
-  static Status Open(const Options& options,
-                     const std::string& name,
-                     DB** dbptr);
-
-  DB() { }
-  virtual ~DB();
-
-  // Set the database entry for "key" to "value".  Returns OK on success,
-  // and a non-OK status on error.
-  // Note: consider setting options.sync = true.
-  virtual Status Put(const WriteOptions& options,
-                     const Slice& key,
-                     const Slice& value) = 0;
-
-  // Remove the database entry (if any) for "key".  Returns OK on
-  // success, and a non-OK status on error.  It is not an error if "key"
-  // did not exist in the database.
-  // Note: consider setting options.sync = true.
-  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
-
-  // Apply the specified updates to the database.
-  // Returns OK on success, non-OK on failure.
-  // Note: consider setting options.sync = true.
-  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
-
-  // If the database contains an entry for "key" store the
-  // corresponding value in *value and return OK.
-  //
-  // If there is no entry for "key" leave *value unchanged and return
-  // a status for which Status::IsNotFound() returns true.
-  //
-  // May return some other Status on an error.
-  virtual Status Get(const ReadOptions& options,
-                     const Slice& key, std::string* value) = 0;
-
-  // Return a heap-allocated iterator over the contents of the database.
-  // The result of NewIterator() is initially invalid (caller must
-  // call one of the Seek methods on the iterator before using it).
-  //
-  // Caller should delete the iterator when it is no longer needed.
-  // The returned iterator should be deleted before this db is deleted.
-  virtual Iterator* NewIterator(const ReadOptions& options) = 0;
-
-  // Return a handle to the current DB state.  Iterators created with
-  // this handle will all observe a stable snapshot of the current DB
-  // state.  The caller must call ReleaseSnapshot(result) when the
-  // snapshot is no longer needed.
-  virtual const Snapshot* GetSnapshot() = 0;
-
-  // Release a previously acquired snapshot.  The caller must not
-  // use "snapshot" after this call.
-  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
-
-  // DB implementations can export properties about their state
-  // via this method.  If "property" is a valid property understood by this
-  // DB implementation, fills "*value" with its current value and returns
-  // true.  Otherwise returns false.
-  //
-  //
-  // Valid property names include:
-  //
-  //  "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
-  //     where <N> is an ASCII representation of a level number (e.g. "0").
-  //  "leveldb.stats" - returns a multi-line string that describes statistics
-  //     about the internal operation of the DB.
-  virtual bool GetProperty(const Slice& property, std::string* value) = 0;
-
-  // For each i in [0,n-1], store in "sizes[i]", the approximate
-  // file system space used by keys in "[range[i].start .. range[i].limit)".
-  //
-  // Note that the returned sizes measure file system space usage, so
-  // if the user data compresses by a factor of ten, the returned
-  // sizes will be one-tenth the size of the corresponding user data size.
-  //
-  // The results may not include the sizes of recently written data.
-  virtual void GetApproximateSizes(const Range* range, int n,
-                                   uint64_t* sizes) = 0;
-
-  // Possible extensions:
-  // (1) Add a method to compact a range of keys
-
- private:
-  // No copying allowed
-  DB(const DB&);
-  void operator=(const DB&);
-};
-
-// Destroy the contents of the specified database.
-// Be very careful using this method.
-Status DestroyDB(const std::string& name, const Options& options);
-
-// If a DB cannot be opened, you may attempt to call this method to
-// resurrect as much of the contents of the database as possible.
-// Some data may be lost, so be careful when calling this function
-// on a database that contains important information.
-Status RepairDB(const std::string& dbname, const Options& options);
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_DB_H_
diff --git a/leveldb/include/leveldb/env.h b/leveldb/include/leveldb/env.h
deleted file mode 100644
index 4b6e712..0000000
--- a/leveldb/include/leveldb/env.h
+++ /dev/null
@@ -1,290 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// An Env is an interface used by the leveldb implementation to access
-// operating system functionality like the filesystem etc.  Callers
-// may wish to provide a custom Env object when opening a database to
-// get fine-grained control; e.g., to rate limit file system operations.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
-#define STORAGE_LEVELDB_INCLUDE_ENV_H_
-
-#include <cstdarg>
-#include <string>
-#include <vector>
-#include <stdint.h>
-#include "leveldb/status.h"
-
-namespace leveldb {
-
-class FileLock;
-class RandomAccessFile;
-class SequentialFile;
-class Slice;
-class WritableFile;
-
-class Env {
- public:
-  Env() { }
-  virtual ~Env();
-
-  // Return a default environment suitable for the current operating
-  // system.  Sophisticated users may wish to provide their own Env
-  // implementation instead of relying on this default environment.
-  //
-  // The result of Default() belongs to leveldb and must never be deleted.
-  static Env* Default();
-
-  // Create a brand new sequentially-readable file with the specified name.
-  // On success, stores a pointer to the new file in *result and returns OK.
-  // On failure stores NULL in *result and returns non-OK.  If the file does
-  // not exist, returns a non-OK status.
-  //
-  // The returned file will only be accessed by one thread at a time.
-  virtual Status NewSequentialFile(const std::string& fname,
-                                   SequentialFile** result) = 0;
-
-  // Create a brand new random access read-only file with the
-  // specified name.  On success, stores a pointer to the new file in
-  // *result and returns OK.  On failure stores NULL in *result and
-  // returns non-OK.  If the file does not exist, returns a non-OK
-  // status.
-  //
-  // The returned file may be concurrently accessed by multiple threads.
-  virtual Status NewRandomAccessFile(const std::string& fname,
-                                     RandomAccessFile** result) = 0;
-
-  // Create an object that writes to a new file with the specified
-  // name.  Deletes any existing file with the same name and creates a
-  // new file.  On success, stores a pointer to the new file in
-  // *result and returns OK.  On failure stores NULL in *result and
-  // returns non-OK.
-  //
-  // The returned file will only be accessed by one thread at a time.
-  virtual Status NewWritableFile(const std::string& fname,
-                                 WritableFile** result) = 0;
-
-  // Returns true iff the named file exists.
-  virtual bool FileExists(const std::string& fname) = 0;
-
-  // Store in *result the names of the children of the specified directory.
-  // The names are relative to "dir".
-  // Original contents of *result are dropped.
-  virtual Status GetChildren(const std::string& dir,
-                             std::vector<std::string>* result) = 0;
-
-  // Delete the named file.
-  virtual Status DeleteFile(const std::string& fname) = 0;
-
-  // Create the specified directory.
-  virtual Status CreateDir(const std::string& dirname) = 0;
-
-  // Delete the specified directory.
-  virtual Status DeleteDir(const std::string& dirname) = 0;
-
-  // Store the size of fname in *file_size.
-  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
-
-  // Rename file src to target.
-  virtual Status RenameFile(const std::string& src,
-                            const std::string& target) = 0;
-
-  // Lock the specified file.  Used to prevent concurrent access to
-  // the same db by multiple processes.  On failure, stores NULL in
-  // *lock and returns non-OK.
-  //
-  // On success, stores a pointer to the object that represents the
-  // acquired lock in *lock and returns OK.  The caller should call
-  // UnlockFile(*lock) to release the lock.  If the process exits,
-  // the lock will be automatically released.
-  //
-  // If somebody else already holds the lock, finishes immediately
-  // with a failure.  I.e., this call does not wait for existing locks
-  // to go away.
-  //
-  // May create the named file if it does not already exist.
-  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
-
-  // Release the lock acquired by a previous successful call to LockFile.
-  // REQUIRES: lock was returned by a successful LockFile() call
-  // REQUIRES: lock has not already been unlocked.
-  virtual Status UnlockFile(FileLock* lock) = 0;
-
-  // Arrange to run "(*function)(arg)" once in a background thread.
-  //
-  // "function" may run in an unspecified thread.  Multiple functions
-  // added to the same Env may run concurrently in different threads.
-  // I.e., the caller may not assume that background work items are
-  // serialized.
-  virtual void Schedule(
-      void (*function)(void* arg),
-      void* arg) = 0;
-
-  // Start a new thread, invoking "function(arg)" within the new thread.
-  // When "function(arg)" returns, the thread will be destroyed.
-  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
-
-  // *path is set to a temporary directory that can be used for testing. It may
-  // or may not have just been created. The directory may or may not differ
-  // between runs of the same process, but subsequent calls will return the
-  // same directory.
-  virtual Status GetTestDirectory(std::string* path) = 0;
-
-  // Write an entry to the log file with the specified format.
-  virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0;
-
-  // Returns the number of micro-seconds since some fixed point in time. Only
-  // useful for computing deltas of time.
-  virtual uint64_t NowMicros() = 0;
-
-  // Sleep/delay the thread for the prescribed number of micro-seconds.
-  virtual void SleepForMicroseconds(int micros) = 0;
-
- private:
-  // No copying allowed
-  Env(const Env&);
-  void operator=(const Env&);
-};
-
-// A file abstraction for reading sequentially through a file
-class SequentialFile {
- public:
-  SequentialFile() { }
-  virtual ~SequentialFile();
-
-  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
-  // written by this routine.  Sets "*result" to the data that was
-  // read (including if fewer than "n" bytes were successfully read).
-  // If an error was encountered, returns a non-OK status.
-  //
-  // REQUIRES: External synchronization
-  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
-};
-
-// A file abstraction for randomly reading the contents of a file.
-class RandomAccessFile {
- public:
-  RandomAccessFile() { }
-  virtual ~RandomAccessFile();
-
-  // Read up to "n" bytes from the file starting at "offset".
-  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
-  // to the data that was read (including if fewer than "n" bytes were
-  // successfully read).  If an error was encountered, returns a
-  // non-OK status.
-  //
-  // Safe for concurrent use by multiple threads.
-  virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const = 0;
-};
-
-// A file abstraction for sequential writing.  The implementation
-// must provide buffering since callers may append small fragments
-// at a time to the file.
-class WritableFile {
- public:
-  WritableFile() { }
-  virtual ~WritableFile();
-
-  virtual Status Append(const Slice& data) = 0;
-  virtual Status Close() = 0;
-  virtual Status Flush() = 0;
-  virtual Status Sync() = 0;
-
- private:
-  // No copying allowed
-  WritableFile(const WritableFile&);
-  void operator=(const WritableFile&);
-};
-
-// Identifies a locked file.
-class FileLock {
- public:
-  FileLock() { }
-  virtual ~FileLock();
- private:
-  // No copying allowed
-  FileLock(const FileLock&);
-  void operator=(const FileLock&);
-};
-
-// Log the specified data to *info_log if info_log is non-NULL.
-extern void Log(Env* env, WritableFile* info_log, const char* format, ...)
-#   if defined(__GNUC__) || defined(__clang__)
-    __attribute__((__format__ (__printf__, 3, 4)))
-#   endif
-    ;
-
-// A utility routine: write "data" to the named file.
-extern Status WriteStringToFile(Env* env, const Slice& data,
-                                const std::string& fname);
-
-// A utility routine: read contents of named file into *data
-extern Status ReadFileToString(Env* env, const std::string& fname,
-                               std::string* data);
-
-// An implementation of Env that forwards all calls to another Env.
-// May be useful to clients who wish to override just part of the
-// functionality of another Env.
-class EnvWrapper : public Env {
- public:
-  // Initialize an EnvWrapper that delegates all calls to *target
-  explicit EnvWrapper(Env* target) : target_(target) { }
-  virtual ~EnvWrapper();
-
-  // Return the target to which this Env forwards all calls
-  Env* target() const { return target_; }
-
-  // The following text is boilerplate that forwards all methods to target()
-  Status NewSequentialFile(const std::string& f, SequentialFile** r) {
-    return target_->NewSequentialFile(f, r);
-  }
-  Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
-    return target_->NewRandomAccessFile(f, r);
-  }
-  Status NewWritableFile(const std::string& f, WritableFile** r) {
-    return target_->NewWritableFile(f, r);
-  }
-  bool FileExists(const std::string& f) { return target_->FileExists(f); }
-  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
-    return target_->GetChildren(dir, r);
-  }
-  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
-  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
-  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
-  Status GetFileSize(const std::string& f, uint64_t* s) {
-    return target_->GetFileSize(f, s);
-  }
-  Status RenameFile(const std::string& s, const std::string& t) {
-    return target_->RenameFile(s, t);
-  }
-  Status LockFile(const std::string& f, FileLock** l) {
-    return target_->LockFile(f, l);
-  }
-  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
-  void Schedule(void (*f)(void*), void* a) {
-    return target_->Schedule(f, a);
-  }
-  void StartThread(void (*f)(void*), void* a) {
-    return target_->StartThread(f, a);
-  }
-  virtual Status GetTestDirectory(std::string* path) {
-    return target_->GetTestDirectory(path);
-  }
-  virtual void Logv(WritableFile* log, const char* format, va_list ap) {
-    return target_->Logv(log, format, ap);
-  }
-  uint64_t NowMicros() {
-    return target_->NowMicros();
-  }
-  void SleepForMicroseconds(int micros) {
-    target_->SleepForMicroseconds(micros);
-  }
- private:
-  Env* target_;
-};
-
-}
-
-#endif  // STORAGE_LEVELDB_INCLUDE_ENV_H_
diff --git a/leveldb/include/leveldb/iterator.h b/leveldb/include/leveldb/iterator.h
deleted file mode 100644
index 1866fb5..0000000
--- a/leveldb/include/leveldb/iterator.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// An iterator yields a sequence of key/value pairs from a source.
-// The following class defines the interface. Multiple implementations -// are provided by this library. In particular, iterators are provided -// to access the contents of a Table or a DB. - -#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ -#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ - -#include "leveldb/slice.h" -#include "leveldb/status.h" - -namespace leveldb { - -class Iterator { - public: - Iterator(); - virtual ~Iterator(); - - // An iterator is either positioned at a key/value pair, or - // not valid. This method returns true iff the iterator is valid. - virtual bool Valid() const = 0; - - // Position at the first key in the source. The iterator is Valid() - // after this call iff the source is not empty. - virtual void SeekToFirst() = 0; - - // Position at the last key in the source. The iterator is - // Valid() after this call iff the source is not empty. - virtual void SeekToLast() = 0; - - // Position at the first key in the source that is at or past target. - // The iterator is Valid() after this call iff the source contains - // an entry that comes at or past target. - virtual void Seek(const Slice& target) = 0; - - // Moves to the next entry in the source. After this call, Valid() is - // true iff the iterator was not positioned at the last entry in the source. - // REQUIRES: Valid() - virtual void Next() = 0; - - // Moves to the previous entry in the source. After this call, Valid() is - // true iff the iterator was not positioned at the first entry in source. - // REQUIRES: Valid() - virtual void Prev() = 0; - - // Return the key for the current entry. The underlying storage for - // the returned slice is valid only until the next modification of - // the iterator. - // REQUIRES: Valid() - virtual Slice key() const = 0; - - // Return the value for the current entry. The underlying storage for - // the returned slice is valid only until the next modification of - // the iterator. - // REQUIRES: Valid() - virtual Slice value() const = 0; - - // If an error has occurred, return it. Else return an ok status. - virtual Status status() const = 0; - - // Clients are allowed to register function/arg1/arg2 triples that - // will be invoked when this iterator is destroyed. - // - // Note that unlike all of the preceding methods, this method is - // not abstract and therefore clients should not override it. - typedef void (*CleanupFunction)(void* arg1, void* arg2); - void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); - - private: - struct Cleanup { - CleanupFunction function; - void* arg1; - void* arg2; - Cleanup* next; - }; - Cleanup cleanup_; - - // No copying allowed - Iterator(const Iterator&); - void operator=(const Iterator&); -}; - -// Return an empty iterator (yields nothing). -extern Iterator* NewEmptyIterator(); - -// Return an empty iterator with the specified status. -extern Iterator* NewErrorIterator(const Status& status); - -} - -#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ diff --git a/leveldb/include/leveldb/options.h b/leveldb/include/leveldb/options.h deleted file mode 100644 index a94651f..0000000 --- a/leveldb/include/leveldb/options.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors.
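The Iterator contract above is easiest to see in a full forward scan. A minimal sketch; ProcessEntry is a hypothetical callback, and everything else is the iterator.h API:

  // Visit every entry in order, then report any error seen during the scan.
  leveldb::Status ScanAll(leveldb::Iterator* it,
                          void (*ProcessEntry)(const leveldb::Slice& key,
                                               const leveldb::Slice& value)) {
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
      ProcessEntry(it->key(), it->value());  // slices die on the next move
    }
    return it->status();  // non-ok if the scan hit corruption or I/O errors
  }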
- -#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ -#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ - -#include <stddef.h> - -namespace leveldb { - -class Cache; -class Comparator; -class Env; -class Snapshot; -class WritableFile; - -// DB contents are stored in a set of blocks, each of which holds a -// sequence of key,value pairs. Each block may be compressed before -// being stored in a file. The following enum describes which -// compression method (if any) is used to compress a block. -enum CompressionType { - // NOTE: do not change the values of existing entries, as these are - // part of the persistent format on disk. - kNoCompression = 0x0, - kSnappyCompression = 0x1, -}; - -// Options to control the behavior of a database (passed to DB::Open) -struct Options { - // ------------------- - // Parameters that affect behavior - - // Comparator used to define the order of keys in the table. - // Default: a comparator that uses lexicographic byte-wise ordering - // - // REQUIRES: The client must ensure that the comparator supplied - // here has the same name and orders keys *exactly* the same as the - // comparator provided to previous open calls on the same DB. - const Comparator* comparator; - - // If true, the database will be created if it is missing. - // Default: false - bool create_if_missing; - - // If true, an error is raised if the database already exists. - // Default: false - bool error_if_exists; - - // If true, the implementation will do aggressive checking of the - // data it is processing and will stop early if it detects any - // errors. This may have unforeseen ramifications: for example, a - // corruption of one DB entry may cause a large number of entries to - // become unreadable or for the entire DB to become unopenable. - // Default: false - bool paranoid_checks; - - // Use the specified object to interact with the environment, - // e.g. to read/write files, schedule background work, etc. - // Default: Env::Default() - Env* env; - - // Any internal progress/error information generated by the db will - // be written to info_log if it is non-NULL, or to a file stored - // in the same directory as the DB contents if info_log is NULL. - // Default: NULL - WritableFile* info_log; - - // ------------------- - // Parameters that affect performance - - // Amount of data to build up in memory (backed by an unsorted log - // on disk) before converting to a sorted on-disk file. - // - // Larger values increase performance, especially during bulk loads. - // Up to two write buffers may be held in memory at the same time, - // so you may wish to adjust this parameter to control memory usage. - // - // Default: 4MB - size_t write_buffer_size; - - // Number of open files that can be used by the DB. You may need to - // increase this if your database has a large working set (budget - // one open file per 2MB of working set). - // - // Default: 1000 - int max_open_files; - - // Control over blocks (user data is stored in a set of blocks, and - // a block is the unit of reading from disk). - - // If non-NULL, use the specified cache for blocks. - // If NULL, leveldb will automatically create and use an 8MB internal cache. - // Default: NULL - Cache* block_cache; - - // Approximate size of user data packed per block. Note that the - // block size specified here corresponds to uncompressed data. The - // actual size of the unit read from disk may be smaller if - // compression is enabled. This parameter can be changed dynamically.
- // - // Default: 4K - size_t block_size; - - // Number of keys between restart points for delta encoding of keys. - // This parameter can be changed dynamically. Most clients should - // leave this parameter alone. - // - // Default: 16 - int block_restart_interval; - - // Compress blocks using the specified compression algorithm. This - // parameter can be changed dynamically. - // - // Default: kSnappyCompression, which gives lightweight but fast - // compression. - // - // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: - // ~200-500MB/s compression - // ~400-800MB/s decompression - // Note that these speeds are significantly faster than most - // persistent storage speeds, and therefore it is typically never - // worth switching to kNoCompression. Even if the input data is - // incompressible, the kSnappyCompression implementation will - // efficiently detect that and will switch to uncompressed mode. - CompressionType compression; - - // Create an Options object with default values for all fields. - Options(); -}; - -// Options that control read operations -struct ReadOptions { - // If true, all data read from underlying storage will be - // verified against corresponding checksums. - // Default: false - bool verify_checksums; - - // Should the data read for this iteration be cached in memory? - // Callers may wish to set this field to false for bulk scans. - // Default: true - bool fill_cache; - - // If "snapshot" is non-NULL, read as of the supplied snapshot - // (which must belong to the DB that is being read and which must - // not have been released). If "snapshot" is NULL, use an implicit - // snapshot of the state at the beginning of this read operation. - // Default: NULL - const Snapshot* snapshot; - - ReadOptions() - : verify_checksums(false), - fill_cache(true), - snapshot(NULL) { - } -}; - -// Options that control write operations -struct WriteOptions { - // If true, the write will be flushed from the operating system - // buffer cache (by calling WritableFile::Sync()) before the write - // is considered complete. If this flag is true, writes will be - // slower. - // - // If this flag is false, and the machine crashes, some recent - // writes may be lost. Note that if it is just the process that - // crashes (i.e., the machine does not reboot), no writes will be - // lost even if sync==false. - // - // In other words, a DB write with sync==false has similar - // crash semantics as the "write()" system call. A DB write - // with sync==true has similar crash semantics to a "write()" - // system call followed by "fsync()". - // - // Default: false - bool sync; - - // If "post_write_snapshot" is non-NULL, and the write succeeds, - // *post_write_snapshot will be modified to point to a snapshot of - // the DB state immediately after this write. The caller must call - // DB::ReleaseSnapshot(*post_write_snapshot) when the - // snapshot is no longer needed. - // - // If "post_write_snapshot" is non-NULL, and the write fails, - // *post_write_snapshot will be set to NULL. - // - // Default: NULL - const Snapshot** post_write_snapshot; - - WriteOptions() - : sync(false), - post_write_snapshot(NULL) { - } -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ diff --git a/leveldb/include/leveldb/slice.h b/leveldb/include/leveldb/slice.h deleted file mode 100644 index 62cb894..0000000 --- a/leveldb/include/leveldb/slice.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
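The option structs above are plain structs whose constructors supply the documented defaults, so callers override only what they need. A small sketch (the chosen values are illustrative):

  leveldb::Options MakeOptions() {
    leveldb::Options options;             // constructor fills in the defaults
    options.create_if_missing = true;     // create the DB on first open
    options.write_buffer_size = 8 << 20;  // 8MB memtable instead of the 4MB default
    return options;
  }

  leveldb::WriteOptions MakeDurableWriteOptions() {
    leveldb::WriteOptions wo;
    wo.sync = true;  // write() + fsync() crash semantics, per the comment above
    return wo;
  }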
-// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Slice is a simple structure containing a pointer into some external -// storage and a size. The user of a Slice must ensure that the slice -// is not used after the corresponding external storage has been -// deallocated. - -#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ -#define STORAGE_LEVELDB_INCLUDE_SLICE_H_ - -#include <assert.h> -#include <stddef.h> -#include <string.h> -#include <string> - -namespace leveldb { - -class Slice { - public: - // Create an empty slice. - Slice() : data_(""), size_(0) { } - - // Create a slice that refers to data[0,n-1]. - Slice(const char* data, size_t n) : data_(data), size_(n) { } - - // Create a slice that refers to the contents of "s" - Slice(const std::string& s) : data_(s.data()), size_(s.size()) { } - - // Create a slice that refers to s[0,strlen(s)-1] - Slice(const char* s) : data_(s), size_(strlen(s)) { } - - // Return a pointer to the beginning of the referenced data - const char* data() const { return data_; } - - // Return the length (in bytes) of the referenced data - size_t size() const { return size_; } - - // Return true iff the length of the referenced data is zero - bool empty() const { return size_ == 0; } - - // Return the ith byte in the referenced data. - // REQUIRES: n < size() - char operator[](size_t n) const { - assert(n < size()); - return data_[n]; - } - - // Change this slice to refer to an empty array - void clear() { data_ = ""; size_ = 0; } - - // Drop the first "n" bytes from this slice. - void remove_prefix(size_t n) { - assert(n <= size()); - data_ += n; - size_ -= n; - } - - // Return a string that contains the copy of the referenced data. - std::string ToString() const { return std::string(data_, size_); } - - // Three-way comparison. Returns value: - // < 0 iff "*this" < "b", - // == 0 iff "*this" == "b", - // > 0 iff "*this" > "b" - int compare(const Slice& b) const; - - // Return true iff "x" is a prefix of "*this" - bool starts_with(const Slice& x) const { - return ((size_ >= x.size_) && - (memcmp(data_, x.data_, x.size_) == 0)); - } - - private: - const char* data_; - size_t size_; - - // Intentionally copyable -}; - -inline bool operator==(const Slice& x, const Slice& y) { - return ((x.size() == y.size()) && - (memcmp(x.data(), y.data(), x.size()) == 0)); -} - -inline bool operator!=(const Slice& x, const Slice& y) { - return !(x == y); -} - -inline int Slice::compare(const Slice& b) const { - const int min_len = (size_ < b.size_) ? size_ : b.size_; - int r = memcmp(data_, b.data_, min_len); - if (r == 0) { - if (size_ < b.size_) r = -1; - else if (size_ > b.size_) r = +1; - } - return r; -} - -} - - -#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_ diff --git a/leveldb/include/leveldb/status.h b/leveldb/include/leveldb/status.h deleted file mode 100644 index 47e3edf..0000000 --- a/leveldb/include/leveldb/status.h +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A Status encapsulates the result of an operation. It may indicate success, -// or it may indicate an error with an associated error message.
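Slice, defined above, is a borrowed (pointer, length) view: its operations move the view, never the bytes. A short sketch; the strings are illustrative:

  void SliceDemo() {
    std::string backing = "leveldb";
    leveldb::Slice s(backing);           // views backing's bytes; copies nothing
    assert(s.starts_with(leveldb::Slice("level")));
    s.remove_prefix(5);                  // s now views "db"; backing is unchanged
    assert(s == leveldb::Slice("db"));
    assert(s.compare(leveldb::Slice("da")) > 0);  // byte-wise ordering
  }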
- -#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ -#define STORAGE_LEVELDB_INCLUDE_STATUS_H_ - -#include <string> -#include <utility> -#include "leveldb/slice.h" - -namespace leveldb { - -class Status { - public: - // Create a success status. - Status() : state_(NULL) { } - ~Status() { delete state_; } - - // Copy the specified status. - Status(const Status& s); - void operator=(const Status& s); - - // Return a success status. - static Status OK() { return Status(); } - - // Return error status of an appropriate type. - static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kNotFound, msg, msg2); - } - static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kCorruption, msg, msg2); - } - static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kNotSupported, msg, msg2); - } - static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kInvalidArgument, msg, msg2); - } - static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kIOError, msg, msg2); - } - - // Returns true iff the status indicates success. - bool ok() const { return (state_ == NULL); } - - // Returns true iff the status indicates a NotFound error. - bool IsNotFound() const { return code() == kNotFound; } - - // Return a string representation of this status suitable for printing. - // Returns the string "OK" for success. - std::string ToString() const; - - private: - enum Code { - kOk = 0, - kNotFound = 1, - kCorruption = 2, - kNotSupported = 3, - kInvalidArgument = 4, - kIOError = 5, - }; - Code code() const { return (state_ == NULL) ? kOk : state_->first; } - - Status(Code code, const Slice& msg, const Slice& msg2); - - typedef std::pair<Code, std::string> State; - State* state_; -}; - -inline Status::Status(const Status& s) { - state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); -} -inline void Status::operator=(const Status& s) { - if (this != &s) { - delete state_; - state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); - } -} - -} - -#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_ diff --git a/leveldb/include/leveldb/table.h b/leveldb/include/leveldb/table.h deleted file mode 100644 index bd99176..0000000 --- a/leveldb/include/leveldb/table.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_ -#define STORAGE_LEVELDB_INCLUDE_TABLE_H_ - -#include <stdint.h> -#include "leveldb/iterator.h" - -namespace leveldb { - -class Block; -class BlockHandle; -struct Options; -class RandomAccessFile; -struct ReadOptions; - -// A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. -class Table { - public: - // Attempt to open the table that is stored in bytes [0..file_size) - // of "file", and read the metadata entries necessary to allow - // retrieving data from the table. - // - // If successful, returns ok and sets "*table" to the newly opened - // table. The client should delete "*table" when no longer needed. - // If there was an error while initializing the table, sets "*table" - // to NULL and returns a non-ok status. Does not take ownership of - // "*file", but the client must ensure that "file" remains live - // for the duration of the returned table's lifetime. - // - // *file must remain live while this Table is in use.
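Status values, defined above, are meant to be returned up the call chain and inspected once. A sketch of the usual pattern; DB and Get() come from db.h, not this header, and DoLookup is an invented helper:

  leveldb::Status DoLookup(leveldb::DB* db, const leveldb::Slice& key,
                           std::string* value) {
    leveldb::Status s = db->Get(leveldb::ReadOptions(), key, value);
    if (s.IsNotFound()) {      // absence is not an error for this caller
      value->clear();
      return leveldb::Status::OK();
    }
    return s;                  // ok() or a real error; ToString() prints it
  }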
- static Status Open(const Options& options, - RandomAccessFile* file, - uint64_t file_size, - Table** table); - - ~Table(); - - // Returns a new iterator over the table contents. - // The result of NewIterator() is initially invalid (caller must - // call one of the Seek methods on the iterator before using it). - Iterator* NewIterator(const ReadOptions&) const; - - // Given a key, return an approximate byte offset in the file where - // the data for that key begins (or would begin if the key were - // present in the file). The returned value is in terms of file - // bytes, and so includes effects like compression of the underlying data. - // E.g., the approximate offset of the last key in the table will - // be close to the file length. - uint64_t ApproximateOffsetOf(const Slice& key) const; - - private: - struct Rep; - Rep* rep_; - - explicit Table(Rep* rep) { rep_ = rep; } - static Iterator* BlockReader(void*, const ReadOptions&, const Slice&); - - // No copying allowed - Table(const Table&); - void operator=(const Table&); -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_TABLE_H_ diff --git a/leveldb/include/leveldb/table_builder.h b/leveldb/include/leveldb/table_builder.h deleted file mode 100644 index 49d2d51..0000000 --- a/leveldb/include/leveldb/table_builder.h +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// TableBuilder provides the interface used to build a Table -// (an immutable and sorted map from keys to values). - -#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ -#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ - -#include <stdint.h> -#include "leveldb/options.h" -#include "leveldb/status.h" - -namespace leveldb { - -class BlockBuilder; -class BlockHandle; -class WritableFile; - -class TableBuilder { - public: - // Create a builder that will store the contents of the table it is - // building in *file. Does not close the file. It is up to the - // caller to close the file after calling Finish(). - TableBuilder(const Options& options, WritableFile* file); - - // REQUIRES: Either Finish() or Abandon() has been called. - ~TableBuilder(); - - // Change the options used by this builder. Note: only some of the - // option fields can be changed after construction. If a field is - // not allowed to change dynamically and its value in the structure - // passed to the constructor is different from its value in the - // structure passed to this method, this method will return an error - // without changing any fields. - Status ChangeOptions(const Options& options); - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value); - - // Advanced operation: flush any buffered key/value pairs to file. - // Can be used to ensure that two adjacent entries never live in - // the same data block. Most clients should not need to use this method. - // REQUIRES: Finish(), Abandon() have not been called - void Flush(); - - // Return non-ok iff some error has been detected. - Status status() const; - - // Finish building the table. Stops using the file passed to the - // constructor after this function returns.
- // REQUIRES: Finish(), Abandon() have not been called - Status Finish(); - - // Indicate that the contents of this builder should be abandoned. Stops - // using the file passed to the constructor after this function returns. - // If the caller is not going to call Finish(), it must call Abandon() - // before destroying this builder. - // REQUIRES: Finish(), Abandon() have not been called - void Abandon(); - - // Number of calls to Add() so far. - uint64_t NumEntries() const; - - // Size of the file generated so far. If invoked after a successful - // Finish() call, returns the size of the final generated file. - uint64_t FileSize() const; - - private: - bool ok() const { return status().ok(); } - void WriteBlock(BlockBuilder* block, BlockHandle* handle); - - struct Rep; - Rep* rep_; - - // No copying allowed - TableBuilder(const TableBuilder&); - void operator=(const TableBuilder&); -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ diff --git a/leveldb/include/leveldb/write_batch.h b/leveldb/include/leveldb/write_batch.h deleted file mode 100644 index 3411952..0000000 --- a/leveldb/include/leveldb/write_batch.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// WriteBatch holds a collection of updates to apply atomically to a DB. -// -// The updates are applied in the order in which they are added -// to the WriteBatch. For example, the value of "key" will be "v3" -// after the following batch is written: -// -// batch.Put("key", "v1"); -// batch.Delete("key"); -// batch.Put("key", "v2"); -// batch.Put("key", "v3"); - -#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ -#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ - -#include <string> - -namespace leveldb { - -class Slice; - -class WriteBatch { - public: - WriteBatch(); - ~WriteBatch(); - - // Store the mapping "key->value" in the database. - void Put(const Slice& key, const Slice& value); - - // If the database contains a mapping for "key", erase it. Else do nothing. - void Delete(const Slice& key); - - // Clear all updates buffered in this batch. - void Clear(); - - private: - friend class WriteBatchInternal; - - std::string rep_; // See comment in write_batch.cc for the format of rep_ - - // Intentionally copyable -}; - -} - -#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ diff --git a/leveldb/leveldb.gyp b/leveldb/leveldb.gyp deleted file mode 100644 index 20d1b1d..0000000 --- a/leveldb/leveldb.gyp +++ /dev/null @@ -1,315 +0,0 @@ -# Copyright (c) 2011 The LevelDB Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. See the AUTHORS file for names of contributors. - -{ - 'variables': { - 'use_snappy%': 0, - }, - 'target_defaults': { - 'defines': [ - 'LEVELDB_PLATFORM_CHROMIUM=1', - ], - 'include_dirs': [ - '.', - 'include/', - ], - 'conditions': [ - ['OS == "win"', { - 'include_dirs': [ - 'port/win', - ], - }], - ['use_snappy', { - 'defines': [ - 'USE_SNAPPY=1', - ], - }], - ], - }, - 'targets': [ - { - 'target_name': 'leveldb', - 'type': '<(library)', - 'dependencies': [ - # The base library is a lightweight abstraction layer for things like - # threads and IO.
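Taken together, TableBuilder (the write side) and Table (the read side, above) give a complete file round trip. A hedged sketch using only the interfaces in these headers plus Env; fname is illustrative and error handling is abbreviated:

  leveldb::Status BuildAndOpen(leveldb::Env* env, const std::string& fname,
                               const leveldb::Options& options,
                               leveldb::RandomAccessFile** rfile,
                               leveldb::Table** table) {
    leveldb::WritableFile* wfile;
    leveldb::Status s = env->NewWritableFile(fname, &wfile);
    if (!s.ok()) return s;
    leveldb::TableBuilder builder(options, wfile);
    builder.Add(leveldb::Slice("apple"), leveldb::Slice("red"));      // keys must be
    builder.Add(leveldb::Slice("banana"), leveldb::Slice("yellow"));  // added in order
    s = builder.Finish();
    if (s.ok()) s = wfile->Close();
    delete wfile;
    if (!s.ok()) return s;

    uint64_t size;
    s = env->GetFileSize(fname, &size);
    if (s.ok()) s = env->NewRandomAccessFile(fname, rfile);
    if (s.ok()) s = leveldb::Table::Open(options, *rfile, size, table);
    return s;  // caller keeps *rfile alive while *table is in use
  }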
http://src.chromium.org/viewvc/chrome/trunk/src/base/ - '../../base/base.gyp:base', - ], - 'conditions': [ - ['use_snappy', { - 'dependencies': [ - '../../third_party/snappy/snappy.gyp:snappy', - ], - }], - ], - 'direct_dependent_settings': { - 'include_dirs': [ - 'include/', - ], - }, - 'sources': [ - # Include and then exclude so that all files show up in IDEs, even if - # they don't build. - 'db/builder.cc', - 'db/builder.h', - 'db/db_impl.cc', - 'db/db_impl.h', - 'db/db_iter.cc', - 'db/db_iter.h', - 'db/filename.cc', - 'db/filename.h', - 'db/dbformat.cc', - 'db/dbformat.h', - 'db/log_format.h', - 'db/log_reader.cc', - 'db/log_reader.h', - 'db/log_writer.cc', - 'db/log_writer.h', - 'db/memtable.cc', - 'db/memtable.h', - 'db/repair.cc', - 'db/skiplist.h', - 'db/snapshot.h', - 'db/table_cache.cc', - 'db/table_cache.h', - 'db/version_edit.cc', - 'db/version_edit.h', - 'db/version_set.cc', - 'db/version_set.h', - 'db/write_batch.cc', - 'db/write_batch_internal.h', - 'include/leveldb/cache.h', - 'include/leveldb/comparator.h', - 'include/leveldb/db.h', - 'include/leveldb/env.h', - 'include/leveldb/iterator.h', - 'include/leveldb/options.h', - 'include/leveldb/slice.h', - 'include/leveldb/status.h', - 'include/leveldb/table.h', - 'include/leveldb/table_builder.h', - 'include/leveldb/write_batch.h', - 'port/port.h', - 'port/port_chromium.cc', - 'port/port_chromium.h', - 'port/port_example.h', - 'port/port_posix.cc', - 'port/port_posix.h', - 'table/block.cc', - 'table/block.h', - 'table/block_builder.cc', - 'table/block_builder.h', - 'table/format.cc', - 'table/format.h', - 'table/iterator.cc', - 'table/iterator_wrapper.h', - 'table/merger.cc', - 'table/merger.h', - 'table/table.cc', - 'table/table_builder.cc', - 'table/two_level_iterator.cc', - 'table/two_level_iterator.h', - 'util/arena.cc', - 'util/arena.h', - 'util/cache.cc', - 'util/coding.cc', - 'util/coding.h', - 'util/comparator.cc', - 'util/crc32c.cc', - 'util/crc32c.h', - 'util/env.cc', - 'util/env_chromium.cc', - 'util/env_posix.cc', - 'util/hash.cc', - 'util/hash.h', - 'util/logging.cc', - 'util/logging.h', - 'util/mutexlock.h', - 'util/options.cc', - 'util/random.h', - 'util/status.cc', - ], - 'sources/': [ - ['exclude', '_(android|example|portable|posix)\\.cc$'], - ], - }, - { - 'target_name': 'leveldb_testutil', - 'type': '<(library)', - 'dependencies': [ - '../../base/base.gyp:base', - 'leveldb', - ], - 'export_dependent_settings': [ - # The tests use include directories from these projects. 
- '../../base/base.gyp:base', - 'leveldb', - ], - 'sources': [ - 'util/histogram.cc', - 'util/histogram.h', - 'util/testharness.cc', - 'util/testharness.h', - 'util/testutil.cc', - 'util/testutil.h', - ], - }, - { - 'target_name': 'leveldb_arena_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/arena_test.cc', - ], - }, - { - 'target_name': 'leveldb_cache_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/cache_test.cc', - ], - }, - { - 'target_name': 'leveldb_coding_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/coding_test.cc', - ], - }, - { - 'target_name': 'leveldb_corruption_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/corruption_test.cc', - ], - }, - { - 'target_name': 'leveldb_crc32c_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/crc32c_test.cc', - ], - }, - { - 'target_name': 'leveldb_db_bench', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/db_bench.cc', - ], - }, - { - 'target_name': 'leveldb_db_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/db_test.cc', - ], - }, - { - 'target_name': 'leveldb_dbformat_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/dbformat_test.cc', - ], - }, - { - 'target_name': 'leveldb_env_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/env_test.cc', - ], - }, - { - 'target_name': 'leveldb_filename_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/filename_test.cc', - ], - }, - { - 'target_name': 'leveldb_log_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/log_test.cc', - ], - }, - { - 'target_name': 'leveldb_skiplist_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/skiplist_test.cc', - ], - }, - { - 'target_name': 'leveldb_table_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'table/table_test.cc', - ], - }, - { - 'target_name': 'leveldb_version_edit_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/version_edit_test.cc', - ], - }, - { - 'target_name': 'leveldb_write_batch_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/write_batch_test.cc', - ], - }, - ], -} - -# Local Variables: -# tab-width:2 -# indent-tabs-mode:nil -# End: -# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/leveldb/port/README b/leveldb/port/README deleted file mode 100644 index 422563e..0000000 --- a/leveldb/port/README +++ /dev/null @@ -1,10 +0,0 @@ -This directory contains interfaces and implementations that isolate the -rest of the package from platform details. - -Code in the rest of the package includes "port.h" from this directory. -"port.h" in turn includes a platform specific "port_.h" file -that provides the platform specific implementation. - -See port_posix.h for an example of what must be provided in a platform -specific header file. 
- diff --git a/leveldb/port/port.h b/leveldb/port/port.h deleted file mode 100644 index 816826b..0000000 --- a/leveldb/port/port.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_PORT_PORT_H_ -#define STORAGE_LEVELDB_PORT_PORT_H_ - -#include <string.h> - -// Include the appropriate platform specific file below. If you are -// porting to a new platform, see "port_example.h" for documentation -// of what the new port_<platform>.h file must provide. -#if defined(LEVELDB_PLATFORM_POSIX) -# include "port/port_posix.h" -#elif defined(LEVELDB_PLATFORM_CHROMIUM) -# include "port/port_chromium.h" -#elif defined(LEVELDB_PLATFORM_ANDROID) -# include "port/port_android.h" -#endif - -#endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/leveldb/port/port_android.cc b/leveldb/port/port_android.cc deleted file mode 100644 index 240e9ca..0000000 --- a/leveldb/port/port_android.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "port/port_android.h" - -#include <cstdlib> - -extern "C" { -size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { - return fread(a, b, c, d); -} - -size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { - return fwrite(a, b, c, d); -} - -int fflush_unlocked(FILE *f) { - return fflush(f); -} - -int fdatasync(int fd) { - return fsync(fd); -} -} - -namespace leveldb { -namespace port { - -static void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - abort(); - } -} - -Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } -Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } -void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } -void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } - -CondVar::CondVar(Mutex* mu) - : mu_(mu) { - PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); -} - -CondVar::~CondVar() { - PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); -} - -void CondVar::Wait() { - PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); -} - -void CondVar::Signal(){ - PthreadCall("signal", pthread_cond_signal(&cv_)); -} - -void CondVar::SignalAll() { - PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); -} - -} -} diff --git a/leveldb/port/port_android.h b/leveldb/port/port_android.h deleted file mode 100644 index 13df9c9..0000000 --- a/leveldb/port/port_android.h +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions.
- -#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ -#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ - -#include <endian.h> -#include <pthread.h> -#include <stdint.h> -#include <stdio.h> -#include <string.h> -#include <string> - -// Collapse the plethora of ARM flavors available to an easier to manage set -// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto -#if defined(__ARM_ARCH_6__) || \ - defined(__ARM_ARCH_6J__) || \ - defined(__ARM_ARCH_6K__) || \ - defined(__ARM_ARCH_6Z__) || \ - defined(__ARM_ARCH_6T2__) || \ - defined(__ARM_ARCH_6ZK__) || \ - defined(__ARM_ARCH_7__) || \ - defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7A__) -#define ARMV6_OR_7 1 -#endif - -extern "C" { - size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); - size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); - int fflush_unlocked(FILE *f); - int fdatasync (int fd); -} - -namespace leveldb { -namespace port { - -static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN; - -class CondVar; - -class Mutex { - public: - Mutex(); - ~Mutex(); - - void Lock(); - void Unlock(); - void AssertHeld() { - //TODO(gabor): How can I implement this? - } - - private: - friend class CondVar; - pthread_mutex_t mu_; - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - private: - Mutex* mu_; - pthread_cond_t cv_; -}; - -#ifndef ARMV6_OR_7 -// On ARM chipsets <V6, 0xffff0fa0 is the hard coded address of a -// memory barrier helper provided by the Linux kernel. -typedef void (*LinuxKernelMemoryBarrierFunc)(void); -LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier __attribute__((weak)) = - (LinuxKernelMemoryBarrierFunc) 0xffff0fa0; -#endif - -class AtomicPointer { - private: - void* rep_; - inline void MemoryBarrier() const { - // Chips >= ARMv6 have the dmb instruction; older ones must call - // the kernel-provided barrier helper. -#ifdef ARMV6_OR_7 - __asm__ __volatile__("dmb" : : : "memory"); -#else - pLinuxKernelMemoryBarrier(); -#endif - } - - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - void* r = rep_; - MemoryBarrier(); - return r; - } - inline void Release_Store(void* v) { - MemoryBarrier(); - rep_ = v; - } - inline void* NoBarrier_Load() const { - void* r = rep_; - return r; - } - inline void NoBarrier_Store(void* v) { - rep_ = v; - } -}; - -// TODO(gabor): Implement compress -inline bool Snappy_Compress( - const char* input, - size_t input_length, - std::string* output) { - return false; -} - -// TODO(gabor): Implement uncompress -inline bool Snappy_Uncompress( - const char* input_data, - size_t input_length, - std::string* output) { - return false; -} - -inline uint64_t ThreadIdentifier() { - pthread_t tid = pthread_self(); - uint64_t r = 0; - memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? sizeof(r) : sizeof(tid)); - return r; -} - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ diff --git a/leveldb/port/port_chromium.cc b/leveldb/port/port_chromium.cc deleted file mode 100644 index 2ab49b9..0000000 --- a/leveldb/port/port_chromium.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors.
- -#include "port/port_chromium.h" - -#include "util/logging.h" - -#if defined(USE_SNAPPY) -# include "third_party/snappy/src/snappy.h" -#endif - -namespace leveldb { -namespace port { - -Mutex::Mutex() { -} - -Mutex::~Mutex() { -} - -void Mutex::Lock() { - mu_.Acquire(); -} - -void Mutex::Unlock() { - mu_.Release(); -} - -void Mutex::AssertHeld() { - mu_.AssertAcquired(); -} - -CondVar::CondVar(Mutex* mu) - : cv_(&mu->mu_) { -} - -CondVar::~CondVar() { } - -void CondVar::Wait() { - cv_.Wait(); -} - -void CondVar::Signal(){ - cv_.Signal(); -} - -void CondVar::SignalAll() { - cv_.Broadcast(); -} - -bool Snappy_Compress(const char* input, size_t input_length, - std::string* output) { -#if defined(USE_SNAPPY) - output->resize(snappy::MaxCompressedLength(input_length)); - size_t outlen; - snappy::RawCompress(input, input_length, &(*output)[0], &outlen); - output->resize(outlen); - return true; -#else - return false; -#endif -} - -bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output) { -#if defined(USE_SNAPPY) - size_t ulength; - if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) { - return false; - } - output->resize(ulength); - return snappy::RawUncompress(input_data, input_length, &(*output)[0]); -#else - return false; -#endif -} - -} -} diff --git a/leveldb/port/port_chromium.h b/leveldb/port/port_chromium.h deleted file mode 100644 index 1851e6e..0000000 --- a/leveldb/port/port_chromium.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ -#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ - -#include -#include -#include -#include "base/atomicops.h" -#include "base/basictypes.h" -#include "base/logging.h" -#include "base/synchronization/condition_variable.h" -#include "base/synchronization/lock.h" - -// Linux's ThreadIdentifier() needs this. -#if defined(OS_LINUX) -# include -#endif - -#if defined(OS_WIN) -#define snprintf _snprintf -#define va_copy(a, b) do { (a) = (b); } while (0) -#endif - -namespace leveldb { -namespace port { - -// Chromium only supports little endian. 
-static const bool kLittleEndian = true; - -class Mutex { - public: - Mutex(); - ~Mutex(); - void Lock(); - void Unlock(); - void AssertHeld(); - - private: - base::Lock mu_; - - friend class CondVar; - DISALLOW_COPY_AND_ASSIGN(Mutex); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - - private: - base::ConditionVariable cv_; - - DISALLOW_COPY_AND_ASSIGN(CondVar); -}; - -class AtomicPointer { - private: - typedef base::subtle::AtomicWord Rep; - Rep rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* p) : rep_(reinterpret_cast<Rep>(p)) {} - inline void* Acquire_Load() const { - return reinterpret_cast<void*>(::base::subtle::Acquire_Load(&rep_)); - } - inline void Release_Store(void* v) { - ::base::subtle::Release_Store(&rep_, reinterpret_cast<Rep>(v)); - } - inline void* NoBarrier_Load() const { - return reinterpret_cast<void*>(::base::subtle::NoBarrier_Load(&rep_)); - } - inline void NoBarrier_Store(void* v) { - ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast<Rep>(v)); - } -}; - -bool Snappy_Compress(const char* input, size_t input_length, - std::string* output); -bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output); - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ diff --git a/leveldb/port/port_example.h b/leveldb/port/port_example.h deleted file mode 100644 index 8a624f3..0000000 --- a/leveldb/port/port_example.h +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// This file contains the specification, but not the implementations, -// of the types/operations/etc. that should be defined by a platform -// specific port_<platform>.h file. Use this file as a reference for -// how to port this package to a new platform. - -#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ -#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ - -namespace leveldb { -namespace port { - -// TODO(jorlow): Many of these belong more in the environment class rather than -// here. We should try moving them and see if it affects perf. - -// The following boolean constant must be true on a little-endian machine -// and false otherwise. -static const bool kLittleEndian = true /* or some other expression */; - -// ------------------ Threading ------------------- - -// A Mutex represents an exclusive lock. -class Mutex { - public: - Mutex(); - ~Mutex(); - - // Lock the mutex. Waits until other lockers have exited. - // Will deadlock if the mutex is already locked by this thread. - void Lock(); - - // Unlock the mutex. - // REQUIRES: This mutex was locked by this thread. - void Unlock(); - - // Optionally crash if this thread does not hold this mutex. - // The implementation must be fast, especially if NDEBUG is - // defined. The implementation is allowed to skip all checks. - void AssertHeld(); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - - // Atomically release *mu and block on this condition variable until - // either a call to SignalAll(), or a call to Signal() that picks - // this thread to wakeup. - // REQUIRES: this thread holds *mu - void Wait(); - - // If there are some threads waiting, wake up at least one of them. - void Signal(); - - // Wake up all waiting threads.
- void SignalAll(); -}; - -// A type that holds a pointer that can be read or written atomically -// (i.e., without word-tearing.) -class AtomicPointer { - private: - intptr_t rep_; - public: - // Initialize to arbitrary value - AtomicPointer(); - - // Initialize to hold v - explicit AtomicPointer(void* v) : rep_(v) { } - - // Read and return the stored pointer with the guarantee that no - // later memory access (read or write) by this thread can be - // reordered ahead of this read. - void* Acquire_Load() const; - - // Set v as the stored pointer with the guarantee that no earlier - // memory access (read or write) by this thread can be reordered - // after this store. - void Release_Store(void* v); - - // Read the stored pointer with no ordering guarantees. - void* NoBarrier_Load() const; - - // Set v as the stored pointer with no ordering guarantees. - void NoBarrier_Store(void* v); -}; - -// ------------------ Compression ------------------- - -// Store the snappy compression of "input[0,input_length-1]" in *output. -// Returns false if snappy is not supported by this port. -extern bool Snappy_Compress(const char* input, size_t input_length, - std::string* output); - -// Attempt to snappy uncompress input[0,input_length-1] into *output. -// Returns true if successful, false if the input is invalid lightweight -// compressed data. -extern bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output); - -// ------------------ Miscellaneous ------------------- - -// If heap profiling is not supported, returns false. -// Else repeatedly calls (*func)(arg, data, n) and then returns true. -// The concatenation of all "data[0,n-1]" fragments is the heap profile. -extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff --git a/leveldb/port/port_posix.cc b/leveldb/port/port_posix.cc deleted file mode 100644 index e75da8b..0000000 --- a/leveldb/port/port_posix.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "port/port_posix.h" - -#include -#include -#include -#include "util/logging.h" - -namespace leveldb { -namespace port { - -static void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - abort(); - } -} - -Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } - -Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } - -void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } - -void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } - -CondVar::CondVar(Mutex* mu) - : mu_(mu) { - PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); -} - -CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } - -void CondVar::Wait() { - PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); -} - -void CondVar::Signal() { - PthreadCall("signal", pthread_cond_signal(&cv_)); -} - -void CondVar::SignalAll() { - PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); -} - -} -} diff --git a/leveldb/port/port_posix.h b/leveldb/port/port_posix.h deleted file mode 100644 index c158db1..0000000 --- a/leveldb/port/port_posix.h +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors.
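The Release_Store/Acquire_Load pair specified above is a message-passing idiom: publish a fully built object with the release store, and any reader that observes the pointer via the acquire load also observes the object's contents. A sketch against the port interface (Node and the function names are invented for illustration):

  struct Node { int payload; };
  leveldb::port::AtomicPointer shared(NULL);   // nothing published yet

  void Publish(Node* n) {
    n->payload = 42;           // completed before the release store below
    shared.Release_Store(n);   // publish the pointer
  }

  Node* TryConsume() {
    Node* n = static_cast<Node*>(shared.Acquire_Load());
    // If n is non-NULL here, n->payload is guaranteed to read as 42.
    return n;
  }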
All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ -#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ - -#include <endian.h> -#include <pthread.h> -#include <stdint.h> -#include <string.h> -#include <string> -#include <cstdatomic> - -namespace leveldb { -namespace port { - -static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN); - -class CondVar; - -class Mutex { - public: - Mutex(); - ~Mutex(); - - void Lock(); - void Unlock(); - void AssertHeld() { } - - private: - friend class CondVar; - pthread_mutex_t mu_; - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - private: - pthread_cond_t cv_; - Mutex* mu_; -}; - -// Storage for a lock-free pointer -class AtomicPointer { - private: - std::atomic<void*> rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - return rep_.load(std::memory_order_acquire); - } - inline void Release_Store(void* v) { - rep_.store(v, std::memory_order_release); - } - inline void* NoBarrier_Load() const { - return rep_.load(std::memory_order_relaxed); - } - inline void NoBarrier_Store(void* v) { - rep_.store(v, std::memory_order_relaxed); - } -}; - -// TODO(gabor): Implement actual compress -inline bool Snappy_Compress(const char* input, size_t input_length, - std::string* output) { - return false; -} - -// TODO(gabor): Implement actual uncompress -inline bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output) { - return false; -} - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ diff --git a/leveldb/port/win/stdint.h b/leveldb/port/win/stdint.h deleted file mode 100644 index 39edd0d..0000000 --- a/leveldb/port/win/stdint.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// MSVC didn't ship with this file until the 2010 version. - -#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_ -#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_ - -#if !defined(_MSC_VER) -#error This file should only be included when compiling with MSVC. -#endif - -// Define C99 equivalent types. -typedef signed char int8_t; -typedef signed short int16_t; -typedef signed int int32_t; -typedef signed long long int64_t; -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -#endif // STORAGE_LEVELDB_PORT_WIN_STDINT_H_ diff --git a/leveldb/table/block.cc b/leveldb/table/block.cc deleted file mode 100644 index 92b2877..0000000 --- a/leveldb/table/block.cc +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Decodes the blocks generated by block_builder.cc.
- -#include "table/block.h" - -#include -#include -#include "leveldb/comparator.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -inline uint32_t Block::NumRestarts() const { - assert(size_ >= 2*sizeof(uint32_t)); - return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); -} - -Block::Block(const char* data, size_t size) - : data_(data), - size_(size) { - if (size_ < sizeof(uint32_t)) { - size_ = 0; // Error marker - } else { - restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); - if (restart_offset_ > size_ - sizeof(uint32_t)) { - // The size is too small for NumRestarts() and therefore - // restart_offset_ wrapped around. - size_ = 0; - } - } -} - -Block::~Block() { - delete[] data_; -} - -// Helper routine: decode the next block entry starting at "p", -// storing the number of shared key bytes, non_shared key bytes, -// and the length of the value in "*shared", "*non_shared", and -// "*value_length", respectively. Will not derefence past "limit". -// -// If any errors are detected, returns NULL. Otherwise, returns a -// pointer to the key delta (just past the three decoded values). -static inline const char* DecodeEntry(const char* p, const char* limit, - uint32_t* shared, - uint32_t* non_shared, - uint32_t* value_length) { - if (limit - p < 3) return NULL; - *shared = reinterpret_cast(p)[0]; - *non_shared = reinterpret_cast(p)[1]; - *value_length = reinterpret_cast(p)[2]; - if ((*shared | *non_shared | *value_length) < 128) { - // Fast path: all three values are encoded in one byte each - p += 3; - } else { - if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL; - if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL; - if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL; - } - - if (static_cast(limit - p) < (*non_shared + *value_length)) { - return NULL; - } - return p; -} - -class Block::Iter : public Iterator { - private: - const Comparator* const comparator_; - const char* const data_; // underlying block contents - uint32_t const restarts_; // Offset of restart array (list of fixed32) - uint32_t const num_restarts_; // Number of uint32_t entries in restart array - - // current_ is offset in data_ of current entry. >= restarts_ if !Valid - uint32_t current_; - uint32_t restart_index_; // Index of restart block in which current_ falls - std::string key_; - Slice value_; - Status status_; - - inline int Compare(const Slice& a, const Slice& b) const { - return comparator_->Compare(a, b); - } - - // Return the offset in data_ just past the end of the current entry. 
- inline uint32_t NextEntryOffset() const { - return (value_.data() + value_.size()) - data_; - } - - uint32_t GetRestartPoint(uint32_t index) { - assert(index < num_restarts_); - return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); - } - - void SeekToRestartPoint(uint32_t index) { - key_.clear(); - restart_index_ = index; - // current_ will be fixed by ParseNextKey(); - - // ParseNextKey() starts at the end of value_, so set value_ accordingly - uint32_t offset = GetRestartPoint(index); - value_ = Slice(data_ + offset, 0); - } - - public: - Iter(const Comparator* comparator, - const char* data, - uint32_t restarts, - uint32_t num_restarts) - : comparator_(comparator), - data_(data), - restarts_(restarts), - num_restarts_(num_restarts), - current_(restarts_), - restart_index_(num_restarts_) { - assert(num_restarts_ > 0); - } - - virtual bool Valid() const { return current_ < restarts_; } - virtual Status status() const { return status_; } - virtual Slice key() const { - assert(Valid()); - return key_; - } - virtual Slice value() const { - assert(Valid()); - return value_; - } - - virtual void Next() { - assert(Valid()); - ParseNextKey(); - } - - virtual void Prev() { - assert(Valid()); - - // Scan backwards to a restart point before current_ - const uint32_t original = current_; - while (GetRestartPoint(restart_index_) >= original) { - if (restart_index_ == 0) { - // No more entries - current_ = restarts_; - restart_index_ = num_restarts_; - return; - } - restart_index_--; - } - - SeekToRestartPoint(restart_index_); - do { - // Loop until end of current entry hits the start of original entry - } while (ParseNextKey() && NextEntryOffset() < original); - } - - virtual void Seek(const Slice& target) { - // Binary search in restart array to find the first restart point - // with a key >= target - uint32_t left = 0; - uint32_t right = num_restarts_ - 1; - while (left < right) { - uint32_t mid = (left + right + 1) / 2; - uint32_t region_offset = GetRestartPoint(mid); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, - data_ + restarts_, - &shared, &non_shared, &value_length); - if (key_ptr == NULL || (shared != 0)) { - CorruptionError(); - return; - } - Slice mid_key(key_ptr, non_shared); - if (Compare(mid_key, target) < 0) { - // Key at "mid" is smaller than "target". Therefore all - // blocks before "mid" are uninteresting. - left = mid; - } else { - // Key at "mid" is >= "target". Therefore all blocks at or - // after "mid" are uninteresting. - right = mid - 1; - } - } - - // Linear search (within restart block) for first key >= target - SeekToRestartPoint(left); - while (true) { - if (!ParseNextKey()) { - return; - } - if (Compare(key_, target) >= 0) { - return; - } - } - } - - virtual void SeekToFirst() { - SeekToRestartPoint(0); - ParseNextKey(); - } - - virtual void SeekToLast() { - SeekToRestartPoint(num_restarts_ - 1); - while (ParseNextKey() && NextEntryOffset() < restarts_) { - // Keep skipping - } - } - - private: - void CorruptionError() { - current_ = restarts_; - restart_index_ = num_restarts_; - status_ = Status::Corruption("bad entry in block"); - key_.clear(); - value_.clear(); - } - - bool ParseNextKey() { - current_ = NextEntryOffset(); - const char* p = data_ + current_; - const char* limit = data_ + restarts_; // Restarts come right after data - if (p >= limit) { - // No more entries to return. Mark as invalid. 
- current_ = restarts_; - restart_index_ = num_restarts_; - return false; - } - - // Decode next entry - uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); - if (p == NULL || key_.size() < shared) { - CorruptionError(); - return false; - } else { - key_.resize(shared); - key_.append(p, non_shared); - value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; - } - return true; - } - } -}; - -Iterator* Block::NewIterator(const Comparator* cmp) { - if (size_ < 2*sizeof(uint32_t)) { - return NewErrorIterator(Status::Corruption("bad block contents")); - } - const uint32_t num_restarts = NumRestarts(); - if (num_restarts == 0) { - return NewEmptyIterator(); - } else { - return new Iter(cmp, data_, restart_offset_, num_restarts); - } -} - -} diff --git a/leveldb/table/block.h b/leveldb/table/block.h deleted file mode 100644 index cdf0598..0000000 --- a/leveldb/table/block.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_ -#define STORAGE_LEVELDB_TABLE_BLOCK_H_ - -#include <stddef.h> -#include <stdint.h> -#include "leveldb/iterator.h" - -namespace leveldb { - -class Comparator; - -class Block { - public: - // Initialize the block with the specified contents. - // Takes ownership of data[] and will delete[] it when done. - Block(const char* data, size_t size); - - ~Block(); - - size_t size() const { return size_; } - Iterator* NewIterator(const Comparator* comparator); - - private: - uint32_t NumRestarts() const; - - const char* data_; - size_t size_; - uint32_t restart_offset_; // Offset in data_ of restart array - - // No copying allowed - Block(const Block&); - void operator=(const Block&); - - class Iter; -}; - -} - -#endif // STORAGE_LEVELDB_TABLE_BLOCK_H_ diff --git a/leveldb/table/block_builder.cc b/leveldb/table/block_builder.cc deleted file mode 100644 index dc958c8..0000000 --- a/leveldb/table/block_builder.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// BlockBuilder generates blocks where keys are prefix-compressed: -// -// When we store a key, we drop the prefix shared with the previous -// string. This helps reduce the space requirement significantly. -// Furthermore, once every K keys, we do not apply the prefix -// compression and store the entire key. We call this a "restart -// point". The tail end of the block stores the offsets of all of the -// restart points, and can be used to do a binary search when looking -// for a particular key. Values are stored as-is (without compression) -// immediately following the corresponding key. -// -// An entry for a particular key-value pair has the form: -// shared_bytes: varint32 -// unshared_bytes: varint32 -// value_length: varint32 -// key_delta: char[unshared_bytes] -// value: char[value_length] -// shared_bytes == 0 for restart points. -// -// The trailer of the block has the form: -// restarts: uint32[num_restarts] -// num_restarts: uint32 -// restarts[i] contains the offset within the block of the ith restart point.
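A worked example may make the layout above concrete (the keys and byte counts are illustrative but follow directly from the format just described):

  // Adding "apple" -> "red" at a restart point, then "applied" -> "blue":
  //   entry 1 (restart): shared=0 non_shared=5 value_length=3
  //                      key_delta="apple"  value="red"
  //   entry 2:           shared=4 non_shared=3 value_length=4
  //                      key_delta="ied"    value="blue"
  // "applied" shares its first four bytes ("appl") with "apple", so only
  // the suffix "ied" is stored; all three counters fit in a single
  // varint32 byte each, so each entry header occupies exactly 3 bytes.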
- -#include "table/block_builder.h" - -#include -#include -#include "leveldb/comparator.h" -#include "leveldb/table_builder.h" -#include "util/coding.h" - -namespace leveldb { - -BlockBuilder::BlockBuilder(const Options* options) - : options_(options), - restarts_(), - counter_(0), - finished_(false) { - assert(options->block_restart_interval >= 1); - restarts_.push_back(0); // First restart point is at offset 0 -} - -void BlockBuilder::Reset() { - buffer_.clear(); - restarts_.clear(); - restarts_.push_back(0); // First restart point is at offset 0 - counter_ = 0; - finished_ = false; - last_key_.clear(); -} - -size_t BlockBuilder::CurrentSizeEstimate() const { - return (buffer_.size() + // Raw data buffer - restarts_.size() * sizeof(uint32_t) + // Restart array - sizeof(uint32_t)); // Restart array length -} - -Slice BlockBuilder::Finish() { - // Append restart array - for (size_t i = 0; i < restarts_.size(); i++) { - PutFixed32(&buffer_, restarts_[i]); - } - PutFixed32(&buffer_, restarts_.size()); - finished_ = true; - return Slice(buffer_); -} - -void BlockBuilder::Add(const Slice& key, const Slice& value) { - Slice last_key_piece(last_key_); - assert(!finished_); - assert(counter_ <= options_->block_restart_interval); - assert(buffer_.empty() // No values yet? - || options_->comparator->Compare(key, last_key_piece) > 0); - size_t shared = 0; - if (counter_ < options_->block_restart_interval) { - // See how much sharing to do with previous string - const size_t min_length = std::min(last_key_piece.size(), key.size()); - while ((shared < min_length) && (last_key_[shared] == key[shared])) { - shared++; - } - } else { - // Restart compression - restarts_.push_back(buffer_.size()); - counter_ = 0; - } - const size_t non_shared = key.size() - shared; - - // Add "" to buffer_ - PutVarint32(&buffer_, shared); - PutVarint32(&buffer_, non_shared); - PutVarint32(&buffer_, value.size()); - - // Add string delta to buffer_ followed by value - buffer_.append(key.data() + shared, non_shared); - buffer_.append(value.data(), value.size()); - - // Update state - last_key_.resize(shared); - last_key_.append(key.data() + shared, non_shared); - assert(Slice(last_key_) == key); - counter_++; -} - -} diff --git a/leveldb/table/block_builder.h b/leveldb/table/block_builder.h deleted file mode 100644 index bf92a0f..0000000 --- a/leveldb/table/block_builder.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ -#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ - -#include - -#include -#include "leveldb/slice.h" - -namespace leveldb { - -struct Options; - -class BlockBuilder { - public: - explicit BlockBuilder(const Options* options); - - // Reset the contents as if the BlockBuilder was just constructed. - void Reset(); - - // REQUIRES: Finish() has not been callled since the last call to Reset(). - // REQUIRES: key is larger than any previously added key - void Add(const Slice& key, const Slice& value); - - // Finish building the block and return a slice that refers to the - // block contents. The returned slice will remain valid for the - // lifetime of this builder or until Reset() is called. - Slice Finish(); - - // Returns an estimate of the current (uncompressed) size of the block - // we are building. 
- size_t CurrentSizeEstimate() const; - - // Return true iff no entries have been added since the last Reset() - bool empty() const { - return buffer_.empty(); - } - - private: - const Options* options_; - std::string buffer_; // Destination buffer - std::vector restarts_; // Restart points - int counter_; // Number of entries emitted since restart - bool finished_; // Has Finish() been called? - std::string last_key_; - - // No copying allowed - BlockBuilder(const BlockBuilder&); - void operator=(const BlockBuilder&); -}; - -} - -#endif // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ diff --git a/leveldb/table/format.cc b/leveldb/table/format.cc deleted file mode 100644 index 63971db..0000000 --- a/leveldb/table/format.cc +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/format.h" - -#include "leveldb/env.h" -#include "port/port.h" -#include "table/block.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { - -void BlockHandle::EncodeTo(std::string* dst) const { - // Sanity check that all fields have been set - assert(offset_ != ~static_cast(0)); - assert(size_ != ~static_cast(0)); - PutVarint64(dst, offset_); - PutVarint64(dst, size_); -} - -Status BlockHandle::DecodeFrom(Slice* input) { - if (GetVarint64(input, &offset_) && - GetVarint64(input, &size_)) { - return Status::OK(); - } else { - return Status::Corruption("bad block handle"); - } -} - -void Footer::EncodeTo(std::string* dst) const { -#ifndef NDEBUG - const size_t original_size = dst->size(); -#endif - metaindex_handle_.EncodeTo(dst); - index_handle_.EncodeTo(dst); - dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding - PutFixed32(dst, static_cast(kTableMagicNumber & 0xffffffffu)); - PutFixed32(dst, static_cast(kTableMagicNumber >> 32)); - assert(dst->size() == original_size + kEncodedLength); -} - -Status Footer::DecodeFrom(Slice* input) { - const char* magic_ptr = input->data() + kEncodedLength - 8; - const uint32_t magic_lo = DecodeFixed32(magic_ptr); - const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); - const uint64_t magic = ((static_cast(magic_hi) << 32) | - (static_cast(magic_lo))); - if (magic != kTableMagicNumber) { - return Status::InvalidArgument("not an sstable (bad magic number)"); - } - - Status result = metaindex_handle_.DecodeFrom(input); - if (result.ok()) { - result = index_handle_.DecodeFrom(input); - } - if (result.ok()) { - // We skip over any leftover data (just padding for now) in "input" - const char* end = magic_ptr + 8; - *input = Slice(end, input->data() + input->size() - end); - } - return result; -} - -Status ReadBlock(RandomAccessFile* file, - const ReadOptions& options, - const BlockHandle& handle, - Block** block) { - *block = NULL; - - // Read the block contents as well as the type/crc footer. - // See table_builder.cc for the code that built this structure. 
- size_t n = static_cast(handle.size()); - char* buf = new char[n + kBlockTrailerSize]; - Slice contents; - Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); - if (!s.ok()) { - delete[] buf; - return s; - } - if (contents.size() != n + kBlockTrailerSize) { - delete[] buf; - return Status::Corruption("truncated block read"); - } - - // Check the crc of the type and the block contents - const char* data = contents.data(); // Pointer to where Read put the data - if (options.verify_checksums) { - const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1)); - const uint32_t actual = crc32c::Value(data, n + 1); - if (actual != crc) { - delete[] buf; - s = Status::Corruption("block checksum mismatch"); - return s; - } - } - - switch (data[n]) { - case kNoCompression: - if (data != buf) { - // File implementation gave us pointer to some other data. - // Copy into buf[]. - memcpy(buf, data, n + kBlockTrailerSize); - } - - // Ok - break; - case kSnappyCompression: { - std::string decompressed; - if (!port::Snappy_Uncompress(data, n, &decompressed)) { - delete[] buf; - s = Status::Corruption("corrupted compressed block contents"); - return s; - } - delete[] buf; // Done with uncompressed data - buf = new char[decompressed.size()]; - memcpy(buf, decompressed.data(), decompressed.size()); - n = decompressed.size(); - break; - } - default: - delete[] buf; - return Status::Corruption("bad block type"); - } - - *block = new Block(buf, n); // Block takes ownership of buf[] - return Status::OK(); -} - -} diff --git a/leveldb/table/format.h b/leveldb/table/format.h deleted file mode 100644 index a6ab964..0000000 --- a/leveldb/table/format.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_ -#define STORAGE_LEVELDB_TABLE_FORMAT_H_ - -#include -#include -#include "leveldb/slice.h" -#include "leveldb/status.h" -#include "leveldb/table_builder.h" - -namespace leveldb { - -class Block; -class RandomAccessFile; -struct ReadOptions; - -// BlockHandle is a pointer to the extent of a file that stores a data -// block or a meta block. -class BlockHandle { - public: - BlockHandle(); - - // The offset of the block in the file. - uint64_t offset() const { return offset_; } - void set_offset(uint64_t offset) { offset_ = offset; } - - // The size of the stored block - uint64_t size() const { return size_; } - void set_size(uint64_t size) { size_ = size; } - - void EncodeTo(std::string* dst) const; - Status DecodeFrom(Slice* input); - - // Maximum encoding length of a BlockHandle - enum { kMaxEncodedLength = 10 + 10 }; - - private: - uint64_t offset_; - uint64_t size_; -}; - -// Footer encapsulates the fixed information stored at the tail -// end of every table file. -class Footer { - public: - Footer() { } - - // The block handle for the metaindex block of the table - const BlockHandle& metaindex_handle() const { return metaindex_handle_; } - void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } - - // The block handle for the index block of the table - const BlockHandle& index_handle() const { - return index_handle_; - } - void set_index_handle(const BlockHandle& h) { - index_handle_ = h; - } - - void EncodeTo(std::string* dst) const; - Status DecodeFrom(Slice* input); - - // Encoded length of a Footer. 
Note that the serialization of a - // Footer will always occupy exactly this many bytes. It consists - // of two block handles and a magic number. - enum { - kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8 - }; - - private: - BlockHandle metaindex_handle_; - BlockHandle index_handle_; -}; - -// kTableMagicNumber was picked by running -// echo http://code.google.com/p/leveldb/ | sha1sum -// and taking the leading 64 bits. -static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; - -// 1-byte type + 32-bit crc -static const size_t kBlockTrailerSize = 5; - -// Read the block identified by "handle" from "file". On success, -// store a pointer to the heap-allocated result in *block and return -// OK. On failure store NULL in *block and return non-OK. -extern Status ReadBlock(RandomAccessFile* file, - const ReadOptions& options, - const BlockHandle& handle, - Block** block); - -// Implementation details follow. Clients should ignore, - -inline BlockHandle::BlockHandle() - : offset_(~static_cast(0)), - size_(~static_cast(0)) { -} - -} - -#endif // STORAGE_LEVELDB_TABLE_FORMAT_H_ diff --git a/leveldb/table/iterator.cc b/leveldb/table/iterator.cc deleted file mode 100644 index 4ddd55f..0000000 --- a/leveldb/table/iterator.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/iterator.h" -#include "util/logging.h" - -namespace leveldb { - -Iterator::Iterator() { - cleanup_.function = NULL; - cleanup_.next = NULL; -} - -Iterator::~Iterator() { - if (cleanup_.function != NULL) { - (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); - for (Cleanup* c = cleanup_.next; c != NULL; ) { - (*c->function)(c->arg1, c->arg2); - Cleanup* next = c->next; - delete c; - c = next; - } - } -} - -void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { - assert(func != NULL); - Cleanup* c; - if (cleanup_.function == NULL) { - c = &cleanup_; - } else { - c = new Cleanup; - c->next = cleanup_.next; - cleanup_.next = c; - } - c->function = func; - c->arg1 = arg1; - c->arg2 = arg2; -} - -namespace { -class EmptyIterator : public Iterator { - public: - EmptyIterator(const Status& s) : status_(s) { } - virtual bool Valid() const { return false; } - virtual void Seek(const Slice& target) { } - virtual void SeekToFirst() { } - virtual void SeekToLast() { } - virtual void Next() { assert(false); } - virtual void Prev() { assert(false); } - Slice key() const { assert(false); return Slice(); } - Slice value() const { assert(false); return Slice(); } - virtual Status status() const { return status_; } - private: - Status status_; -}; -} - -Iterator* NewEmptyIterator() { - return new EmptyIterator(Status::OK()); -} - -Iterator* NewErrorIterator(const Status& status) { - return new EmptyIterator(status); -} - -} diff --git a/leveldb/table/iterator_wrapper.h b/leveldb/table/iterator_wrapper.h deleted file mode 100644 index 158d3a7..0000000 --- a/leveldb/table/iterator_wrapper.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
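
As an aside on the cleanup hooks in iterator.cc above: the idiom is a tiny intrusive list in which the first hook occupies an inline slot and later hooks become heap nodes, all run exactly once on destruction. A standalone sketch of the same pattern follows; CleanupOwner and FreeBlock are hypothetical names, not part of the patch.

#include <cstddef>

typedef void (*CleanupFunction)(void* arg1, void* arg2);

class CleanupOwner {
 private:
  struct Cleanup { CleanupFunction function; void* arg1; void* arg2; Cleanup* next; };
  Cleanup head_;
 public:
  CleanupOwner() { head_.function = NULL; head_.next = NULL; }
  ~CleanupOwner() {
    if (head_.function != NULL) {
      (*head_.function)(head_.arg1, head_.arg2);
      for (Cleanup* c = head_.next; c != NULL; ) {
        (*c->function)(c->arg1, c->arg2);
        Cleanup* next = c->next;
        delete c;
        c = next;
      }
    }
  }
  void RegisterCleanup(CleanupFunction f, void* a1, void* a2) {
    Cleanup* c;
    if (head_.function == NULL) {
      c = &head_;       // first registration: inline slot, no allocation
    } else {
      c = new Cleanup;  // later registrations: prepend a heap node
      c->next = head_.next;
      head_.next = c;
    }
    c->function = f; c->arg1 = a1; c->arg2 = a2;
  }
};

static void FreeBlock(void* arg1, void*) { delete[] static_cast<char*>(arg1); }

int main() {
  CleanupOwner owner;
  owner.RegisterCleanup(&FreeBlock, new char[4096], NULL);  // freed with owner
  return 0;
}

This is the mechanism the table code below relies on to tie a heap-allocated block (or a cache handle) to the iterator that reads it, so callers never manage the block's lifetime directly.
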
- -#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ -#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ - -namespace leveldb { - -// A internal wrapper class with an interface similar to Iterator that -// caches the valid() and key() results for an underlying iterator. -// This can help avoid virtual function calls and also gives better -// cache locality. -class IteratorWrapper { - private: - Iterator* iter_; - bool valid_; - Slice key_; - public: - IteratorWrapper(): iter_(NULL), valid_(false) { } - explicit IteratorWrapper(Iterator* iter): iter_(NULL) { - Set(iter); - } - ~IteratorWrapper() { delete iter_; } - Iterator* iter() const { return iter_; } - - // Takes ownership of "iter" and will delete it when destroyed, or - // when Set() is invoked again. - void Set(Iterator* iter) { - delete iter_; - iter_ = iter; - if (iter_ == NULL) { - valid_ = false; - } else { - Update(); - } - } - - - // Iterator interface methods - bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } - Slice value() const { assert(Valid()); return iter_->value(); } - // Methods below require iter() != NULL - Status status() const { assert(iter_); return iter_->status(); } - void Next() { assert(iter_); iter_->Next(); Update(); } - void Prev() { assert(iter_); iter_->Prev(); Update(); } - void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } - void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } - void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } - - private: - void Update() { - valid_ = iter_->Valid(); - if (valid_) { - key_ = iter_->key(); - } - } -}; - -} - - -#endif // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ diff --git a/leveldb/table/merger.cc b/leveldb/table/merger.cc deleted file mode 100644 index 6ce06bb..0000000 --- a/leveldb/table/merger.cc +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/merger.h" - -#include "leveldb/comparator.h" -#include "leveldb/iterator.h" -#include "table/iterator_wrapper.h" - -namespace leveldb { - -namespace { -class MergingIterator : public Iterator { - public: - MergingIterator(const Comparator* comparator, Iterator** children, int n) - : comparator_(comparator), - children_(new IteratorWrapper[n]), - n_(n), - current_(NULL), - direction_(kForward) { - for (int i = 0; i < n; i++) { - children_[i].Set(children[i]); - } - } - - virtual ~MergingIterator() { - delete[] children_; - } - - virtual bool Valid() const { - return (current_ != NULL); - } - - virtual void SeekToFirst() { - for (int i = 0; i < n_; i++) { - children_[i].SeekToFirst(); - } - FindSmallest(); - direction_ = kForward; - } - - virtual void SeekToLast() { - for (int i = 0; i < n_; i++) { - children_[i].SeekToLast(); - } - FindLargest(); - direction_ = kReverse; - } - - virtual void Seek(const Slice& target) { - for (int i = 0; i < n_; i++) { - children_[i].Seek(target); - } - FindSmallest(); - direction_ = kForward; - } - - virtual void Next() { - assert(Valid()); - - // Ensure that all children are positioned after key(). - // If we are moving in the forward direction, it is already - // true for all of the non-current_ children since current_ is - // the smallest child and key() == current_->key(). Otherwise, - // we explicitly position the non-current_ children. 
- if (direction_ != kForward) { - for (int i = 0; i < n_; i++) { - IteratorWrapper* child = &children_[i]; - if (child != current_) { - child->Seek(key()); - if (child->Valid() && - comparator_->Compare(key(), child->key()) == 0) { - child->Next(); - } - } - } - direction_ = kForward; - } - - current_->Next(); - FindSmallest(); - } - - virtual void Prev() { - assert(Valid()); - - // Ensure that all children are positioned before key(). - // If we are moving in the reverse direction, it is already - // true for all of the non-current_ children since current_ is - // the largest child and key() == current_->key(). Otherwise, - // we explicitly position the non-current_ children. - if (direction_ != kReverse) { - for (int i = 0; i < n_; i++) { - IteratorWrapper* child = &children_[i]; - if (child != current_) { - child->Seek(key()); - if (child->Valid()) { - // Child is at first entry >= key(). Step back one to be < key() - child->Prev(); - } else { - // Child has no entries >= key(). Position at last entry. - child->SeekToLast(); - } - } - } - direction_ = kReverse; - } - - current_->Prev(); - FindLargest(); - } - - virtual Slice key() const { - assert(Valid()); - return current_->key(); - } - - virtual Slice value() const { - assert(Valid()); - return current_->value(); - } - - virtual Status status() const { - Status status; - for (int i = 0; i < n_; i++) { - status = children_[i].status(); - if (!status.ok()) { - break; - } - } - return status; - } - - private: - void FindSmallest(); - void FindLargest(); - - // We might want to use a heap in case there are lots of children. - // For now we use a simple array since we expect a very small number - // of children in leveldb. - const Comparator* comparator_; - IteratorWrapper* children_; - int n_; - IteratorWrapper* current_; - - // Which direction is the iterator moving? - enum Direction { - kForward, - kReverse - }; - Direction direction_; -}; - -void MergingIterator::FindSmallest() { - IteratorWrapper* smallest = NULL; - for (int i = 0; i < n_; i++) { - IteratorWrapper* child = &children_[i]; - if (child->Valid()) { - if (smallest == NULL) { - smallest = child; - } else if (comparator_->Compare(child->key(), smallest->key()) < 0) { - smallest = child; - } - } - } - current_ = smallest; -} - -void MergingIterator::FindLargest() { - IteratorWrapper* largest = NULL; - for (int i = n_-1; i >= 0; i--) { - IteratorWrapper* child = &children_[i]; - if (child->Valid()) { - if (largest == NULL) { - largest = child; - } else if (comparator_->Compare(child->key(), largest->key()) > 0) { - largest = child; - } - } - } - current_ = largest; -} -} - -Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { - assert(n >= 0); - if (n == 0) { - return NewEmptyIterator(); - } else if (n == 1) { - return list[0]; - } else { - return new MergingIterator(cmp, list, n); - } -} - -} diff --git a/leveldb/table/merger.h b/leveldb/table/merger.h deleted file mode 100644 index 71d9dc5..0000000 --- a/leveldb/table/merger.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_ -#define STORAGE_LEVELDB_TABLE_MERGER_H_ - -namespace leveldb { - -class Comparator; -class Iterator; - -// Return an iterator that provided the union of the data in -// children[0,n-1]. 
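
(As an illustration of that merge loop, and not part of the patch: the toy program below merges three sorted vectors with the same linear pick-the-smallest scan as FindSmallest() above, and, like the real iterator, performs no duplicate suppression.

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> a, b, c;
  a.push_back("apple");  a.push_back("melon");
  b.push_back("banana"); b.push_back("melon");   // duplicate key on purpose
  c.push_back("cherry");

  const std::vector<std::string>* child[3] = { &a, &b, &c };
  size_t pos[3] = { 0, 0, 0 };

  for (;;) {
    int smallest = -1;
    for (int i = 0; i < 3; i++) {  // linear FindSmallest, as above
      if (pos[i] < child[i]->size() &&
          (smallest < 0 ||
           (*child[i])[pos[i]] < (*child[smallest])[pos[smallest]])) {
        smallest = i;
      }
    }
    if (smallest < 0) break;       // all children exhausted
    std::printf("%s\n", (*child[smallest])[pos[smallest]].c_str());
    pos[smallest]++;               // advance only the yielded child
  }
  return 0;
}

The output is apple, banana, cherry, melon, melon: the duplicated key appears once per child that holds it.)
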
Takes ownership of the child iterators and -// will delete them when the result iterator is deleted. -// -// The result does no duplicate suppression. I.e., if a particular -// key is present in K child iterators, it will be yielded K times. -// -// REQUIRES: n >= 0 -extern Iterator* NewMergingIterator( - const Comparator* comparator, Iterator** children, int n); - -} - -#endif // STORAGE_LEVELDB_TABLE_MERGER_H_ diff --git a/leveldb/table/table.cc b/leveldb/table/table.cc deleted file mode 100644 index 9820753..0000000 --- a/leveldb/table/table.cc +++ /dev/null @@ -1,175 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/table.h" - -#include "leveldb/cache.h" -#include "leveldb/env.h" -#include "table/block.h" -#include "table/format.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" - -namespace leveldb { - -struct Table::Rep { - ~Rep() { - delete index_block; - } - - Options options; - Status status; - RandomAccessFile* file; - uint64_t cache_id; - - BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer - Block* index_block; -}; - -Status Table::Open(const Options& options, - RandomAccessFile* file, - uint64_t size, - Table** table) { - *table = NULL; - if (size < Footer::kEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); - } - - char footer_space[Footer::kEncodedLength]; - Slice footer_input; - Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength, - &footer_input, footer_space); - if (!s.ok()) return s; - - Footer footer; - s = footer.DecodeFrom(&footer_input); - if (!s.ok()) return s; - - // Read the index block - Block* index_block = NULL; - if (s.ok()) { - s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block); - } - - if (s.ok()) { - // We've successfully read the footer and the index block: we're - // ready to serve requests. - Rep* rep = new Table::Rep; - rep->options = options; - rep->file = file; - rep->metaindex_handle = footer.metaindex_handle(); - rep->index_block = index_block; - rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0); - *table = new Table(rep); - } else { - if (index_block) delete index_block; - } - - return s; -} - -Table::~Table() { - delete rep_; -} - -static void DeleteBlock(void* arg, void* ignored) { - delete reinterpret_cast(arg); -} - -static void DeleteCachedBlock(const Slice& key, void* value) { - Block* block = reinterpret_cast(value); - delete block; -} - -static void ReleaseBlock(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - -// Convert an index iterator value (i.e., an encoded BlockHandle) -// into an iterator over the contents of the corresponding block. -Iterator* Table::BlockReader(void* arg, - const ReadOptions& options, - const Slice& index_value) { - Table* table = reinterpret_cast(arg); - Cache* block_cache = table->rep_->options.block_cache; - Block* block = NULL; - Cache::Handle* cache_handle = NULL; - - BlockHandle handle; - Slice input = index_value; - Status s = handle.DecodeFrom(&input); - // We intentionally allow extra stuff in index_value so that we - // can add more features in the future. 
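
To make the Table::Open API above concrete, here is a hypothetical command-line caller, a sketch under the include layout of this tree with error handling and tuning trimmed; it is not part of the patch.

#include <stdint.h>
#include <cstdio>
#include <string>

#include "leveldb/env.h"
#include "leveldb/iterator.h"
#include "leveldb/options.h"
#include "leveldb/status.h"
#include "leveldb/table.h"

// Count the entries in one sstable file named on the command line.
int main(int argc, char** argv) {
  if (argc < 2) return 1;
  leveldb::Env* env = leveldb::Env::Default();
  std::string fname = argv[1];

  leveldb::RandomAccessFile* file = NULL;
  uint64_t size = 0;
  if (!env->NewRandomAccessFile(fname, &file).ok() ||
      !env->GetFileSize(fname, &size).ok()) {
    return 1;
  }

  leveldb::Table* table = NULL;
  leveldb::Options options;  // comparator must match the one used by the writer
  leveldb::Status s = leveldb::Table::Open(options, file, size, &table);
  if (!s.ok()) { delete file; return 1; }

  leveldb::Iterator* it = table->NewIterator(leveldb::ReadOptions());
  uint64_t n = 0;
  for (it->SeekToFirst(); it->Valid(); it->Next()) n++;
  std::printf("%llu entries\n", (unsigned long long)n);

  delete it;      // iterator first, then the table, then the file,
  delete table;   // since each layer reads through the one below it
  delete file;
  return 0;
}
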
- - if (s.ok()) { - if (block_cache != NULL) { - char cache_key_buffer[16]; - EncodeFixed64(cache_key_buffer, table->rep_->cache_id); - EncodeFixed64(cache_key_buffer+8, handle.offset()); - Slice key(cache_key_buffer, sizeof(cache_key_buffer)); - cache_handle = block_cache->Lookup(key); - if (cache_handle != NULL) { - block = reinterpret_cast(block_cache->Value(cache_handle)); - } else { - s = ReadBlock(table->rep_->file, options, handle, &block); - if (s.ok() && options.fill_cache) { - cache_handle = block_cache->Insert( - key, block, block->size(), &DeleteCachedBlock); - } - } - } else { - s = ReadBlock(table->rep_->file, options, handle, &block); - } - } - - Iterator* iter; - if (block != NULL) { - iter = block->NewIterator(table->rep_->options.comparator); - if (cache_handle == NULL) { - iter->RegisterCleanup(&DeleteBlock, block, NULL); - } else { - iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle); - } - } else { - iter = NewErrorIterator(s); - } - return iter; -} - -Iterator* Table::NewIterator(const ReadOptions& options) const { - return NewTwoLevelIterator( - rep_->index_block->NewIterator(rep_->options.comparator), - &Table::BlockReader, const_cast(this), options); -} - -uint64_t Table::ApproximateOffsetOf(const Slice& key) const { - Iterator* index_iter = - rep_->index_block->NewIterator(rep_->options.comparator); - index_iter->Seek(key); - uint64_t result; - if (index_iter->Valid()) { - BlockHandle handle; - Slice input = index_iter->value(); - Status s = handle.DecodeFrom(&input); - if (s.ok()) { - result = handle.offset(); - } else { - // Strange: we can't decode the block handle in the index block. - // We'll just return the offset of the metaindex block, which is - // close to the whole file size for this case. - result = rep_->metaindex_handle.offset(); - } - } else { - // key is past the last key in the file. Approximate the offset - // by returning the offset of the metaindex block (which is - // right near the end of the file). - result = rep_->metaindex_handle.offset(); - } - delete index_iter; - return result; -} - -} diff --git a/leveldb/table/table_builder.cc b/leveldb/table/table_builder.cc deleted file mode 100644 index 7ec7ad2..0000000 --- a/leveldb/table/table_builder.cc +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/table_builder.h" - -#include -#include -#include "leveldb/comparator.h" -#include "leveldb/env.h" -#include "table/block_builder.h" -#include "table/format.h" -#include "util/coding.h" -#include "util/crc32c.h" -#include "util/logging.h" - -namespace leveldb { - -struct TableBuilder::Rep { - Options options; - Options index_block_options; - WritableFile* file; - uint64_t offset; - Status status; - BlockBuilder data_block; - BlockBuilder index_block; - std::string last_key; - int64_t num_entries; - bool closed; // Either Finish() or Abandon() has been called. - - // We do not emit the index entry for a block until we have seen the - // first key for the next data block. This allows us to use shorter - // keys in the index block. For example, consider a block boundary - // between the keys "the quick brown fox" and "the who". We can use - // "the r" as the key for the index block entry since it is >= all - // entries in the first block and < all entries in subsequent - // blocks. 
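
A standalone sketch of the bytewise separator logic that produces "the r" in the worked example above; ShortestSeparator is a hypothetical stand-in for the comparator's FindShortestSeparator, not part of the patch.

#include <cassert>
#include <cstddef>
#include <string>

// Shrink *start to a short key that is still >= every key in the finished
// block and < every key ("limit") in the next one.
static void ShortestSeparator(std::string* start, const std::string& limit) {
  size_t diff = 0;
  const size_t min_len =
      (start->size() < limit.size()) ? start->size() : limit.size();
  while (diff < min_len && (*start)[diff] == limit[diff]) diff++;
  if (diff >= min_len) return;  // one is a prefix of the other; leave as-is
  unsigned char byte = static_cast<unsigned char>((*start)[diff]);
  if (byte < 0xff && byte + 1 < static_cast<unsigned char>(limit[diff])) {
    (*start)[diff] = static_cast<char>(byte + 1);  // bump first differing byte
    start->resize(diff + 1);                       // and drop the rest
    assert(*start < limit);
  }
}

int main() {
  std::string k = "the quick brown fox";
  ShortestSeparator(&k, "the who");
  assert(k == "the r");  // matches the worked example in the comment above
  return 0;
}

Shorter separators keep the index block small, and index keys are consulted on every lookup, so the saving compounds.
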
- // - // Invariant: r->pending_index_entry is true only if data_block is empty. - bool pending_index_entry; - BlockHandle pending_handle; // Handle to add to index block - - std::string compressed_output; - - Rep(const Options& opt, WritableFile* f) - : options(opt), - index_block_options(opt), - file(f), - offset(0), - data_block(&options), - index_block(&index_block_options), - num_entries(0), - closed(false), - pending_index_entry(false) { - index_block_options.block_restart_interval = 1; - } -}; - -TableBuilder::TableBuilder(const Options& options, WritableFile* file) - : rep_(new Rep(options, file)) { -} - -TableBuilder::~TableBuilder() { - assert(rep_->closed); // Catch errors where caller forgot to call Finish() - delete rep_; -} - -Status TableBuilder::ChangeOptions(const Options& options) { - // Note: if more fields are added to Options, update - // this function to catch changes that should not be allowed to - // change in the middle of building a Table. - if (options.comparator != rep_->options.comparator) { - return Status::InvalidArgument("changing comparator while building table"); - } - - // Note that any live BlockBuilders point to rep_->options and therefore - // will automatically pick up the updated options. - rep_->options = options; - rep_->index_block_options = options; - rep_->index_block_options.block_restart_interval = 1; - return Status::OK(); -} - -void TableBuilder::Add(const Slice& key, const Slice& value) { - Rep* r = rep_; - assert(!r->closed); - if (!ok()) return; - if (r->num_entries > 0) { - assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); - } - - if (r->pending_index_entry) { - assert(r->data_block.empty()); - r->options.comparator->FindShortestSeparator(&r->last_key, key); - std::string handle_encoding; - r->pending_handle.EncodeTo(&handle_encoding); - r->index_block.Add(r->last_key, Slice(handle_encoding)); - r->pending_index_entry = false; - } - - r->last_key.assign(key.data(), key.size()); - r->num_entries++; - r->data_block.Add(key, value); - - const size_t estimated_block_size = r->data_block.CurrentSizeEstimate(); - if (estimated_block_size >= r->options.block_size) { - Flush(); - } -} - -void TableBuilder::Flush() { - Rep* r = rep_; - assert(!r->closed); - if (!ok()) return; - if (r->data_block.empty()) return; - assert(!r->pending_index_entry); - WriteBlock(&r->data_block, &r->pending_handle); - if (ok()) { - r->pending_index_entry = true; - r->status = r->file->Flush(); - } -} - -void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { - // File format contains a sequence of blocks where each block has: - // block_data: uint8[n] - // type: uint8 - // crc: uint32 - assert(ok()); - Rep* r = rep_; - Slice raw = block->Finish(); - - Slice block_contents; - CompressionType type = r->options.compression; - // TODO(postrelease): Support more compression options: zlib? 
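
Before the switch below chooses a block type, note the acceptance rule applied in its snappy case: the compressed form is kept only when it saves at least one eighth of the raw size. A standalone sketch of that arithmetic, with WorthStoringCompressed as a hypothetical name:

#include <cassert>
#include <cstddef>

// Keep the compressed block only if it is smaller than 7/8 of the raw
// size; otherwise the raw bytes are stored with type kNoCompression.
static bool WorthStoringCompressed(size_t raw, size_t compressed) {
  return compressed < raw - (raw / 8u);
}

int main() {
  assert(WorthStoringCompressed(4096, 3583));   // saved > 12.5%: keep it
  assert(!WorthStoringCompressed(4096, 3584));  // exactly 12.5%: not enough
  assert(!WorthStoringCompressed(4096, 4000));  // barely compressed: store raw
  return 0;
}
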
- switch (type) { - case kNoCompression: - block_contents = raw; - break; - - case kSnappyCompression: { - std::string* compressed = &r->compressed_output; - if (port::Snappy_Compress(raw.data(), raw.size(), compressed) && - compressed->size() < raw.size() - (raw.size() / 8u)) { - block_contents = *compressed; - } else { - // Snappy not supported, or compressed less than 12.5%, so just - // store uncompressed form - block_contents = raw; - type = kNoCompression; - } - break; - } - } - handle->set_offset(r->offset); - handle->set_size(block_contents.size()); - r->status = r->file->Append(block_contents); - if (r->status.ok()) { - char trailer[kBlockTrailerSize]; - trailer[0] = type; - uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size()); - crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type - EncodeFixed32(trailer+1, crc32c::Mask(crc)); - r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); - if (r->status.ok()) { - r->offset += block_contents.size() + kBlockTrailerSize; - } - } - r->compressed_output.clear(); - block->Reset(); -} - -Status TableBuilder::status() const { - return rep_->status; -} - -Status TableBuilder::Finish() { - Rep* r = rep_; - Flush(); - assert(!r->closed); - r->closed = true; - BlockHandle metaindex_block_handle; - BlockHandle index_block_handle; - if (ok()) { - BlockBuilder meta_index_block(&r->options); - // TODO(postrelease): Add stats and other meta blocks - WriteBlock(&meta_index_block, &metaindex_block_handle); - } - if (ok()) { - if (r->pending_index_entry) { - r->options.comparator->FindShortSuccessor(&r->last_key); - std::string handle_encoding; - r->pending_handle.EncodeTo(&handle_encoding); - r->index_block.Add(r->last_key, Slice(handle_encoding)); - r->pending_index_entry = false; - } - WriteBlock(&r->index_block, &index_block_handle); - } - if (ok()) { - Footer footer; - footer.set_metaindex_handle(metaindex_block_handle); - footer.set_index_handle(index_block_handle); - std::string footer_encoding; - footer.EncodeTo(&footer_encoding); - r->status = r->file->Append(footer_encoding); - if (r->status.ok()) { - r->offset += footer_encoding.size(); - } - } - return r->status; -} - -void TableBuilder::Abandon() { - Rep* r = rep_; - assert(!r->closed); - r->closed = true; -} - -uint64_t TableBuilder::NumEntries() const { - return rep_->num_entries; -} - -uint64_t TableBuilder::FileSize() const { - return rep_->offset; -} - -} diff --git a/leveldb/table/table_test.cc b/leveldb/table/table_test.cc deleted file mode 100644 index 4b3e85e..0000000 --- a/leveldb/table/table_test.cc +++ /dev/null @@ -1,841 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/table.h" - -#include -#include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "leveldb/table_builder.h" -#include "table/block.h" -#include "table/block_builder.h" -#include "table/format.h" -#include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -// Return reverse of "key". -// Used to test non-lexicographic comparators. 
-static std::string Reverse(const Slice& key) { - std::string str(key.ToString()); - std::string rev(str.rbegin(), str.rend()); - return rev; -} - -namespace { -class ReverseKeyComparator : public Comparator { - public: - virtual const char* Name() const { - return "leveldb.ReverseBytewiseComparator"; - } - - virtual int Compare(const Slice& a, const Slice& b) const { - return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); - } - - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const { - std::string s = Reverse(*start); - std::string l = Reverse(limit); - BytewiseComparator()->FindShortestSeparator(&s, l); - *start = Reverse(s); - } - - virtual void FindShortSuccessor(std::string* key) const { - std::string s = Reverse(*key); - BytewiseComparator()->FindShortSuccessor(&s); - *key = Reverse(s); - } -}; -} -static ReverseKeyComparator reverse_key_comparator; - -static void Increment(const Comparator* cmp, std::string* key) { - if (cmp == BytewiseComparator()) { - key->push_back('\0'); - } else { - assert(cmp == &reverse_key_comparator); - std::string rev = Reverse(*key); - rev.push_back('\0'); - *key = Reverse(rev); - } -} - -// An STL comparator that uses a Comparator -namespace { -struct STLLessThan { - const Comparator* cmp; - - STLLessThan() : cmp(BytewiseComparator()) { } - STLLessThan(const Comparator* c) : cmp(c) { } - bool operator()(const std::string& a, const std::string& b) const { - return cmp->Compare(Slice(a), Slice(b)) < 0; - } -}; -} - -class StringSink: public WritableFile { - public: - ~StringSink() { } - - const std::string& contents() const { return contents_; } - - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } - - virtual Status Append(const Slice& data) { - contents_.append(data.data(), data.size()); - return Status::OK(); - } - - private: - std::string contents_; -}; - - -class StringSource: public RandomAccessFile { - public: - StringSource(const Slice& contents) - : contents_(contents.data(), contents.size()) { - } - - virtual ~StringSource() { } - - uint64_t Size() const { return contents_.size(); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - if (offset > contents_.size()) { - return Status::InvalidArgument("invalid Read offset"); - } - if (offset + n > contents_.size()) { - n = contents_.size() - offset; - } - memcpy(scratch, &contents_[offset], n); - *result = Slice(scratch, n); - return Status::OK(); - } - - private: - std::string contents_; -}; - -typedef std::map KVMap; - -// Helper class for tests to unify the interface between -// BlockBuilder/TableBuilder and Block/Table. -class Constructor { - public: - explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { } - virtual ~Constructor() { } - - void Add(const std::string& key, const Slice& value) { - data_[key] = value.ToString(); - } - - // Finish constructing the data structure with all the keys that have - // been added so far. 
Returns the keys in sorted order in "*keys" - // and stores the key/value pairs in "*kvmap" - void Finish(const Options& options, - std::vector* keys, - KVMap* kvmap) { - *kvmap = data_; - keys->clear(); - for (KVMap::const_iterator it = data_.begin(); - it != data_.end(); - ++it) { - keys->push_back(it->first); - } - data_.clear(); - Status s = FinishImpl(options, *kvmap); - ASSERT_TRUE(s.ok()) << s.ToString(); - } - - // Construct the data structure from the data in "data" - virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; - - virtual size_t NumBytes() const = 0; - - virtual Iterator* NewIterator() const = 0; - - virtual const KVMap& data() { return data_; } - - virtual DB* db() const { return NULL; } // Overridden in DBConstructor - - private: - KVMap data_; -}; - -class BlockConstructor: public Constructor { - public: - explicit BlockConstructor(const Comparator* cmp) - : Constructor(cmp), - comparator_(cmp), - block_size_(-1), - block_(NULL) { } - ~BlockConstructor() { - delete block_; - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete block_; - block_ = NULL; - BlockBuilder builder(&options); - - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - builder.Add(it->first, it->second); - } - // Open the block - Slice block_data = builder.Finish(); - block_size_ = block_data.size(); - char* block_data_copy = new char[block_size_]; - memcpy(block_data_copy, block_data.data(), block_size_); - block_ = new Block(block_data_copy, block_size_); - return Status::OK(); - } - virtual size_t NumBytes() const { return block_size_; } - - virtual Iterator* NewIterator() const { - return block_->NewIterator(comparator_); - } - - private: - const Comparator* comparator_; - int block_size_; - Block* block_; - - BlockConstructor(); -}; - -class TableConstructor: public Constructor { - public: - TableConstructor(const Comparator* cmp) - : Constructor(cmp), - source_(NULL), table_(NULL) { - } - ~TableConstructor() { - Reset(); - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - Reset(); - StringSink sink; - TableBuilder builder(options, &sink); - - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - builder.Add(it->first, it->second); - ASSERT_TRUE(builder.status().ok()); - } - Status s = builder.Finish(); - ASSERT_TRUE(s.ok()) << s.ToString(); - - ASSERT_EQ(sink.contents().size(), builder.FileSize()); - - // Open the table - source_ = new StringSource(sink.contents()); - Options table_options; - table_options.comparator = options.comparator; - return Table::Open(table_options, source_, sink.contents().size(), &table_); - } - virtual size_t NumBytes() const { return source_->Size(); } - - virtual Iterator* NewIterator() const { - return table_->NewIterator(ReadOptions()); - } - - uint64_t ApproximateOffsetOf(const Slice& key) const { - return table_->ApproximateOffsetOf(key); - } - - private: - void Reset() { - delete table_; - delete source_; - table_ = NULL; - source_ = NULL; - } - - StringSource* source_; - Table* table_; - - TableConstructor(); -}; - -// A helper class that converts internal format keys into user keys -class KeyConvertingIterator: public Iterator { - public: - explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } - virtual ~KeyConvertingIterator() { delete iter_; } - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& target) { - ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); - 
std::string encoded; - AppendInternalKey(&encoded, ikey); - iter_->Seek(encoded); - } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } - - virtual Slice key() const { - assert(Valid()); - ParsedInternalKey key; - if (!ParseInternalKey(iter_->key(), &key)) { - status_ = Status::Corruption("malformed internal key"); - return Slice("corrupted key"); - } - return key.user_key; - } - - virtual Slice value() const { return iter_->value(); } - virtual Status status() const { - return status_.ok() ? iter_->status() : status_; - } - - private: - mutable Status status_; - Iterator* iter_; - - // No copying allowed - KeyConvertingIterator(const KeyConvertingIterator&); - void operator=(const KeyConvertingIterator&); -}; - -class MemTableConstructor: public Constructor { - public: - explicit MemTableConstructor(const Comparator* cmp) - : Constructor(cmp), - internal_comparator_(cmp) { - memtable_ = new MemTable(internal_comparator_); - } - ~MemTableConstructor() { - delete memtable_; - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete memtable_; - memtable_ = new MemTable(internal_comparator_); - int seq = 1; - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - memtable_->Add(seq, kTypeValue, it->first, it->second); - seq++; - } - return Status::OK(); - } - virtual size_t NumBytes() const { - return memtable_->ApproximateMemoryUsage(); - } - - virtual Iterator* NewIterator() const { - return new KeyConvertingIterator(memtable_->NewIterator()); - } - - private: - InternalKeyComparator internal_comparator_; - MemTable* memtable_; -}; - -class DBConstructor: public Constructor { - public: - explicit DBConstructor(const Comparator* cmp) - : Constructor(cmp), - comparator_(cmp) { - db_ = NULL; - NewDB(); - } - ~DBConstructor() { - delete db_; - } - virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete db_; - db_ = NULL; - NewDB(); - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - WriteBatch batch; - batch.Put(it->first, it->second); - ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); - } - return Status::OK(); - } - virtual size_t NumBytes() const { - Range r("", "\xff\xff"); - uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); - return size; - } - - virtual Iterator* NewIterator() const { - return db_->NewIterator(ReadOptions()); - } - - virtual DB* db() const { return db_; } - - private: - void NewDB() { - std::string name = test::TmpDir() + "/table_testdb"; - - Options options; - options.comparator = comparator_; - Status status = DestroyDB(name, options); - ASSERT_TRUE(status.ok()) << status.ToString(); - - options.create_if_missing = true; - options.error_if_exists = true; - options.write_buffer_size = 10000; // Something small to force merging - status = DB::Open(options, name, &db_); - ASSERT_TRUE(status.ok()) << status.ToString(); - } - - const Comparator* comparator_; - DB* db_; -}; - -enum TestType { - TABLE_TEST, - BLOCK_TEST, - MEMTABLE_TEST, - DB_TEST, -}; - -struct TestArgs { - TestType type; - bool reverse_compare; - int restart_interval; -}; - -static const TestArgs kTestArgList[] = { - { TABLE_TEST, false, 16 }, - { TABLE_TEST, false, 1 }, - { TABLE_TEST, false, 1024 }, - { TABLE_TEST, true, 16 }, - { TABLE_TEST, true, 1 }, - { TABLE_TEST, true, 1024 }, - - { BLOCK_TEST, false, 16 }, - { BLOCK_TEST, false, 1 }, - { BLOCK_TEST, 
false, 1024 }, - { BLOCK_TEST, true, 16 }, - { BLOCK_TEST, true, 1 }, - { BLOCK_TEST, true, 1024 }, - - // Restart interval does not matter for memtables - { MEMTABLE_TEST, false, 16 }, - { MEMTABLE_TEST, true, 16 }, - - // Do not bother with restart interval variations for DB - { DB_TEST, false, 16 }, - { DB_TEST, true, 16 }, -}; -static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]); - -class Harness { - public: - Harness() : constructor_(NULL) { } - - void Init(const TestArgs& args) { - delete constructor_; - constructor_ = NULL; - options_ = Options(); - - options_.block_restart_interval = args.restart_interval; - // Use shorter block size for tests to exercise block boundary - // conditions more. - options_.block_size = 256; - if (args.reverse_compare) { - options_.comparator = &reverse_key_comparator; - } - switch (args.type) { - case TABLE_TEST: - constructor_ = new TableConstructor(options_.comparator); - break; - case BLOCK_TEST: - constructor_ = new BlockConstructor(options_.comparator); - break; - case MEMTABLE_TEST: - constructor_ = new MemTableConstructor(options_.comparator); - break; - case DB_TEST: - constructor_ = new DBConstructor(options_.comparator); - break; - } - } - - ~Harness() { - delete constructor_; - } - - void Add(const std::string& key, const std::string& value) { - constructor_->Add(key, value); - } - - void Test(Random* rnd) { - std::vector keys; - KVMap data; - constructor_->Finish(options_, &keys, &data); - - TestForwardScan(keys, data); - TestBackwardScan(keys, data); - TestRandomAccess(rnd, keys, data); - } - - void TestForwardScan(const std::vector& keys, - const KVMap& data) { - Iterator* iter = constructor_->NewIterator(); - ASSERT_TRUE(!iter->Valid()); - iter->SeekToFirst(); - for (KVMap::const_iterator model_iter = data.begin(); - model_iter != data.end(); - ++model_iter) { - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - iter->Next(); - } - ASSERT_TRUE(!iter->Valid()); - delete iter; - } - - void TestBackwardScan(const std::vector& keys, - const KVMap& data) { - Iterator* iter = constructor_->NewIterator(); - ASSERT_TRUE(!iter->Valid()); - iter->SeekToLast(); - for (KVMap::const_reverse_iterator model_iter = data.rbegin(); - model_iter != data.rend(); - ++model_iter) { - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - iter->Prev(); - } - ASSERT_TRUE(!iter->Valid()); - delete iter; - } - - void TestRandomAccess(Random* rnd, - const std::vector& keys, - const KVMap& data) { - static const bool kVerbose = false; - Iterator* iter = constructor_->NewIterator(); - ASSERT_TRUE(!iter->Valid()); - KVMap::const_iterator model_iter = data.begin(); - if (kVerbose) fprintf(stderr, "---\n"); - for (int i = 0; i < 200; i++) { - const int toss = rnd->Uniform(5); - switch (toss) { - case 0: { - if (iter->Valid()) { - if (kVerbose) fprintf(stderr, "Next\n"); - iter->Next(); - ++model_iter; - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - } - break; - } - - case 1: { - if (kVerbose) fprintf(stderr, "SeekToFirst\n"); - iter->SeekToFirst(); - model_iter = data.begin(); - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - break; - } - - case 2: { - std::string key = PickRandomKey(rnd, keys); - model_iter = data.lower_bound(key); - if (kVerbose) fprintf(stderr, "Seek '%s'\n", - EscapeString(key).c_str()); - iter->Seek(Slice(key)); - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - break; - } - - case 3: { - if (iter->Valid()) { - if (kVerbose) fprintf(stderr, "Prev\n"); - iter->Prev(); - if (model_iter 
== data.begin()) { - model_iter = data.end(); // Wrap around to invalid value - } else { - --model_iter; - } - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - } - break; - } - - case 4: { - if (kVerbose) fprintf(stderr, "SeekToLast\n"); - iter->SeekToLast(); - if (keys.empty()) { - model_iter = data.end(); - } else { - std::string last = data.rbegin()->first; - model_iter = data.lower_bound(last); - } - ASSERT_EQ(ToString(data, model_iter), ToString(iter)); - break; - } - } - } - delete iter; - } - - std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { - if (it == data.end()) { - return "END"; - } else { - return "'" + it->first + "->" + it->second + "'"; - } - } - - std::string ToString(const KVMap& data, - const KVMap::const_reverse_iterator& it) { - if (it == data.rend()) { - return "END"; - } else { - return "'" + it->first + "->" + it->second + "'"; - } - } - - std::string ToString(const Iterator* it) { - if (!it->Valid()) { - return "END"; - } else { - return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; - } - } - - std::string PickRandomKey(Random* rnd, const std::vector& keys) { - if (keys.empty()) { - return "foo"; - } else { - const int index = rnd->Uniform(keys.size()); - std::string result = keys[index]; - switch (rnd->Uniform(3)) { - case 0: - // Return an existing key - break; - case 1: { - // Attempt to return something smaller than an existing key - if (result.size() > 0 && result[result.size()-1] > '\0') { - result[result.size()-1]--; - } - break; - } - case 2: { - // Return something larger than an existing key - Increment(options_.comparator, &result); - break; - } - } - return result; - } - } - - // Returns NULL if not running against a DB - DB* db() const { return constructor_->db(); } - - private: - Options options_; - Constructor* constructor_; -}; - -// Test the empty key -TEST(Harness, SimpleEmptyKey) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 1); - Add("", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSingle) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 2); - Add("abc", "v"); - Test(&rnd); - } -} - -TEST(Harness, SimpleMulti) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 3); - Add("abc", "v"); - Add("abcd", "v"); - Add("ac", "v2"); - Test(&rnd); - } -} - -TEST(Harness, SimpleSpecialKey) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 4); - Add("\xff\xff", "v3"); - Test(&rnd); - } -} - -TEST(Harness, Randomized) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 5); - for (int num_entries = 0; num_entries < 2000; - num_entries += (num_entries < 50 ? 
1 : 200)) { - if ((num_entries % 10) == 0) { - fprintf(stderr, "case %d of %d: num_entries = %d\n", - (i + 1), int(kNumTestArgs), num_entries); - } - for (int e = 0; e < num_entries; e++) { - std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); - } - Test(&rnd); - } - } -} - -TEST(Harness, RandomizedLongDB) { - Random rnd(test::RandomSeed()); - TestArgs args = { DB_TEST, false, 16 }; - Init(args); - int num_entries = 100000; - for (int e = 0; e < num_entries; e++) { - std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); - } - Test(&rnd); - - // We must have created enough data to force merging - std::string l0_files, l1_files; - ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level0", &l0_files)); - ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level1", &l1_files)); - ASSERT_GT(atoi(l0_files.c_str()) + atoi(l1_files.c_str()), 0); - -} - -class MemTableTest { }; - -TEST(MemTableTest, Simple) { - InternalKeyComparator cmp(BytewiseComparator()); - MemTable memtable(cmp); - WriteBatch batch; - WriteBatchInternal::SetSequence(&batch, 100); - batch.Put(std::string("k1"), std::string("v1")); - batch.Put(std::string("k2"), std::string("v2")); - batch.Put(std::string("k3"), std::string("v3")); - batch.Put(std::string("largekey"), std::string("vlarge")); - ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok()); - - Iterator* iter = memtable.NewIterator(); - iter->SeekToFirst(); - while (iter->Valid()) { - fprintf(stderr, "key: '%s' -> '%s'\n", - iter->key().ToString().c_str(), - iter->value().ToString().c_str()); - iter->Next(); - } - - delete iter; -} - -static bool Between(uint64_t val, uint64_t low, uint64_t high) { - bool result = (val >= low) && (val <= high); - if (!result) { - fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", - (unsigned long long)(val), - (unsigned long long)(low), - (unsigned long long)(high)); - } - return result; -} - -class TableTest { }; - -TEST(TableTest, ApproximateOffsetOfPlain) { - TableConstructor c(BytewiseComparator()); - c.Add("k01", "hello"); - c.Add("k02", "hello2"); - c.Add("k03", std::string(10000, 'x')); - c.Add("k04", std::string(200000, 'x')); - c.Add("k05", std::string(300000, 'x')); - c.Add("k06", "hello3"); - c.Add("k07", std::string(100000, 'x')); - std::vector keys; - KVMap kvmap; - Options options; - options.block_size = 1024; - options.compression = kNoCompression; - c.Finish(options, &keys, &kvmap); - - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000)); - -} - -static bool SnappyCompressionSupported() { - std::string out; - Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Snappy_Compress(in.data(), in.size(), &out); -} - -TEST(TableTest, ApproximateOffsetOfCompressed) { - if (!SnappyCompressionSupported()) { - 
fprintf(stderr, "skipping compression tests\n"); - return; - } - - Random rnd(301); - TableConstructor c(BytewiseComparator()); - std::string tmp; - c.Add("k01", "hello"); - c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); - c.Add("k03", "hello3"); - c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); - std::vector keys; - KVMap kvmap; - Options options; - options.block_size = 1024; - options.compression = kSnappyCompression; - c.Finish(options, &keys, &kvmap); - - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/table/two_level_iterator.cc b/leveldb/table/two_level_iterator.cc deleted file mode 100644 index 24a1241..0000000 --- a/leveldb/table/two_level_iterator.cc +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/two_level_iterator.h" - -#include "leveldb/table.h" -#include "table/block.h" -#include "table/format.h" -#include "table/iterator_wrapper.h" - -namespace leveldb { - -namespace { - -typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&); - -class TwoLevelIterator: public Iterator { - public: - TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options); - - virtual ~TwoLevelIterator(); - - virtual void Seek(const Slice& target); - virtual void SeekToFirst(); - virtual void SeekToLast(); - virtual void Next(); - virtual void Prev(); - - virtual bool Valid() const { - return data_iter_.Valid(); - } - virtual Slice key() const { - assert(Valid()); - return data_iter_.key(); - } - virtual Slice value() const { - assert(Valid()); - return data_iter_.value(); - } - virtual Status status() const { - // It'd be nice if status() returned a const Status& instead of a Status - if (!index_iter_.status().ok()) { - return index_iter_.status(); - } else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) { - return data_iter_.status(); - } else { - return status_; - } - } - - private: - void SaveError(const Status& s) { - if (status_.ok() && !s.ok()) status_ = s; - } - void SkipEmptyDataBlocksForward(); - void SkipEmptyDataBlocksBackward(); - void SetDataIterator(Iterator* data_iter); - void InitDataBlock(); - - BlockFunction block_function_; - void* arg_; - const ReadOptions options_; - Status status_; - IteratorWrapper index_iter_; - IteratorWrapper data_iter_; // May be NULL - // If data_iter_ is non-NULL, then "data_block_handle_" holds the - // "index_value" passed to block_function_ to create the data_iter_. 
- std::string data_block_handle_; -}; - -TwoLevelIterator::TwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options) - : block_function_(block_function), - arg_(arg), - options_(options), - index_iter_(index_iter), - data_iter_(NULL) { -} - -TwoLevelIterator::~TwoLevelIterator() { -} - -void TwoLevelIterator::Seek(const Slice& target) { - index_iter_.Seek(target); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.Seek(target); - SkipEmptyDataBlocksForward(); -} - -void TwoLevelIterator::SeekToFirst() { - index_iter_.SeekToFirst(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); - SkipEmptyDataBlocksForward(); -} - -void TwoLevelIterator::SeekToLast() { - index_iter_.SeekToLast(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); - SkipEmptyDataBlocksBackward(); -} - -void TwoLevelIterator::Next() { - assert(Valid()); - data_iter_.Next(); - SkipEmptyDataBlocksForward(); -} - -void TwoLevelIterator::Prev() { - assert(Valid()); - data_iter_.Prev(); - SkipEmptyDataBlocksBackward(); -} - - -void TwoLevelIterator::SkipEmptyDataBlocksForward() { - while (data_iter_.iter() == NULL || !data_iter_.Valid()) { - // Move to next block - if (!index_iter_.Valid()) { - SetDataIterator(NULL); - return; - } - index_iter_.Next(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); - } -} - -void TwoLevelIterator::SkipEmptyDataBlocksBackward() { - while (data_iter_.iter() == NULL || !data_iter_.Valid()) { - // Move to next block - if (!index_iter_.Valid()) { - SetDataIterator(NULL); - return; - } - index_iter_.Prev(); - InitDataBlock(); - if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); - } -} - -void TwoLevelIterator::SetDataIterator(Iterator* data_iter) { - if (data_iter_.iter() != NULL) SaveError(data_iter_.status()); - data_iter_.Set(data_iter); -} - -void TwoLevelIterator::InitDataBlock() { - if (!index_iter_.Valid()) { - SetDataIterator(NULL); - } else { - Slice handle = index_iter_.value(); - if (data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0) { - // data_iter_ is already constructed with this iterator, so - // no need to change anything - } else { - Iterator* iter = (*block_function_)(arg_, options_, handle); - data_block_handle_.assign(handle.data(), handle.size()); - SetDataIterator(iter); - } - } -} - -} - -Iterator* NewTwoLevelIterator( - Iterator* index_iter, - BlockFunction block_function, - void* arg, - const ReadOptions& options) { - return new TwoLevelIterator(index_iter, block_function, arg, options); -} - -} diff --git a/leveldb/table/two_level_iterator.h b/leveldb/table/two_level_iterator.h deleted file mode 100644 index 5909e2b..0000000 --- a/leveldb/table/two_level_iterator.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ -#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ - -#include "leveldb/iterator.h" - -namespace leveldb { - -struct ReadOptions; - -// Return a new two level iterator. A two-level iterator contains an -// index iterator whose values point to a sequence of blocks where -// each block is itself a sequence of key,value pairs. 
The returned -// two-level iterator yields the concatenation of all key/value pairs -// in the sequence of blocks. Takes ownership of "index_iter" and -// will delete it when no longer needed. -// -// Uses a supplied function to convert an index_iter value into -// an iterator over the contents of the corresponding block. -extern Iterator* NewTwoLevelIterator( - Iterator* index_iter, - Iterator* (*block_function)( - void* arg, - const ReadOptions& options, - const Slice& index_value), - void* arg, - const ReadOptions& options); - -} - -#endif // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ diff --git a/leveldb/util/arena.cc b/leveldb/util/arena.cc deleted file mode 100644 index 40ab99d..0000000 --- a/leveldb/util/arena.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/arena.h" -#include - -namespace leveldb { - -static const int kBlockSize = 4096; - -Arena::Arena() { - blocks_memory_ = 0; - alloc_ptr_ = NULL; // First allocation will allocate a block - alloc_bytes_remaining_ = 0; -} - -Arena::~Arena() { - for (size_t i = 0; i < blocks_.size(); i++) { - delete[] blocks_[i]; - } -} - -char* Arena::AllocateFallback(size_t bytes) { - if (bytes > kBlockSize / 4) { - // Object is more than a quarter of our block size. Allocate it separately - // to avoid wasting too much space in leftover bytes. - char* result = AllocateNewBlock(bytes); - return result; - } - - // We waste the remaining space in the current block. - alloc_ptr_ = AllocateNewBlock(kBlockSize); - alloc_bytes_remaining_ = kBlockSize; - - char* result = alloc_ptr_; - alloc_ptr_ += bytes; - alloc_bytes_remaining_ -= bytes; - return result; -} - -char* Arena::AllocateAligned(size_t bytes) { - const int align = sizeof(void*); // We'll align to pointer size - assert((align & (align-1)) == 0); // Pointer size should be a power of 2 - size_t current_mod = reinterpret_cast(alloc_ptr_) & (align-1); - size_t slop = (current_mod == 0 ? 0 : align - current_mod); - size_t needed = bytes + slop; - char* result; - if (needed <= alloc_bytes_remaining_) { - result = alloc_ptr_ + slop; - alloc_ptr_ += needed; - alloc_bytes_remaining_ -= needed; - } else { - // AllocateFallback always returned aligned memory - result = AllocateFallback(bytes); - } - assert((reinterpret_cast(result) & (align-1)) == 0); - return result; -} - -char* Arena::AllocateNewBlock(size_t block_bytes) { - char* result = new char[block_bytes]; - blocks_memory_ += block_bytes; - blocks_.push_back(result); - return result; -} - -} diff --git a/leveldb/util/arena.h b/leveldb/util/arena.h deleted file mode 100644 index fcb5d5b..0000000 --- a/leveldb/util/arena.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ -#define STORAGE_LEVELDB_UTIL_ARENA_H_ - -#include -#include -#include -#include - -namespace leveldb { - -class Arena { - public: - Arena(); - ~Arena(); - - // Return a pointer to a newly allocated memory block of "bytes" bytes. 
- char* Allocate(size_t bytes); - - // Allocate memory with the normal alignment guarantees provided by malloc - char* AllocateAligned(size_t bytes); - - // Returns an estimate of the total memory usage of data allocated - // by the arena (including space allocated but not yet used for user - // allocations). - size_t MemoryUsage() const { - return blocks_memory_ + blocks_.capacity() * sizeof(char*); - } - - private: - char* AllocateFallback(size_t bytes); - char* AllocateNewBlock(size_t block_bytes); - - // Allocation state - char* alloc_ptr_; - size_t alloc_bytes_remaining_; - - // Array of new[] allocated memory blocks - std::vector blocks_; - - // Bytes of memory in blocks allocated so far - size_t blocks_memory_; - - // No copying allowed - Arena(const Arena&); - void operator=(const Arena&); -}; - -inline char* Arena::Allocate(size_t bytes) { - // The semantics of what to return are a bit messy if we allow - // 0-byte allocations, so we disallow them here (we don't need - // them for our internal use). - assert(bytes > 0); - if (bytes <= alloc_bytes_remaining_) { - char* result = alloc_ptr_; - alloc_ptr_ += bytes; - alloc_bytes_remaining_ -= bytes; - return result; - } - return AllocateFallback(bytes); -} - -} - -#endif // STORAGE_LEVELDB_UTIL_ARENA_H_ diff --git a/leveldb/util/arena_test.cc b/leveldb/util/arena_test.cc deleted file mode 100644 index c33b552..0000000 --- a/leveldb/util/arena_test.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/arena.h" - -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { - -class ArenaTest { }; - -TEST(ArenaTest, Empty) { - Arena arena; -} - -TEST(ArenaTest, Simple) { - std::vector > allocated; - Arena arena; - const int N = 100000; - size_t bytes = 0; - Random rnd(301); - for (int i = 0; i < N; i++) { - size_t s; - if (i % (N / 10) == 0) { - s = i; - } else { - s = rnd.OneIn(4000) ? rnd.Uniform(6000) : - (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); - } - if (s == 0) { - // Our arena disallows size 0 allocations. - s = 1; - } - char* r; - if (rnd.OneIn(10)) { - r = arena.AllocateAligned(s); - } else { - r = arena.Allocate(s); - } - - for (int b = 0; b < s; b++) { - // Fill the "i"th allocation with a known bit pattern - r[b] = i % 256; - } - bytes += s; - allocated.push_back(std::make_pair(s, r)); - ASSERT_GE(arena.MemoryUsage(), bytes); - if (i > N/10) { - ASSERT_LE(arena.MemoryUsage(), bytes * 1.10); - } - } - for (int i = 0; i < allocated.size(); i++) { - size_t num_bytes = allocated[i].first; - const char* p = allocated[i].second; - for (int b = 0; b < num_bytes; b++) { - // Check the "i"th allocation for the known bit pattern - ASSERT_EQ(int(p[b]) & 0xff, i % 256); - } - } -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/util/cache.cc b/leveldb/util/cache.cc deleted file mode 100644 index d8a4426..0000000 --- a/leveldb/util/cache.cc +++ /dev/null @@ -1,253 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
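[Editor's note on the Arena removed above, before the cache implementation begins.] Arena trades per-allocation bookkeeping for bulk teardown: Allocate() bumps a pointer within a 4096-byte block, AllocateFallback() starts a new block (or a dedicated one for requests larger than a quarter block), and everything is reclaimed when the Arena is destroyed. A minimal usage sketch against the interface above (the function name is hypothetical):

// Sketch only: assumes util/arena.h as declared above.
#include <string.h>
#include "util/arena.h"

void ArenaExample() {
  leveldb::Arena arena;
  char* buf = arena.Allocate(6);              // unaligned bump allocation
  memcpy(buf, "hello", 6);
  void** slot = reinterpret_cast<void**>(
      arena.AllocateAligned(sizeof(void*)));  // pointer-aligned allocation
  *slot = buf;
  // There is no per-allocation free: all blocks go away when 'arena'
  // goes out of scope. MemoryUsage() counts whole blocks, which is why
  // the arena_test above allows roughly 10% slack over bytes requested.
}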
-
-#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID)
-#include <unordered_set>
-#elif defined(LEVELDB_PLATFORM_CHROMIUM)
-#include "base/hash_tables.h"
-#else
-#include <hash_set>  // TODO(sanjay): Switch to unordered_set when possible.
-#endif
-
-#include <assert.h>
-
-#include "leveldb/cache.h"
-#include "port/port.h"
-#include "util/hash.h"
-#include "util/mutexlock.h"
-
-namespace leveldb {
-
-Cache::~Cache() {
-}
-
-namespace {
-
-// LRU cache implementation
-
-// An entry is a variable length heap-allocated structure. Entries
-// are kept in a circular doubly linked list ordered by access time.
-struct LRUHandle {
-  void* value;
-  void (*deleter)(const Slice&, void* value);
-  LRUHandle* next;
-  LRUHandle* prev;
-  size_t charge;      // TODO(opt): Only allow uint32_t?
-  size_t key_length;
-  size_t refs;        // TODO(opt): Pack with "key_length"?
-  char key_data[1];   // Beginning of key
-
-  Slice key() const {
-    // For cheaper lookups, we allow a temporary Handle object
-    // to store a pointer to a key in "value".
-    if (next == this) {
-      return *(reinterpret_cast<Slice*>(value));
-    } else {
-      return Slice(key_data, key_length);
-    }
-  }
-};
-
-// Pick a platform specific hash_set instantiation
-#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN)
-  // Microsoft's hash_set deviates from the standard. See
-  // http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx
-  // for details. Basically the 2 param () operator is a less than and
-  // the 1 param () operator is a hash function.
-  struct HandleHashCompare : public stdext::hash_compare<LRUHandle*> {
-    size_t operator() (LRUHandle* h) const {
-      Slice k = h->key();
-      return Hash(k.data(), k.size(), 0);
-    }
-    bool operator() (LRUHandle* a, LRUHandle* b) const {
-      return a->key().compare(b->key()) < 0;
-    }
-  };
-  typedef base::hash_set<LRUHandle*, HandleHashCompare> HandleTable;
-#else
-  struct HandleHash {
-    inline size_t operator()(LRUHandle* h) const {
-      Slice k = h->key();
-      return Hash(k.data(), k.size(), 0);
-    }
-  };
-
-  struct HandleEq {
-    inline bool operator()(LRUHandle* a, LRUHandle* b) const {
-      return a->key() == b->key();
-    }
-  };
-# if defined(LEVELDB_PLATFORM_CHROMIUM)
-  typedef base::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
-# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID)
-  typedef std::unordered_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
-# else
-  typedef __gnu_cxx::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
-# endif
-#endif
-
-class LRUCache : public Cache {
- public:
-  explicit LRUCache(size_t capacity);
-  virtual ~LRUCache();
-
-  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
-                         void (*deleter)(const Slice& key, void* value));
-  virtual Handle* Lookup(const Slice& key);
-  virtual void Release(Handle* handle);
-  virtual void* Value(Handle* handle);
-  virtual void Erase(const Slice& key);
-  virtual uint64_t NewId();
-
- private:
-  void LRU_Remove(LRUHandle* e);
-  void LRU_Append(LRUHandle* e);
-  void Unref(LRUHandle* e);
-
-  // Constructor parameters
-  const size_t capacity_;
-
-  // mutex_ protects the following state.
-  port::Mutex mutex_;
-  size_t usage_;
-  uint64_t last_id_;
-
-  // Dummy head of LRU list.
-  // lru.prev is newest entry, lru.next is oldest entry.
-  LRUHandle lru_;
-
-  HandleTable table_;
-};
-
-LRUCache::LRUCache(size_t capacity)
-    : capacity_(capacity),
-      usage_(0),
-      last_id_(0) {
-  // Make empty circular linked list
-  lru_.next = &lru_;
-  lru_.prev = &lru_;
-}
-
-LRUCache::~LRUCache() {
-  table_.clear();
-  for (LRUHandle* e = lru_.next; e != &lru_; ) {
-    LRUHandle* next = e->next;
-    assert(e->refs == 1);  // Error if caller has an unreleased handle
-    Unref(e);
-    e = next;
-  }
-}
-
-void LRUCache::Unref(LRUHandle* e) {
-  assert(e->refs > 0);
-  e->refs--;
-  if (e->refs <= 0) {
-    usage_ -= e->charge;
-    (*e->deleter)(e->key(), e->value);
-    free(e);
-  }
-}
-
-void LRUCache::LRU_Remove(LRUHandle* e) {
-  e->next->prev = e->prev;
-  e->prev->next = e->next;
-}
-
-void LRUCache::LRU_Append(LRUHandle* e) {
-  // Make "e" newest entry by inserting just before lru_
-  e->next = &lru_;
-  e->prev = lru_.prev;
-  e->prev->next = e;
-  e->next->prev = e;
-}
-
-Cache::Handle* LRUCache::Lookup(const Slice& key) {
-  MutexLock l(&mutex_);
-
-  LRUHandle dummy;
-  dummy.next = &dummy;
-  dummy.value = const_cast<Slice*>(&key);
-  HandleTable::iterator iter = table_.find(&dummy);
-  if (iter == table_.end()) {
-    return NULL;
-  } else {
-    LRUHandle* e = const_cast<LRUHandle*>(*iter);
-    e->refs++;
-    LRU_Remove(e);
-    LRU_Append(e);
-    return reinterpret_cast<Handle*>(e);
-  }
-}
-
-void* LRUCache::Value(Handle* handle) {
-  return reinterpret_cast<LRUHandle*>(handle)->value;
-}
-
-void LRUCache::Release(Handle* handle) {
-  MutexLock l(&mutex_);
-  Unref(reinterpret_cast<LRUHandle*>(handle));
-}
-
-Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
-                                void (*deleter)(const Slice& key, void* value)) {
-  MutexLock l(&mutex_);
-
-  LRUHandle* e = reinterpret_cast<LRUHandle*>(
-      malloc(sizeof(LRUHandle)-1 + key.size()));
-  e->value = value;
-  e->deleter = deleter;
-  e->charge = charge;
-  e->key_length = key.size();
-  e->refs = 2;  // One from LRUCache, one for the returned handle
-  memcpy(e->key_data, key.data(), key.size());
-  LRU_Append(e);
-  usage_ += charge;
-
-  std::pair<HandleTable::iterator,bool> p = table_.insert(e);
-  if (!p.second) {
-    // Kill existing entry
-    LRUHandle* old = const_cast<LRUHandle*>(*(p.first));
-    LRU_Remove(old);
-    table_.erase(p.first);
-    table_.insert(e);
-    Unref(old);
-  }
-
-  while (usage_ > capacity_ && lru_.next != &lru_) {
-    LRUHandle* old = lru_.next;
-    LRU_Remove(old);
-    table_.erase(old);
-    Unref(old);
-  }
-
-  return reinterpret_cast<Handle*>(e);
-}
-
-void LRUCache::Erase(const Slice& key) {
-  MutexLock l(&mutex_);
-
-  LRUHandle dummy;
-  dummy.next = &dummy;
-  dummy.value = const_cast<Slice*>(&key);
-  HandleTable::iterator iter = table_.find(&dummy);
-  if (iter != table_.end()) {
-    LRUHandle* e = const_cast<LRUHandle*>(*iter);
-    LRU_Remove(e);
-    table_.erase(iter);
-    Unref(e);
-  }
-}
-
-uint64_t LRUCache::NewId() {
-  MutexLock l(&mutex_);
-  return ++(last_id_);
-}
-
-}  // end anonymous namespace
-
-Cache* NewLRUCache(size_t capacity) {
-  return new LRUCache(capacity);
-}
-
-}
diff --git a/leveldb/util/cache_test.cc b/leveldb/util/cache_test.cc
deleted file mode 100644
index dbab988..0000000
--- a/leveldb/util/cache_test.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "leveldb/cache.h"
-
-#include <vector>
-#include "util/coding.h"
-#include "util/testharness.h"
-
-namespace leveldb {
-
-// Conversions between numeric keys/values and the types expected by Cache.
-static std::string EncodeKey(int k) {
-  std::string result;
-  PutFixed32(&result, k);
-  return result;
-}
-static int DecodeKey(const Slice& k) {
-  assert(k.size() == 4);
-  return DecodeFixed32(k.data());
-}
-static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
-static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); }
-
-class CacheTest {
- public:
-  static CacheTest* current_;
-
-  static void Deleter(const Slice& key, void* v) {
-    current_->deleted_keys_.push_back(DecodeKey(key));
-    current_->deleted_values_.push_back(DecodeValue(v));
-  }
-
-  static const int kCacheSize = 100;
-  std::vector<int> deleted_keys_;
-  std::vector<int> deleted_values_;
-  Cache* cache_;
-
-  CacheTest() : cache_(NewLRUCache(kCacheSize)) {
-    current_ = this;
-  }
-
-  ~CacheTest() {
-    delete cache_;
-  }
-
-  int Lookup(int key) {
-    Cache::Handle* handle = cache_->Lookup(EncodeKey(key));
-    const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle));
-    if (handle != NULL) {
-      cache_->Release(handle);
-    }
-    return r;
-  }
-
-  void Insert(int key, int value, int charge = 1) {
-    cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
-                                   &CacheTest::Deleter));
-  }
-
-  void Erase(int key) {
-    cache_->Erase(EncodeKey(key));
-  }
-};
-CacheTest* CacheTest::current_;
-
-TEST(CacheTest, HitAndMiss) {
-  ASSERT_EQ(-1, Lookup(100));
-
-  Insert(100, 101);
-  ASSERT_EQ(101, Lookup(100));
-  ASSERT_EQ(-1, Lookup(200));
-  ASSERT_EQ(-1, Lookup(300));
-
-  Insert(200, 201);
-  ASSERT_EQ(101, Lookup(100));
-  ASSERT_EQ(201, Lookup(200));
-  ASSERT_EQ(-1, Lookup(300));
-
-  Insert(100, 102);
-  ASSERT_EQ(102, Lookup(100));
-  ASSERT_EQ(201, Lookup(200));
-  ASSERT_EQ(-1, Lookup(300));
-
-  ASSERT_EQ(1, deleted_keys_.size());
-  ASSERT_EQ(100, deleted_keys_[0]);
-  ASSERT_EQ(101, deleted_values_[0]);
-}
-
-TEST(CacheTest, Erase) {
-  Erase(200);
-  ASSERT_EQ(0, deleted_keys_.size());
-
-  Insert(100, 101);
-  Insert(200, 201);
-  Erase(100);
-  ASSERT_EQ(-1, Lookup(100));
-  ASSERT_EQ(201, Lookup(200));
-  ASSERT_EQ(1, deleted_keys_.size());
-  ASSERT_EQ(100, deleted_keys_[0]);
-  ASSERT_EQ(101, deleted_values_[0]);
-
-  Erase(100);
-  ASSERT_EQ(-1, Lookup(100));
-  ASSERT_EQ(201, Lookup(200));
-  ASSERT_EQ(1, deleted_keys_.size());
-}
-
-TEST(CacheTest, EntriesArePinned) {
-  Insert(100, 101);
-  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
-  ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
-
-  Insert(100, 102);
-  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
-  ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
-  ASSERT_EQ(0, deleted_keys_.size());
-
-  cache_->Release(h1);
-  ASSERT_EQ(1, deleted_keys_.size());
-  ASSERT_EQ(100, deleted_keys_[0]);
-  ASSERT_EQ(101, deleted_values_[0]);
-
-  Erase(100);
-  ASSERT_EQ(-1, Lookup(100));
-  ASSERT_EQ(1, deleted_keys_.size());
-
-  cache_->Release(h2);
-  ASSERT_EQ(2, deleted_keys_.size());
-  ASSERT_EQ(100, deleted_keys_[1]);
-  ASSERT_EQ(102, deleted_values_[1]);
-}
-
-TEST(CacheTest, EvictionPolicy) {
-  Insert(100, 101);
-  Insert(200, 201);
-
-  // Frequently used entry must be kept around
-  for (int i = 0; i < kCacheSize; i++) {
-    Insert(1000+i, 2000+i);
-    ASSERT_EQ(2000+i, Lookup(1000+i));
-    ASSERT_EQ(101, Lookup(100));
-  }
-  ASSERT_EQ(101, Lookup(100));
-  ASSERT_EQ(2, deleted_keys_.size());
-  ASSERT_EQ(200, deleted_keys_[0]);
-  ASSERT_EQ(201, deleted_values_[0]);
-}
-
-TEST(CacheTest, HeavyEntry) {
-  Insert(100, 101);
-  Insert(200, 201, kCacheSize);
-  ASSERT_EQ(1, deleted_keys_.size());
-  ASSERT_EQ(100, deleted_keys_[0]);
-  ASSERT_EQ(101, deleted_values_[0]);
-}
-
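[Editor's note.] The tests above pin down the cache contract: Insert() returns a handle that keeps the new entry alive (refs == 2, one reference held by the cache, one by the caller), Lookup() pins on a hit, and the deleter runs only when the last reference drops, which is why EntriesArePinned sees the old value deleted only after Release(h1). A hedged sketch of direct use outside the test harness (the value type and names are illustrative):

// Sketch only: exercises leveldb/cache.h as declared earlier in this patch.
#include <string>
#include "leveldb/cache.h"

static void DeleteString(const leveldb::Slice& key, void* value) {
  delete reinterpret_cast<std::string*>(value);  // runs on eviction or erase
}

void CacheExample(leveldb::Cache* cache) {
  leveldb::Cache::Handle* h =
      cache->Insert("k", new std::string("v"), 1 /* charge */, &DeleteString);
  cache->Release(h);                             // unpin; entry becomes evictable
  h = cache->Lookup("k");
  if (h != NULL) {
    std::string* v = reinterpret_cast<std::string*>(cache->Value(h));
    (void)v;                                     // use the value only while pinned
    cache->Release(h);                           // every hit needs a matching Release
  }
}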
-TEST(CacheTest, NewId) {
-  uint64_t a = cache_->NewId();
-  uint64_t b = cache_->NewId();
-  ASSERT_NE(a, b);
-}
-
-}
-
-int main(int argc, char** argv) {
-  return leveldb::test::RunAllTests();
-}
diff --git a/leveldb/util/coding.cc b/leveldb/util/coding.cc
deleted file mode 100644
index 14f21f7..0000000
--- a/leveldb/util/coding.cc
+++ /dev/null
@@ -1,194 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "util/coding.h"
-
-namespace leveldb {
-
-void EncodeFixed32(char* buf, uint32_t value) {
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-  memcpy(buf, &value, sizeof(value));
-#else
-  buf[0] = value & 0xff;
-  buf[1] = (value >> 8) & 0xff;
-  buf[2] = (value >> 16) & 0xff;
-  buf[3] = (value >> 24) & 0xff;
-#endif
-}
-
-void EncodeFixed64(char* buf, uint64_t value) {
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-  memcpy(buf, &value, sizeof(value));
-#else
-  buf[0] = value & 0xff;
-  buf[1] = (value >> 8) & 0xff;
-  buf[2] = (value >> 16) & 0xff;
-  buf[3] = (value >> 24) & 0xff;
-  buf[4] = (value >> 32) & 0xff;
-  buf[5] = (value >> 40) & 0xff;
-  buf[6] = (value >> 48) & 0xff;
-  buf[7] = (value >> 56) & 0xff;
-#endif
-}
-
-void PutFixed32(std::string* dst, uint32_t value) {
-  char buf[sizeof(value)];
-  EncodeFixed32(buf, value);
-  dst->append(buf, sizeof(buf));
-}
-
-void PutFixed64(std::string* dst, uint64_t value) {
-  char buf[sizeof(value)];
-  EncodeFixed64(buf, value);
-  dst->append(buf, sizeof(buf));
-}
-
-char* EncodeVarint32(char* dst, uint32_t v) {
-  // Operate on characters as unsigneds
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
-  static const int B = 128;
-  if (v < (1<<7)) {
-    *(ptr++) = v;
-  } else if (v < (1<<14)) {
-    *(ptr++) = v | B;
-    *(ptr++) = v>>7;
-  } else if (v < (1<<21)) {
-    *(ptr++) = v | B;
-    *(ptr++) = (v>>7) | B;
-    *(ptr++) = v>>14;
-  } else if (v < (1<<28)) {
-    *(ptr++) = v | B;
-    *(ptr++) = (v>>7) | B;
-    *(ptr++) = (v>>14) | B;
-    *(ptr++) = v>>21;
-  } else {
-    *(ptr++) = v | B;
-    *(ptr++) = (v>>7) | B;
-    *(ptr++) = (v>>14) | B;
-    *(ptr++) = (v>>21) | B;
-    *(ptr++) = v>>28;
-  }
-  return reinterpret_cast<char*>(ptr);
-}
-
-void PutVarint32(std::string* dst, uint32_t v) {
-  char buf[5];
-  char* ptr = EncodeVarint32(buf, v);
-  dst->append(buf, ptr - buf);
-}
-
-char* EncodeVarint64(char* dst, uint64_t v) {
-  static const int B = 128;
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
-  while (v >= B) {
-    *(ptr++) = (v & (B-1)) | B;
-    v >>= 7;
-  }
-  *(ptr++) = static_cast<unsigned char>(v);
-  return reinterpret_cast<char*>(ptr);
-}
-
-void PutVarint64(std::string* dst, uint64_t v) {
-  char buf[10];
-  char* ptr = EncodeVarint64(buf, v);
-  dst->append(buf, ptr - buf);
-}
-
-void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
-  PutVarint32(dst, value.size());
-  dst->append(value.data(), value.size());
-}
-
-int VarintLength(uint64_t v) {
-  int len = 1;
-  while (v >= 128) {
-    v >>= 7;
-    len++;
-  }
-  return len;
-}
-
-const char* GetVarint32PtrFallback(const char* p,
-                                   const char* limit,
-                                   uint32_t* value) {
-  uint32_t result = 0;
-  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
-    uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
-    p++;
-    if (byte & 128) {
-      // More bytes are present
-      result |= ((byte & 127) << shift);
-    } else {
-      result |= (byte << shift);
-      *value = result;
-      return reinterpret_cast<const char*>(p);
-    }
-  }
-  return NULL;
-}
-
-bool GetVarint32(Slice* input, uint32_t* value) {
-  const char* p = input->data();
-  const char* limit = p + input->size();
-  const char* q = GetVarint32Ptr(p, limit, value);
-  if (q == NULL) {
-    return false;
-  } else {
-    *input = Slice(q, limit - q);
-    return true;
-  }
-}
-
-const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
-  uint64_t result = 0;
-  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
-    uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
-    p++;
-    if (byte & 128) {
-      // More bytes are present
-      result |= ((byte & 127) << shift);
-    } else {
-      result |= (byte << shift);
-      *value = result;
-      return reinterpret_cast<const char*>(p);
-    }
-  }
-  return NULL;
-}
-
-bool GetVarint64(Slice* input, uint64_t* value) {
-  const char* p = input->data();
-  const char* limit = p + input->size();
-  const char* q = GetVarint64Ptr(p, limit, value);
-  if (q == NULL) {
-    return false;
-  } else {
-    *input = Slice(q, limit - q);
-    return true;
-  }
-}
-
-const char* GetLengthPrefixedSlice(const char* p, const char* limit,
-                                   Slice* result) {
-  uint32_t len;
-  p = GetVarint32Ptr(p, limit, &len);
-  if (p == NULL) return NULL;
-  if (p + len > limit) return NULL;
-  *result = Slice(p, len);
-  return p + len;
-}
-
-bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
-  uint32_t len;
-  if (GetVarint32(input, &len) &&
-      input->size() >= len) {
-    *result = Slice(input->data(), len);
-    input->remove_prefix(len);
-    return true;
-  } else {
-    return false;
-  }
-}
-
-}
diff --git a/leveldb/util/coding.h b/leveldb/util/coding.h
deleted file mode 100644
index 8755968..0000000
--- a/leveldb/util/coding.h
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// Endian-neutral encoding:
-// * Fixed-length numbers are encoded with least-significant byte first
-// * In addition we support variable length "varint" encoding
-// * Strings are encoded prefixed by their length in varint format
-
-#ifndef STORAGE_LEVELDB_UTIL_CODING_H_
-#define STORAGE_LEVELDB_UTIL_CODING_H_
-
-#include <stdint.h>
-#include <string.h>
-#include <string>
-#include "leveldb/slice.h"
-#include "port/port.h"
-
-namespace leveldb {
-
-// Standard Put... routines append to a string
-extern void PutFixed32(std::string* dst, uint32_t value);
-extern void PutFixed64(std::string* dst, uint64_t value);
-extern void PutVarint32(std::string* dst, uint32_t value);
-extern void PutVarint64(std::string* dst, uint64_t value);
-extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
-
-// Standard Get... routines parse a value from the beginning of a Slice
-// and advance the slice past the parsed value.
-extern bool GetVarint32(Slice* input, uint32_t* value);
-extern bool GetVarint64(Slice* input, uint64_t* value);
-extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
-
-// Pointer-based variants of GetVarint... These either store a value
-// in *v and return a pointer just past the parsed value, or return
-// NULL on error. These routines only look at bytes in the range
-// [p..limit-1]
-extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
-extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);
-
-// Returns the length of the varint32 or varint64 encoding of "v"
-extern int VarintLength(uint64_t v);
-
-// Lower-level versions of Put... that write directly into a character buffer
-// REQUIRES: dst has enough space for the value being written
-extern void EncodeFixed32(char* dst, uint32_t value);
-extern void EncodeFixed64(char* dst, uint64_t value);
-
-// Lower-level versions of Put... that write directly into a character buffer
-// and return a pointer just past the last byte written.
-// REQUIRES: dst has enough space for the value being written
-extern char* EncodeVarint32(char* dst, uint32_t value);
-extern char* EncodeVarint64(char* dst, uint64_t value);
-
-// Lower-level versions of Get... that read directly from a character buffer
-// without any bounds checking.
-
-inline uint32_t DecodeFixed32(const char* ptr) {
-  if (port::kLittleEndian) {
-    // Load the raw bytes
-    uint32_t result;
-    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
-    return result;
-  } else {
-    return ((static_cast<uint32_t>(ptr[0]))
-        | (static_cast<uint32_t>(ptr[1]) << 8)
-        | (static_cast<uint32_t>(ptr[2]) << 16)
-        | (static_cast<uint32_t>(ptr[3]) << 24));
-  }
-}
-
-inline uint64_t DecodeFixed64(const char* ptr) {
-  if (port::kLittleEndian) {
-    // Load the raw bytes
-    uint64_t result;
-    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
-    return result;
-  } else {
-    uint64_t lo = DecodeFixed32(ptr);
-    uint64_t hi = DecodeFixed32(ptr + 4);
-    return (hi << 32) | lo;
-  }
-}
-
-// Internal routine for use by fallback path of GetVarint32Ptr
-extern const char* GetVarint32PtrFallback(const char* p,
-                                          const char* limit,
-                                          uint32_t* value);
-inline const char* GetVarint32Ptr(const char* p,
-                                  const char* limit,
-                                  uint32_t* value) {
-  if (p < limit) {
-    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
-    if ((result & 128) == 0) {
-      *value = result;
-      return p + 1;
-    }
-  }
-  return GetVarint32PtrFallback(p, limit, value);
-}
-
-}
-
-#endif  // STORAGE_LEVELDB_UTIL_CODING_H_
diff --git a/leveldb/util/coding_test.cc b/leveldb/util/coding_test.cc
deleted file mode 100644
index a8dba04..0000000
--- a/leveldb/util/coding_test.cc
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
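[Editor's note: a worked instance of the varint format defined in coding.h above, ahead of the tests.] 300 is 0b1'0010'1100; the low seven bits (0101100) are emitted first with the continuation bit set, giving 0xAC, and the remaining bits (10) follow as 0x02. A small self-check sketch (function name is hypothetical):

// Sketch only: round-trips one varint32 through the routines above.
#include <assert.h>
#include <string>
#include "util/coding.h"

void VarintExample() {
  std::string s;
  leveldb::PutVarint32(&s, 300);
  assert(s.size() == 2);
  assert((s[0] & 0xff) == 0xAC);  // 0b1_0101100: continuation bit + low 7 bits
  assert((s[1] & 0xff) == 0x02);  // 0b0_0000010: high bits, no continuation
  leveldb::Slice in(s);
  uint32_t v = 0;
  assert(leveldb::GetVarint32(&in, &v) && v == 300);
  assert(in.empty());             // the input slice is advanced past the value
}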
- -#include "util/coding.h" - -#include "util/testharness.h" - -namespace leveldb { - -class Coding { }; - -TEST(Coding, Fixed32) { - std::string s; - for (uint32_t v = 0; v < 100000; v++) { - PutFixed32(&s, v); - } - - const char* p = s.data(); - for (uint32_t v = 0; v < 100000; v++) { - uint32_t actual = DecodeFixed32(p); - ASSERT_EQ(v, actual); - p += sizeof(uint32_t); - } -} - -TEST(Coding, Fixed64) { - std::string s; - for (int power = 0; power <= 63; power++) { - uint64_t v = static_cast(1) << power; - PutFixed64(&s, v - 1); - PutFixed64(&s, v + 0); - PutFixed64(&s, v + 1); - } - - const char* p = s.data(); - for (int power = 0; power <= 63; power++) { - uint64_t v = static_cast(1) << power; - uint64_t actual; - actual = DecodeFixed64(p); - ASSERT_EQ(v-1, actual); - p += sizeof(uint64_t); - - actual = DecodeFixed64(p); - ASSERT_EQ(v+0, actual); - p += sizeof(uint64_t); - - actual = DecodeFixed64(p); - ASSERT_EQ(v+1, actual); - p += sizeof(uint64_t); - } -} - -TEST(Coding, Varint32) { - std::string s; - for (uint32_t i = 0; i < (32 * 32); i++) { - uint32_t v = (i / 32) << (i % 32); - PutVarint32(&s, v); - } - - const char* p = s.data(); - const char* limit = p + s.size(); - for (uint32_t i = 0; i < (32 * 32); i++) { - uint32_t expected = (i / 32) << (i % 32); - uint32_t actual; - const char* start = p; - p = GetVarint32Ptr(p, limit, &actual); - ASSERT_TRUE(p != NULL); - ASSERT_EQ(expected, actual); - ASSERT_EQ(VarintLength(actual), p - start); - } - ASSERT_EQ(p, s.data() + s.size()); -} - -TEST(Coding, Varint64) { - // Construct the list of values to check - std::vector values; - // Some special values - values.push_back(0); - values.push_back(100); - values.push_back(~static_cast(0)); - values.push_back(~static_cast(0) - 1); - for (uint32_t k = 0; k < 64; k++) { - // Test values near powers of two - const uint64_t power = 1ull << k; - values.push_back(power); - values.push_back(power-1); - values.push_back(power+1); - }; - - std::string s; - for (int i = 0; i < values.size(); i++) { - PutVarint64(&s, values[i]); - } - - const char* p = s.data(); - const char* limit = p + s.size(); - for (int i = 0; i < values.size(); i++) { - ASSERT_TRUE(p < limit); - uint64_t actual; - const char* start = p; - p = GetVarint64Ptr(p, limit, &actual); - ASSERT_TRUE(p != NULL); - ASSERT_EQ(values[i], actual); - ASSERT_EQ(VarintLength(actual), p - start); - } - ASSERT_EQ(p, limit); - -} - -TEST(Coding, Varint32Overflow) { - uint32_t result; - std::string input("\x81\x82\x83\x84\x85\x11"); - ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) - == NULL); -} - -TEST(Coding, Varint32Truncation) { - uint32_t large_value = (1u << 31) + 100; - std::string s; - PutVarint32(&s, large_value); - uint32_t result; - for (int len = 0; len < s.size() - 1; len++) { - ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL); - } - ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL); - ASSERT_EQ(large_value, result); -} - -TEST(Coding, Varint64Overflow) { - uint64_t result; - std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); - ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result) - == NULL); -} - -TEST(Coding, Varint64Truncation) { - uint64_t large_value = (1ull << 63) + 100ull; - std::string s; - PutVarint64(&s, large_value); - uint64_t result; - for (int len = 0; len < s.size() - 1; len++) { - ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL); - } - ASSERT_TRUE(GetVarint64Ptr(s.data(), 
s.data() + s.size(), &result) != NULL); - ASSERT_EQ(large_value, result); -} - -TEST(Coding, Strings) { - std::string s; - PutLengthPrefixedSlice(&s, Slice("")); - PutLengthPrefixedSlice(&s, Slice("foo")); - PutLengthPrefixedSlice(&s, Slice("bar")); - PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); - - Slice input(s); - Slice v; - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ("", v.ToString()); - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ("foo", v.ToString()); - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ("bar", v.ToString()); - ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); - ASSERT_EQ(std::string(200, 'x'), v.ToString()); - ASSERT_EQ("", input.ToString()); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/util/comparator.cc b/leveldb/util/comparator.cc deleted file mode 100644 index cc2b263..0000000 --- a/leveldb/util/comparator.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include "leveldb/comparator.h" -#include "leveldb/slice.h" -#include "util/logging.h" - -namespace leveldb { - -Comparator::~Comparator() { } - -namespace { -class BytewiseComparatorImpl : public Comparator { - public: - BytewiseComparatorImpl() { } - - virtual const char* Name() const { - return "leveldb.BytewiseComparator"; - } - - virtual int Compare(const Slice& a, const Slice& b) const { - return a.compare(b); - } - - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const { - // Find length of common prefix - size_t min_length = std::min(start->size(), limit.size()); - size_t diff_index = 0; - while ((diff_index < min_length) && - ((*start)[diff_index] == limit[diff_index])) { - diff_index++; - } - - if (diff_index >= min_length) { - // Do not shorten if one string is a prefix of the other - } else { - uint8_t diff_byte = static_cast((*start)[diff_index]); - if (diff_byte < static_cast(0xff) && - diff_byte + 1 < static_cast(limit[diff_index])) { - (*start)[diff_index]++; - start->resize(diff_index + 1); - assert(Compare(*start, limit) < 0); - } - } - } - - virtual void FindShortSuccessor(std::string* key) const { - // Find first character that can be incremented - size_t n = key->size(); - for (size_t i = 0; i < n; i++) { - const uint8_t byte = (*key)[i]; - if (byte != static_cast(0xff)) { - (*key)[i] = byte + 1; - key->resize(i+1); - return; - } - } - // *key is a run of 0xffs. Leave it alone. - } -}; -} -static const BytewiseComparatorImpl bytewise; - -const Comparator* BytewiseComparator() { - return &bytewise; -} - -} diff --git a/leveldb/util/crc32c.cc b/leveldb/util/crc32c.cc deleted file mode 100644 index 28c2401..0000000 --- a/leveldb/util/crc32c.cc +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A portable implementation of crc32c, optimized to handle -// four bytes at a time. 
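[Editor's note.] The four 256-entry tables that follow implement "slice-by-4": Extend() folds a whole aligned 32-bit word into the CRC per step, using one table per byte lane, rather than the classic one-byte recurrence. For orientation, the byte-at-a-time form that the STEP1 macro below encodes looks roughly like this (a reference sketch only; it refers to the file-local table0_ defined below and is not standalone):

// Reference sketch: bytewise CRC32C using only table0_ below.
// STEP4 computes the same result four bytes at a time by XOR-ing one
// lookup from each of table3_..table0_, one per byte lane of the word.
uint32_t ExtendBytewise(uint32_t crc, const char* buf, size_t n) {
  const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
  uint32_t l = crc ^ 0xffffffffu;                // pre-condition the CRC
  for (size_t i = 0; i < n; i++) {
    l = table0_[(l & 0xff) ^ p[i]] ^ (l >> 8);   // identical to STEP1
  }
  return l ^ 0xffffffffu;                        // final inversion
}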
- -#include "util/crc32c.h" - -#include -#include "util/coding.h" - -namespace leveldb { -namespace crc32c { - -static const uint32_t table0_[256] = { - 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, - 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, - 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, - 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, - 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, - 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, - 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, - 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, - 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, - 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, - 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, - 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, - 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, - 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, - 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, - 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, - 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, - 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, - 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, - 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, - 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, - 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, - 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, - 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, - 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, - 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, - 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, - 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, - 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, - 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, - 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, - 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, - 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, - 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, - 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, - 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, - 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, - 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, - 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, - 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, - 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, - 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, - 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, - 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, - 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, - 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, - 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, - 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, - 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, - 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, - 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, - 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, - 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, - 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, - 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, - 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, - 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, - 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, - 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, - 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, - 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, - 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, - 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, - 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 -}; -static const uint32_t table1_[256] = { - 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, - 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, - 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, - 
0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, - 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, - 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, - 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, - 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, - 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, - 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, - 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, - 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, - 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, - 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, - 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, - 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, - 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, - 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, - 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, - 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, - 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, - 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, - 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, - 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, - 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, - 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, - 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, - 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, - 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, - 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, - 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, - 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, - 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, - 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, - 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, - 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, - 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, - 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, - 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, - 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, - 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, - 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, - 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, - 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, - 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, - 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, - 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, - 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, - 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, - 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, - 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, - 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, - 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, - 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, - 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, - 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, - 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, - 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, - 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, - 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, - 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, - 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, - 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, - 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 -}; -static const uint32_t table2_[256] = { - 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, - 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, - 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, - 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, - 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, - 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, - 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, - 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, - 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, - 0x7f9a5f0e, 
0xdadbcd70, 0x30f50d03, 0x95b49f7d, - 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, - 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, - 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, - 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, - 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, - 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, - 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, - 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, - 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, - 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, - 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, - 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, - 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, - 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, - 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, - 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, - 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, - 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, - 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, - 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, - 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, - 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, - 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, - 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, - 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, - 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, - 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, - 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, - 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, - 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, - 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, - 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, - 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, - 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, - 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, - 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, - 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, - 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, - 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, - 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, - 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, - 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, - 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, - 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, - 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, - 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, - 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, - 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, - 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, - 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, - 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, - 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, - 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, - 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 -}; -static const uint32_t table3_[256] = { - 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, - 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, - 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, - 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, - 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, - 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, - 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, - 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, - 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, - 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, - 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, - 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, - 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, - 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, - 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, - 0xb327f7a3, 0x6e625d1b, 
0x0c40d422, 0xd1057e9a, - 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, - 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, - 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, - 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, - 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, - 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, - 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, - 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, - 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, - 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, - 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, - 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, - 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, - 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, - 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, - 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, - 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, - 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, - 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, - 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, - 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, - 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, - 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, - 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, - 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, - 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, - 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, - 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, - 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, - 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, - 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, - 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, - 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, - 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, - 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, - 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, - 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, - 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, - 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, - 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, - 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, - 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, - 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, - 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, - 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, - 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, - 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, - 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 -}; - -// Used to fetch a naturally-aligned 32-bit word in little endian byte-order -static inline uint32_t LE_LOAD32(const uint8_t *p) { - return DecodeFixed32(reinterpret_cast(p)); -} - -uint32_t Extend(uint32_t crc, const char* buf, size_t size) { - const uint8_t *p = reinterpret_cast(buf); - const uint8_t *e = p + size; - uint32_t l = crc ^ 0xffffffffu; - -#define STEP1 do { \ - int c = (l & 0xff) ^ *p++; \ - l = table0_[c] ^ (l >> 8); \ -} while (0) -#define STEP4 do { \ - uint32_t c = l ^ LE_LOAD32(p); \ - p += 4; \ - l = table3_[c & 0xff] ^ \ - table2_[(c >> 8) & 0xff] ^ \ - table1_[(c >> 16) & 0xff] ^ \ - table0_[c >> 24]; \ -} while (0) - - // Point x at first 4-byte aligned byte in string. This might be - // just past the end of the string. 
- const uintptr_t pval = reinterpret_cast(p); - const uint8_t* x = reinterpret_cast(((pval + 3) >> 2) << 2); - if (x <= e) { - // Process bytes until finished or p is 4-byte aligned - while (p != x) { - STEP1; - } - } - // Process bytes 16 at a time - while ((e-p) >= 16) { - STEP4; STEP4; STEP4; STEP4; - } - // Process bytes 4 at a time - while ((e-p) >= 4) { - STEP4; - } - // Process the last few bytes - while (p != e) { - STEP1; - } -#undef STEP4 -#undef STEP1 - return l ^ 0xffffffffu; -} - -} -} diff --git a/leveldb/util/crc32c.h b/leveldb/util/crc32c.h deleted file mode 100644 index 938d8ff..0000000 --- a/leveldb/util/crc32c.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_ -#define STORAGE_LEVELDB_UTIL_CRC32C_H_ - -#include -#include - -namespace leveldb { -namespace crc32c { - -// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the -// crc32c of some string A. Extend() is often used to maintain the -// crc32c of a stream of data. -extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); - -// Return the crc32c of data[0,n-1] -inline uint32_t Value(const char* data, size_t n) { - return Extend(0, data, n); -} - -static const uint32_t kMaskDelta = 0xa282ead8ul; - -// Return a masked representation of crc. -// -// Motivation: it is problematic to compute the CRC of a string that -// contains embedded CRCs. Therefore we recommend that CRCs stored -// somewhere (e.g., in files) should be masked before being stored. -inline uint32_t Mask(uint32_t crc) { - // Rotate right by 15 bits and add a constant. - return ((crc >> 15) | (crc << 17)) + kMaskDelta; -} - -// Return the crc whose masked representation is masked_crc. -inline uint32_t Unmask(uint32_t masked_crc) { - uint32_t rot = masked_crc - kMaskDelta; - return ((rot >> 17) | (rot << 15)); -} - -} -} - -#endif // STORAGE_LEVELDB_UTIL_CRC32C_H_ diff --git a/leveldb/util/crc32c_test.cc b/leveldb/util/crc32c_test.cc deleted file mode 100644 index ba9e804..0000000 --- a/leveldb/util/crc32c_test.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/crc32c.h" -#include "util/testharness.h" - -namespace leveldb { -namespace crc32c { - -class CRC { }; - -TEST(CRC, StandardResults) { - // From rfc3720 section B.4. 
- char buf[32]; - - memset(buf, 0, sizeof(buf)); - ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf))); - - memset(buf, 0xff, sizeof(buf)); - ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf))); - - for (int i = 0; i < 32; i++) { - buf[i] = i; - } - ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf))); - - for (int i = 0; i < 32; i++) { - buf[i] = 31 - i; - } - ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf))); - - unsigned char data[48] = { - 0x01, 0xc0, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x04, 0x00, - 0x00, 0x00, 0x00, 0x14, - 0x00, 0x00, 0x00, 0x18, - 0x28, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - }; - ASSERT_EQ(0xd9963a56, Value(reinterpret_cast(data), sizeof(data))); -} - -TEST(CRC, Values) { - ASSERT_NE(Value("a", 1), Value("foo", 3)); -} - -TEST(CRC, Extend) { - ASSERT_EQ(Value("hello world", 11), - Extend(Value("hello ", 6), "world", 5)); -} - -TEST(CRC, Mask) { - uint32_t crc = Value("foo", 3); - ASSERT_NE(crc, Mask(crc)); - ASSERT_NE(crc, Mask(Mask(crc))); - ASSERT_EQ(crc, Unmask(Mask(crc))); - ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc))))); -} - -} -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/util/env.cc b/leveldb/util/env.cc deleted file mode 100644 index e5297e7..0000000 --- a/leveldb/util/env.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/env.h" - -namespace leveldb { - -Env::~Env() { -} - -SequentialFile::~SequentialFile() { -} - -RandomAccessFile::~RandomAccessFile() { -} - -WritableFile::~WritableFile() { -} - -FileLock::~FileLock() { -} - -void Log(Env* env, WritableFile* info_log, const char* format, ...) { - va_list ap; - va_start(ap, format); - env->Logv(info_log, format, ap); - va_end(ap); -} - -Status WriteStringToFile(Env* env, const Slice& data, - const std::string& fname) { - WritableFile* file; - Status s = env->NewWritableFile(fname, &file); - if (!s.ok()) { - return s; - } - s = file->Append(data); - if (s.ok()) { - s = file->Close(); - } - delete file; // Will auto-close if we did not close above - if (!s.ok()) { - env->DeleteFile(fname); - } - return s; -} - -Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { - data->clear(); - SequentialFile* file; - Status s = env->NewSequentialFile(fname, &file); - if (!s.ok()) { - return s; - } - static const int kBufferSize = 8192; - char* space = new char[kBufferSize]; - while (true) { - Slice fragment; - s = file->Read(kBufferSize, &fragment, space); - if (!s.ok()) { - break; - } - data->append(fragment.data(), fragment.size()); - if (fragment.empty()) { - break; - } - } - delete[] space; - delete file; - return s; -} - -EnvWrapper::~EnvWrapper() { -} - -} diff --git a/leveldb/util/env_chromium.cc b/leveldb/util/env_chromium.cc deleted file mode 100644 index 7edc7a9..0000000 --- a/leveldb/util/env_chromium.cc +++ /dev/null @@ -1,603 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
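[Editor's note.] env.cc above shows the pattern every Env-based helper follows: obtain a file object from the Env, stream through it, and propagate the first non-ok Status. A hedged sketch of the same idiom from client code (the file name and contents are illustrative; Env::Default() comes from include/env.h):

// Sketch only: uses the helpers defined in util/env.cc above.
#include <string>
#include "leveldb/env.h"

leveldb::Status EnvExample() {
  leveldb::Env* env = leveldb::Env::Default();
  leveldb::Status s =
      leveldb::WriteStringToFile(env, leveldb::Slice("payload"), "/tmp/demo");
  if (!s.ok()) return s;  // WriteStringToFile already deleted the file on failure
  std::string contents;
  s = leveldb::ReadFileToString(env, "/tmp/demo", &contents);
  // ReadFileToString loops in 8KB chunks until Read() returns an empty
  // fragment, so on success 'contents' holds the entire file.
  return s;
}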
- -#include -#include -#include -#include "base/at_exit.h" -#include "base/file_path.h" -#include "base/file_util.h" -#include "base/lazy_instance.h" -#include "base/memory/ref_counted.h" -#include "base/message_loop.h" -#include "base/platform_file.h" -#include "base/process_util.h" -#include "base/synchronization/lock.h" -#include "base/sys_info.h" -#include "base/task.h" -#include "base/threading/platform_thread.h" -#include "base/threading/thread.h" -#include "base/utf_string_conversions.h" -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "port/port.h" -#include "util/logging.h" - -#if defined(OS_WIN) -#include -#include "base/win/win_util.h" -#endif - -#if defined(OS_MACOSX) || defined(OS_WIN) -// The following are glibc-specific -extern "C" { -size_t fread_unlocked(void *ptr, size_t size, size_t n, FILE *file) { - return fread(ptr, size, n, file); -} - -size_t fwrite_unlocked(const void *ptr, size_t size, size_t n, FILE *file) { - return fwrite(ptr, size, n, file); -} - -int fflush_unlocked(FILE *file) { - return fflush(file); -} - -int fdatasync(int fildes) { -#if defined(OS_WIN) - return _commit(fildes); -#else - return fsync(fildes); -#endif -} -} -#endif - -namespace leveldb { - -namespace { - -class Thread; - -static const ::FilePath::CharType kLevelDBTestDirectoryPrefix[] - = FILE_PATH_LITERAL("leveldb-test-"); - -::FilePath CreateFilePath(const std::string& file_path) { -#if defined(OS_WIN) - return FilePath(UTF8ToUTF16(file_path)); -#else - return FilePath(file_path); -#endif -} - -std::string FilePathToString(const ::FilePath& file_path) { -#if defined(OS_WIN) - return UTF16ToUTF8(file_path.value()); -#else - return file_path.value(); -#endif -} - -// TODO(jorlow): This should be moved into Chromium's base. -const char* PlatformFileErrorString(const ::base::PlatformFileError& error) { - switch (error) { - case ::base::PLATFORM_FILE_ERROR_FAILED: - return "Opening file failed."; - case ::base::PLATFORM_FILE_ERROR_IN_USE: - return "File currently in use."; - case ::base::PLATFORM_FILE_ERROR_EXISTS: - return "File already exists."; - case ::base::PLATFORM_FILE_ERROR_NOT_FOUND: - return "File not found."; - case ::base::PLATFORM_FILE_ERROR_ACCESS_DENIED: - return "Access denied."; - case ::base::PLATFORM_FILE_ERROR_TOO_MANY_OPENED: - return "Too many files open."; - case ::base::PLATFORM_FILE_ERROR_NO_MEMORY: - return "Out of memory."; - case ::base::PLATFORM_FILE_ERROR_NO_SPACE: - return "No space left on drive."; - case ::base::PLATFORM_FILE_ERROR_NOT_A_DIRECTORY: - return "Not a directory."; - case ::base::PLATFORM_FILE_ERROR_INVALID_OPERATION: - return "Invalid operation."; - case ::base::PLATFORM_FILE_ERROR_SECURITY: - return "Security error."; - case ::base::PLATFORM_FILE_ERROR_ABORT: - return "File operation aborted."; - case ::base::PLATFORM_FILE_ERROR_NOT_A_FILE: - return "The supplied path was not a file."; - case ::base::PLATFORM_FILE_ERROR_NOT_EMPTY: - return "The file was not empty."; - } - NOTIMPLEMENTED(); - return "Unknown error."; -} - -class ChromiumSequentialFile: public SequentialFile { - private: - std::string filename_; - FILE* file_; - - public: - ChromiumSequentialFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - virtual ~ChromiumSequentialFile() { fclose(file_); } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - Status s; - size_t r = fread_unlocked(scratch, 1, n, file_); - *result = Slice(scratch, r); - if (r < n) { - if (feof(file_)) { - // We leave status as ok if we hit the end of the file 
- } else { - // A partial read with an error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); - } - } - return s; - } -}; - -class ChromiumRandomAccessFile: public RandomAccessFile { - private: - std::string filename_; - ::base::PlatformFile file_; - - public: - ChromiumRandomAccessFile(const std::string& fname, ::base::PlatformFile file) - : filename_(fname), file_(file) { } - virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; - int r = ::base::ReadPlatformFile(file_, offset, scratch, n); - *result = Slice(scratch, (r < 0) ? 0 : r); - if (r < 0) { - // An error: return a non-ok status - s = Status::IOError(filename_, "Could not perform read"); - } - return s; - } -}; - -class ChromiumWritableFile : public WritableFile { - private: - std::string filename_; - FILE* file_; - - public: - ChromiumWritableFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - - ~ChromiumWritableFile() { - if (file_ != NULL) { - // Ignoring any potential errors - fclose(file_); - } - } - - virtual Status Append(const Slice& data) { - size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_); - Status result; - if (r != data.size()) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } - - virtual Status Close() { - Status result; - if (fclose(file_) != 0) { - result = Status::IOError(filename_, strerror(errno)); - } - file_ = NULL; - return result; - } - - virtual Status Flush() { - Status result; - if (fflush_unlocked(file_) != 0) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } - - virtual Status Sync() { - Status result; - if ((fflush_unlocked(file_) != 0) || - (fdatasync(fileno(file_)) != 0)) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } -}; - -class ChromiumFileLock : public FileLock { - public: - ::base::PlatformFile file_; -}; - -class ChromiumEnv : public Env { - public: - ChromiumEnv(); - virtual ~ChromiumEnv() { - fprintf(stderr, "Destroying Env::Default()\n"); - exit(1); - } - - virtual Status NewSequentialFile(const std::string& fname, - SequentialFile** result) { - FILE* f = fopen(fname.c_str(), "rb"); - if (f == NULL) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } else { - *result = new ChromiumSequentialFile(fname, f); - return Status::OK(); - } - } - - virtual Status NewRandomAccessFile(const std::string& fname, - RandomAccessFile** result) { - int flags = ::base::PLATFORM_FILE_READ | ::base::PLATFORM_FILE_OPEN; - bool created; - ::base::PlatformFileError error_code; - ::base::PlatformFile file = ::base::CreatePlatformFile( - CreateFilePath(fname), flags, &created, &error_code); - if (error_code != ::base::PLATFORM_FILE_OK) { - *result = NULL; - return Status::IOError(fname, PlatformFileErrorString(error_code)); - } - *result = new ChromiumRandomAccessFile(fname, file); - return Status::OK(); - } - - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { - *result = NULL; - FILE* f = fopen(fname.c_str(), "wb"); - if (f == NULL) { - return Status::IOError(fname, strerror(errno)); - } else { - *result = new ChromiumWritableFile(fname, f); - return Status::OK(); - } - } - - virtual bool FileExists(const std::string& fname) { - return ::file_util::PathExists(CreateFilePath(fname)); - } - - virtual Status GetChildren(const std::string& dir, - std::vector<std::string>* result) { - 
result->clear(); - ::file_util::FileEnumerator iter( - CreateFilePath(dir), false, ::file_util::FileEnumerator::FILES); - ::FilePath current = iter.Next(); - while (!current.empty()) { - result->push_back(FilePathToString(current.BaseName())); - current = iter.Next(); - } - // TODO(jorlow): Unfortunately, the FileEnumerator swallows errors, so - // we'll always return OK. Maybe manually check for error - // conditions like the file not existing? - return Status::OK(); - } - - virtual Status DeleteFile(const std::string& fname) { - Status result; - // TODO(jorlow): Should we assert this is a file? - if (!::file_util::Delete(CreateFilePath(fname), false)) { - result = Status::IOError(fname, "Could not delete file."); - } - return result; - }; - - virtual Status CreateDir(const std::string& name) { - Status result; - if (!::file_util::CreateDirectory(CreateFilePath(name))) { - result = Status::IOError(name, "Could not create directory."); - } - return result; - }; - - virtual Status DeleteDir(const std::string& name) { - Status result; - // TODO(jorlow): Should we assert this is a directory? - if (!::file_util::Delete(CreateFilePath(name), false)) { - result = Status::IOError(name, "Could not delete directory."); - } - return result; - }; - - virtual Status GetFileSize(const std::string& fname, uint64_t* size) { - Status s; - int64_t signed_size; - if (!::file_util::GetFileSize(CreateFilePath(fname), &signed_size)) { - *size = 0; - s = Status::IOError(fname, "Could not determine file size."); - } else { - *size = static_cast<uint64_t>(signed_size); - } - return s; - } - - virtual Status RenameFile(const std::string& src, const std::string& dst) { - Status result; - if (!::file_util::ReplaceFile(CreateFilePath(src), CreateFilePath(dst))) { - result = Status::IOError(src, "Could not rename file."); - } - return result; - } - - virtual Status LockFile(const std::string& fname, FileLock** lock) { - *lock = NULL; - Status result; - int flags = ::base::PLATFORM_FILE_OPEN_ALWAYS | - ::base::PLATFORM_FILE_READ | - ::base::PLATFORM_FILE_WRITE | - ::base::PLATFORM_FILE_EXCLUSIVE_READ | - ::base::PLATFORM_FILE_EXCLUSIVE_WRITE; - bool created; - ::base::PlatformFileError error_code; - ::base::PlatformFile file = ::base::CreatePlatformFile( - CreateFilePath(fname), flags, &created, &error_code); - if (error_code != ::base::PLATFORM_FILE_OK) { - result = Status::IOError(fname, PlatformFileErrorString(error_code)); - } else { - ChromiumFileLock* my_lock = new ChromiumFileLock; - my_lock->file_ = file; - *lock = my_lock; - } - return result; - } - - virtual Status UnlockFile(FileLock* lock) { - ChromiumFileLock* my_lock = reinterpret_cast<ChromiumFileLock*>(lock); - Status result; - if (!::base::ClosePlatformFile(my_lock->file_)) { - result = Status::IOError("Could not close lock file."); - } - delete my_lock; - return result; - } - - virtual void Schedule(void (*function)(void*), void* arg); - - virtual void StartThread(void (*function)(void* arg), void* arg); - - virtual std::string UserIdentifier() { -#if defined(OS_WIN) - std::wstring user_sid; - bool ret = ::base::win::GetUserSidString(&user_sid); - DCHECK(ret); - return UTF16ToUTF8(user_sid); -#else - char buf[100]; - snprintf(buf, sizeof(buf), "%d", int(geteuid())); - return buf; -#endif - } - - virtual Status GetTestDirectory(std::string* path) { - mu_.Acquire(); - if (test_directory_.empty()) { - if (!::file_util::CreateNewTempDirectory(kLevelDBTestDirectoryPrefix, - &test_directory_)) { - mu_.Release(); - return Status::IOError("Could not create temp directory."); - } - } - 
*path = FilePathToString(test_directory_); - mu_.Release(); - return Status::OK(); - } - - virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { - // TODO(jorlow): We may want to just use Chromium's built in logging. - - uint64_t thread_id = 0; - // Copied from base/logging.cc. -#if defined(OS_WIN) - thread_id = GetCurrentThreadId(); -#elif defined(OS_MACOSX) - thread_id = mach_thread_self(); -#elif defined(OS_LINUX) - thread_id = syscall(__NR_gettid); -#elif defined(OS_FREEBSD) || defined(OS_NACL) - // TODO(BSD): find a better thread ID - pthread_t tid = pthread_self(); - memcpy(&thread_id, &tid, min(sizeof(thread_id), sizeof(tid))); -#endif - - // We try twice: the first time with a fixed-size stack allocated buffer, - // and the second time with a much larger dynamically allocated buffer. - char buffer[500]; - for (int iter = 0; iter < 2; iter++) { - char* base; - int bufsize; - if (iter == 0) { - bufsize = sizeof(buffer); - base = buffer; - } else { - bufsize = 30000; - base = new char[bufsize]; - } - char* p = base; - char* limit = base + bufsize; - - ::base::Time::Exploded t; - ::base::Time::Now().LocalExplode(&t); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - t.year, - t.month, - t.day_of_month, - t.hour, - t.minute, - t.second, - static_cast<int>(t.millisecond) * 1000, - static_cast<unsigned long long>(thread_id)); - - // Print the message - if (p < limit) { - va_list backup_ap; - va_copy(backup_ap, ap); - p += vsnprintf(p, limit - p, format, backup_ap); - va_end(backup_ap); - } - - // Truncate to available space if necessary - if (p >= limit) { - if (iter == 0) { - continue; // Try again with larger buffer - } else { - p = limit - 1; - } - } - - // Add newline if necessary - if (p == base || p[-1] != '\n') { - *p++ = '\n'; - } - - assert(p <= limit); - info_log->Append(Slice(base, p - base)); - info_log->Flush(); - if (base != buffer) { - delete[] base; - } - break; - } - } - - virtual int AppendLocalTimeToBuffer(char* buffer, size_t size) { - ::base::Time::Exploded t; - ::base::Time::Now().LocalExplode(&t); - return snprintf(buffer, size, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d", - t.year, - t.month, - t.day_of_month, - t.hour, - t.minute, - t.second, - static_cast<int>(t.millisecond) * 1000); - } - - virtual uint64_t NowMicros() { - return ::base::TimeTicks::HighResNow().ToInternalValue(); - } - - virtual void SleepForMicroseconds(int micros) { - // Round up to the next millisecond. 
- ::base::PlatformThread::Sleep((micros + 999) / 1000); - } - - private: - // BGThread() is the body of the background thread - void BGThread(); - static void BGThreadWrapper(void* arg) { - reinterpret_cast<ChromiumEnv*>(arg)->BGThread(); - } - - FilePath test_directory_; - - size_t page_size_; - ::base::Lock mu_; - ::base::ConditionVariable bgsignal_; - bool started_bgthread_; - - // Entry per Schedule() call - struct BGItem { void* arg; void (*function)(void*); }; - typedef std::deque<BGItem> BGQueue; - BGQueue queue_; -}; - -ChromiumEnv::ChromiumEnv() - : page_size_(::base::SysInfo::VMAllocationGranularity()), - bgsignal_(&mu_), - started_bgthread_(false) { -#if defined(OS_MACOSX) - ::base::EnableTerminationOnHeapCorruption(); - ::base::EnableTerminationOnOutOfMemory(); -#endif // OS_MACOSX -} - -class Thread : public ::base::PlatformThread::Delegate { - public: - Thread(void (*function)(void* arg), void* arg) - : function_(function), arg_(arg) { - ::base::PlatformThreadHandle handle; - bool success = ::base::PlatformThread::Create(0, this, &handle); - DCHECK(success); - } - virtual ~Thread() {} - virtual void ThreadMain() { - (*function_)(arg_); - delete this; - } - - private: - void (*function_)(void* arg); - void* arg_; -}; - -void ChromiumEnv::Schedule(void (*function)(void*), void* arg) { - mu_.Acquire(); - - // Start background thread if necessary - if (!started_bgthread_) { - started_bgthread_ = true; - StartThread(&ChromiumEnv::BGThreadWrapper, this); - } - - // If the queue is currently empty, the background thread may currently be - // waiting. - if (queue_.empty()) { - bgsignal_.Signal(); - } - - // Add to priority queue - queue_.push_back(BGItem()); - queue_.back().function = function; - queue_.back().arg = arg; - - mu_.Release(); -} - -void ChromiumEnv::BGThread() { - while (true) { - // Wait until there is an item that is ready to run - mu_.Acquire(); - while (queue_.empty()) { - bgsignal_.Wait(); - } - - void (*function)(void*) = queue_.front().function; - void* arg = queue_.front().arg; - queue_.pop_front(); - - mu_.Release(); - (*function)(arg); - } -} - -void ChromiumEnv::StartThread(void (*function)(void* arg), void* arg) { - new Thread(function, arg); // Will self-delete. -} - -::base::LazyInstance<ChromiumEnv, ::base::LeakyLazyInstanceTraits<ChromiumEnv> > - default_env(::base::LINKER_INITIALIZED); - -} - -Env* Env::Default() { - return default_env.Pointer(); -} - -} diff --git a/leveldb/util/env_posix.cc b/leveldb/util/env_posix.cc deleted file mode 100644 index 5cddb0c..0000000 --- a/leveldb/util/env_posix.cc +++ /dev/null @@ -1,599 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
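The Schedule()/BGThread() pair above is a classic single-consumer work queue: a mutex guards a FIFO of callbacks, a condition variable wakes the lone background thread when the queue goes from empty to non-empty, and each callback runs with the mutex released. A minimal standalone sketch of the same pattern, using standard C++ threads in place of the Chromium base types (all names here are illustrative, not taken from the patch):

    #include <condition_variable>
    #include <deque>
    #include <mutex>
    #include <thread>

    // One background thread draining a FIFO of (function, arg) items,
    // mirroring the Schedule()/BGThread() structure above.
    class WorkQueue {
     public:
      void Schedule(void (*function)(void*), void* arg) {
        std::lock_guard<std::mutex> l(mu_);
        if (!started_) {                       // Lazily start the worker.
          started_ = true;
          std::thread([this] { Drain(); }).detach();
        }
        if (queue_.empty()) cv_.notify_one();  // Worker may be waiting.
        queue_.push_back(Item{function, arg});
      }

     private:
      struct Item { void (*function)(void*); void* arg; };

      void Drain() {
        while (true) {
          std::unique_lock<std::mutex> l(mu_);
          cv_.wait(l, [this] { return !queue_.empty(); });
          Item item = queue_.front();
          queue_.pop_front();
          l.unlock();                          // Run the callback unlocked.
          (*item.function)(item.arg);
        }
      }

      std::mutex mu_;
      std::condition_variable cv_;
      std::deque<Item> queue_;
      bool started_ = false;
    };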
- -#include <deque> -#include <dirent.h> -#include <errno.h> -#include <fcntl.h> -#include <pthread.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <sys/time.h> -#include <sys/types.h> -#include <time.h> -#include <unistd.h> -#if defined(LEVELDB_PLATFORM_ANDROID) -#include <sys/stat.h> -#endif -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "port/port.h" -#include "util/logging.h" - -namespace leveldb { - -namespace { - -class PosixSequentialFile: public SequentialFile { - private: - std::string filename_; - FILE* file_; - - public: - PosixSequentialFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - virtual ~PosixSequentialFile() { fclose(file_); } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - Status s; - size_t r = fread_unlocked(scratch, 1, n, file_); - *result = Slice(scratch, r); - if (r < n) { - if (feof(file_)) { - // We leave status as ok if we hit the end of the file - } else { - // A partial read with an error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); - } - } - return s; - } -}; - -class PosixRandomAccessFile: public RandomAccessFile { - private: - std::string filename_; - int fd_; - - public: - PosixRandomAccessFile(const std::string& fname, int fd) - : filename_(fname), fd_(fd) { } - virtual ~PosixRandomAccessFile() { close(fd_); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; - ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset)); - *result = Slice(scratch, (r < 0) ? 0 : r); - if (r < 0) { - // An error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); - } - return s; - } -}; - -// We preallocate up to an extra megabyte and use memcpy to append new -// data to the file. This is safe since we either properly close the -// file before reading from it, or for log files, the reading code -// knows enough to skip zero suffixes. -class PosixMmapFile : public WritableFile { - private: - std::string filename_; - int fd_; - size_t page_size_; - size_t map_size_; // How much extra memory to map at a time - char* base_; // The mapped region - char* limit_; // Limit of the mapped region - char* dst_; // Where to write next (in range [base_,limit_]) - char* last_sync_; // Where have we synced up to - uint64_t file_offset_; // Offset of base_ in file - - // Have we done an munmap of unsynced data? 
- bool pending_sync_; - - // Roundup x to a multiple of y - static size_t Roundup(size_t x, size_t y) { - return ((x + y - 1) / y) * y; - } - - size_t TruncateToPageBoundary(size_t s) { - s -= (s & (page_size_ - 1)); - assert((s % page_size_) == 0); - return s; - } - - void UnmapCurrentRegion() { - if (base_ != NULL) { - if (last_sync_ < limit_) { - // Defer syncing this data until next Sync() call, if any - pending_sync_ = true; - } - munmap(base_, limit_ - base_); - file_offset_ += limit_ - base_; - base_ = NULL; - limit_ = NULL; - last_sync_ = NULL; - dst_ = NULL; - - // Increase the amount we map the next time, but capped at 1MB - if (map_size_ < (1<<20)) { - map_size_ *= 2; - } - } - } - - bool MapNewRegion() { - assert(base_ == NULL); - if (ftruncate(fd_, file_offset_ + map_size_) < 0) { - return false; - } - void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, - fd_, file_offset_); - if (ptr == MAP_FAILED) { - return false; - } - base_ = reinterpret_cast<char*>(ptr); - limit_ = base_ + map_size_; - dst_ = base_; - last_sync_ = base_; - return true; - } - - public: - PosixMmapFile(const std::string& fname, int fd, size_t page_size) - : filename_(fname), - fd_(fd), - page_size_(page_size), - map_size_(Roundup(65536, page_size)), - base_(NULL), - limit_(NULL), - dst_(NULL), - last_sync_(NULL), - file_offset_(0), - pending_sync_(false) { - assert((page_size & (page_size - 1)) == 0); - } - - - ~PosixMmapFile() { - if (fd_ >= 0) { - PosixMmapFile::Close(); - } - } - - virtual Status Append(const Slice& data) { - const char* src = data.data(); - size_t left = data.size(); - while (left > 0) { - assert(base_ <= dst_); - assert(dst_ <= limit_); - size_t avail = limit_ - dst_; - if (avail == 0) { - UnmapCurrentRegion(); - MapNewRegion(); - } - - size_t n = (left <= avail) ? left : avail; - memcpy(dst_, src, n); - dst_ += n; - src += n; - left -= n; - } - return Status::OK(); - } - - virtual Status Close() { - Status s; - size_t unused = limit_ - dst_; - UnmapCurrentRegion(); - if (unused > 0) { - // Trim the extra space at the end of the file - if (ftruncate(fd_, file_offset_ - unused) < 0) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - if (close(fd_) < 0) { - if (s.ok()) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - fd_ = -1; - base_ = NULL; - limit_ = NULL; - return s; - } - - virtual Status Flush() { - return Status::OK(); - } - - virtual Status Sync() { - Status s; - - if (pending_sync_) { - // Some unmapped data was not synced - pending_sync_ = false; - if (fdatasync(fd_) < 0) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - if (dst_ > last_sync_) { - // Find the beginnings of the pages that contain the first and last - // bytes to be synced. - size_t p1 = TruncateToPageBoundary(last_sync_ - base_); - size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); - last_sync_ = dst_; - if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { - s = Status::IOError(filename_, strerror(errno)); - } - } - - return s; - } -}; - -static int LockOrUnlock(int fd, bool lock) { - errno = 0; - struct flock f; - memset(&f, 0, sizeof(f)); - f.l_type = (lock ? 
F_WRLCK : F_UNLCK); - f.l_whence = SEEK_SET; - f.l_start = 0; - f.l_len = 0; // Lock/unlock entire file - return fcntl(fd, F_SETLK, &f); -} - -class PosixFileLock : public FileLock { - public: - int fd_; -}; - -class PosixEnv : public Env { - public: - PosixEnv(); - virtual ~PosixEnv() { - fprintf(stderr, "Destroying Env::Default()\n"); - exit(1); - } - - virtual Status NewSequentialFile(const std::string& fname, - SequentialFile** result) { - FILE* f = fopen(fname.c_str(), "r"); - if (f == NULL) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } else { - *result = new PosixSequentialFile(fname, f); - return Status::OK(); - } - } - - virtual Status NewRandomAccessFile(const std::string& fname, - RandomAccessFile** result) { - int fd = open(fname.c_str(), O_RDONLY); - if (fd < 0) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } - *result = new PosixRandomAccessFile(fname, fd); - return Status::OK(); - } - - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { - Status s; - const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); - if (fd < 0) { - *result = NULL; - s = Status::IOError(fname, strerror(errno)); - } else { - *result = new PosixMmapFile(fname, fd, page_size_); - } - return s; - } - - virtual bool FileExists(const std::string& fname) { - return access(fname.c_str(), F_OK) == 0; - } - - virtual Status GetChildren(const std::string& dir, - std::vector<std::string>* result) { - result->clear(); - DIR* d = opendir(dir.c_str()); - if (d == NULL) { - return Status::IOError(dir, strerror(errno)); - } - struct dirent* entry; - while ((entry = readdir(d)) != NULL) { - result->push_back(entry->d_name); - } - closedir(d); - return Status::OK(); - } - - virtual Status DeleteFile(const std::string& fname) { - Status result; - if (unlink(fname.c_str()) != 0) { - result = Status::IOError(fname, strerror(errno)); - } - return result; - }; - - virtual Status CreateDir(const std::string& name) { - Status result; - if (mkdir(name.c_str(), 0755) != 0) { - result = Status::IOError(name, strerror(errno)); - } - return result; - }; - - virtual Status DeleteDir(const std::string& name) { - Status result; - if (rmdir(name.c_str()) != 0) { - result = Status::IOError(name, strerror(errno)); - } - return result; - }; - - virtual Status GetFileSize(const std::string& fname, uint64_t* size) { - Status s; - struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - *size = 0; - s = Status::IOError(fname, strerror(errno)); - } else { - *size = sbuf.st_size; - } - return s; - } - - virtual Status RenameFile(const std::string& src, const std::string& target) { - Status result; - if (rename(src.c_str(), target.c_str()) != 0) { - result = Status::IOError(src, strerror(errno)); - } - return result; - } - - virtual Status LockFile(const std::string& fname, FileLock** lock) { - *lock = NULL; - Status result; - int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); - if (fd < 0) { - result = Status::IOError(fname, strerror(errno)); - } else if (LockOrUnlock(fd, true) == -1) { - result = Status::IOError("lock " + fname, strerror(errno)); - close(fd); - } else { - PosixFileLock* my_lock = new PosixFileLock; - my_lock->fd_ = fd; - *lock = my_lock; - } - return result; - } - - virtual Status UnlockFile(FileLock* lock) { - PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock); - Status result; - if (LockOrUnlock(my_lock->fd_, false) == -1) { - result = Status::IOError(strerror(errno)); - } - close(my_lock->fd_); - delete my_lock; - return result; - } 
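LockFile() above rests on POSIX advisory record locking: LockOrUnlock() requests an exclusive F_WRLCK over the whole file (l_start = 0 with l_len = 0 covers the entire file), so at most one process can hold a database's lock file at a time. A minimal sketch of the same fcntl call in isolation (path and error handling are illustrative):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main() {
      int fd = open("/tmp/demo.lock", O_RDWR | O_CREAT, 0644);
      if (fd < 0) { perror("open"); return 1; }

      struct flock f;
      memset(&f, 0, sizeof(f));
      f.l_type = F_WRLCK;    // exclusive (write) lock
      f.l_whence = SEEK_SET;
      f.l_start = 0;
      f.l_len = 0;           // 0 means "to end of file": lock everything

      if (fcntl(fd, F_SETLK, &f) == -1) {
        // Another process holds the lock; errno is EACCES or EAGAIN.
        fprintf(stderr, "lock held elsewhere: %s\n", strerror(errno));
        return 1;
      }

      // ... critical section ...

      f.l_type = F_UNLCK;    // release, exactly as LockOrUnlock(fd, false)
      fcntl(fd, F_SETLK, &f);
      close(fd);
      return 0;
    }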
- - virtual void Schedule(void (*function)(void*), void* arg); - - virtual void StartThread(void (*function)(void* arg), void* arg); - - virtual Status GetTestDirectory(std::string* result) { - const char* env = getenv("TEST_TMPDIR"); - if (env && env[0] != '\0') { - *result = env; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid())); - *result = buf; - } - // Directory may already exist - CreateDir(*result); - return Status::OK(); - } - - virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { - pthread_t tid = pthread_self(); - uint64_t thread_id = 0; - memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); - - // We try twice: the first time with a fixed-size stack allocated buffer, - // and the second time with a much larger dynamically allocated buffer. - char buffer[500]; - for (int iter = 0; iter < 2; iter++) { - char* base; - int bufsize; - if (iter == 0) { - bufsize = sizeof(buffer); - base = buffer; - } else { - bufsize = 30000; - base = new char[bufsize]; - } - char* p = base; - char* limit = base + bufsize; - - struct timeval now_tv; - gettimeofday(&now_tv, NULL); - const time_t seconds = now_tv.tv_sec; - struct tm t; - localtime_r(&seconds, &t); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec, - static_cast<int>(now_tv.tv_usec), - static_cast<unsigned long long>(thread_id)); - - // Print the message - if (p < limit) { - va_list backup_ap; - va_copy(backup_ap, ap); - p += vsnprintf(p, limit - p, format, backup_ap); - va_end(backup_ap); - } - - // Truncate to available space if necessary - if (p >= limit) { - if (iter == 0) { - continue; // Try again with larger buffer - } else { - p = limit - 1; - } - } - - // Add newline if necessary - if (p == base || p[-1] != '\n') { - *p++ = '\n'; - } - - assert(p <= limit); - info_log->Append(Slice(base, p - base)); - info_log->Flush(); - if (base != buffer) { - delete[] base; - } - break; - } - } - - virtual uint64_t NowMicros() { - struct timeval tv; - gettimeofday(&tv, NULL); - return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; - } - - virtual void SleepForMicroseconds(int micros) { - usleep(micros); - } - - private: - void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - exit(1); - } - } - - // BGThread() is the body of the background thread - void BGThread(); - static void* BGThreadWrapper(void* arg) { - reinterpret_cast<PosixEnv*>(arg)->BGThread(); - return NULL; - } - - size_t page_size_; - pthread_mutex_t mu_; - pthread_cond_t bgsignal_; - pthread_t bgthread_; - bool started_bgthread_; - - // Entry per Schedule() call - struct BGItem { void* arg; void (*function)(void*); }; - typedef std::deque<BGItem> BGQueue; - BGQueue queue_; -}; - -PosixEnv::PosixEnv() : page_size_(getpagesize()), - started_bgthread_(false) { - PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); - PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); -} - -void PosixEnv::Schedule(void (*function)(void*), void* arg) { - PthreadCall("lock", pthread_mutex_lock(&mu_)); - - // Start background thread if necessary - if (!started_bgthread_) { - started_bgthread_ = true; - PthreadCall( - "create thread", - pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this)); - } - - // If the queue is currently empty, the background thread may currently be - // waiting. 
- if (queue_.empty()) { - PthreadCall("signal", pthread_cond_signal(&bgsignal_)); - } - - // Add to priority queue - queue_.push_back(BGItem()); - queue_.back().function = function; - queue_.back().arg = arg; - - PthreadCall("unlock", pthread_mutex_unlock(&mu_)); -} - -void PosixEnv::BGThread() { - while (true) { - // Wait until there is an item that is ready to run - PthreadCall("lock", pthread_mutex_lock(&mu_)); - while (queue_.empty()) { - PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); - } - - void (*function)(void*) = queue_.front().function; - void* arg = queue_.front().arg; - queue_.pop_front(); - - PthreadCall("unlock", pthread_mutex_unlock(&mu_)); - (*function)(arg); - } -} - -namespace { -struct StartThreadState { - void (*user_function)(void*); - void* arg; -}; -} -static void* StartThreadWrapper(void* arg) { - StartThreadState* state = reinterpret_cast<StartThreadState*>(arg); - state->user_function(state->arg); - delete state; - return NULL; -} - -void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { - pthread_t t; - StartThreadState* state = new StartThreadState; - state->user_function = function; - state->arg = arg; - PthreadCall("start thread", - pthread_create(&t, NULL, &StartThreadWrapper, state)); -} - -} - -static pthread_once_t once = PTHREAD_ONCE_INIT; -static Env* default_env; -static void InitDefaultEnv() { default_env = new PosixEnv; } - -Env* Env::Default() { - pthread_once(&once, InitDefaultEnv); - return default_env; -} - -} diff --git a/leveldb/util/env_test.cc b/leveldb/util/env_test.cc deleted file mode 100644 index 3c253be..0000000 --- a/leveldb/util/env_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
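env_posix.cc closes, just above, with the pthread_once idiom: the first caller of Env::Default() constructs the PosixEnv exactly once even under concurrent callers, and the singleton is deliberately never destroyed (its destructor aborts the process). In C++11 and later the same guarantee can be had from a function-local static; a sketch for comparison (the type name is illustrative):

    class EnvLike { /* stand-in for leveldb::Env */ };

    // C++11 equivalent of the pthread_once pattern above: initialization
    // of a function-local static runs exactly once; concurrent callers
    // block until it finishes. The object is deliberately leaked, just
    // like the PosixEnv singleton.
    EnvLike* DefaultEnv() {
      static EnvLike* env = new EnvLike;
      return env;
    }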
- -#include "leveldb/env.h" - -#include "port/port.h" -#include "util/testharness.h" - -namespace leveldb { - -static const int kDelayMicros = 100000; - -class EnvPosixTest { - private: - port::Mutex mu_; - std::string events_; - - public: - Env* env_; - EnvPosixTest() : env_(Env::Default()) { } -}; - -static void SetBool(void* ptr) { - *(reinterpret_cast(ptr)) = true; -} - -TEST(EnvPosixTest, RunImmediately) { - bool called = false; - env_->Schedule(&SetBool, &called); - Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_TRUE(called); -} - -TEST(EnvPosixTest, RunMany) { - int last_id = 0; - - struct CB { - int* last_id_ptr; // Pointer to shared slot - int id; // Order# for the execution of this callback - - CB(int* p, int i) : last_id_ptr(p), id(i) { } - - static void Run(void* v) { - CB* cb = reinterpret_cast(v); - ASSERT_EQ(cb->id-1, *cb->last_id_ptr); - *cb->last_id_ptr = cb->id; - } - }; - - // Schedule in different order than start time - CB cb1(&last_id, 1); - CB cb2(&last_id, 2); - CB cb3(&last_id, 3); - CB cb4(&last_id, 4); - env_->Schedule(&CB::Run, &cb1); - env_->Schedule(&CB::Run, &cb2); - env_->Schedule(&CB::Run, &cb3); - env_->Schedule(&CB::Run, &cb4); - - Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_EQ(4, last_id); -} - -struct State { - port::Mutex mu; - int val; - int num_running; -}; - -static void ThreadBody(void* arg) { - State* s = reinterpret_cast(arg); - s->mu.Lock(); - s->val += 1; - s->num_running -= 1; - s->mu.Unlock(); -} - -TEST(EnvPosixTest, StartThread) { - State state; - state.val = 0; - state.num_running = 3; - for (int i = 0; i < 3; i++) { - env_->StartThread(&ThreadBody, &state); - } - while (true) { - state.mu.Lock(); - int num = state.num_running; - state.mu.Unlock(); - if (num == 0) { - break; - } - Env::Default()->SleepForMicroseconds(kDelayMicros); - } - ASSERT_EQ(state.val, 3); -} - -} - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/leveldb/util/hash.cc b/leveldb/util/hash.cc deleted file mode 100644 index d19afd1..0000000 --- a/leveldb/util/hash.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include "util/coding.h" -#include "util/hash.h" - -namespace leveldb { - -uint32_t Hash(const char* data, size_t n, uint32_t seed) { - // Similar to murmur hash - const uint32_t m = 0xc6a4a793; - const uint32_t r = 24; - const char* limit = data + n; - uint32_t h = seed ^ (n * m); - - // Pick up four bytes at a time - while (data + 4 <= limit) { - uint32_t w = DecodeFixed32(data); - data += 4; - h += w; - h *= m; - h ^= (h >> 16); - } - - // Pick up remaining bytes - switch (limit - data) { - case 3: - h += data[2] << 16; - // fall through - case 2: - h += data[1] << 8; - // fall through - case 1: - h += data[0]; - h *= m; - h ^= (h >> r); - break; - } - return h; -} - - -} diff --git a/leveldb/util/hash.h b/leveldb/util/hash.h deleted file mode 100644 index 8889d56..0000000 --- a/leveldb/util/hash.h +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
-// -// Simple hash function used for internal data structures - -#ifndef STORAGE_LEVELDB_UTIL_HASH_H_ -#define STORAGE_LEVELDB_UTIL_HASH_H_ - -#include -#include - -namespace leveldb { - -extern uint32_t Hash(const char* data, size_t n, uint32_t seed); - -} - -#endif // STORAGE_LEVELDB_UTIL_HASH_H_ diff --git a/leveldb/util/histogram.cc b/leveldb/util/histogram.cc deleted file mode 100644 index c5178ef..0000000 --- a/leveldb/util/histogram.cc +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include -#include "port/port.h" -#include "util/histogram.h" - -namespace leveldb { - -const double Histogram::kBucketLimit[kNumBuckets] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, - 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, - 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, - 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, - 16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, - 70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000, - 250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000, - 900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, - 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000, - 9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000, - 25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000, - 70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000, - 180000000, 200000000, 250000000, 300000000, 350000000, 400000000, - 450000000, 500000000, 600000000, 700000000, 800000000, 900000000, - 1000000000, 1200000000, 1400000000, 1600000000, 1800000000, 2000000000, - 2500000000.0, 3000000000.0, 3500000000.0, 4000000000.0, 4500000000.0, - 5000000000.0, 6000000000.0, 7000000000.0, 8000000000.0, 9000000000.0, - 1e200, -}; - -void Histogram::Clear() { - min_ = kBucketLimit[kNumBuckets-1]; - max_ = 0; - num_ = 0; - sum_ = 0; - sum_squares_ = 0; - for (int i = 0; i < kNumBuckets; i++) { - buckets_[i] = 0; - } -} - -void Histogram::Add(double value) { - // Linear search is fast enough for our usage in db_bench - int b = 0; - while (b < kNumBuckets - 1 && kBucketLimit[b] <= value) { - b++; - } - buckets_[b] += 1.0; - if (min_ > value) min_ = value; - if (max_ < value) max_ = value; - num_++; - sum_ += value; - sum_squares_ += (value * value); -} - -double Histogram::Median() const { - return Percentile(50.0); -} - -double Histogram::Percentile(double p) const { - double threshold = num_ * (p / 100.0); - double sum = 0; - for (int b = 0; b < kNumBuckets; b++) { - sum += buckets_[b]; - if (sum >= threshold) { - // Scale linearly within this bucket - double left_point = (b == 0) ? 
0 : kBucketLimit[b-1]; - double right_point = kBucketLimit[b]; - double left_sum = sum - buckets_[b]; - double right_sum = sum; - double pos = (threshold - left_sum) / (right_sum - left_sum); - double r = left_point + (right_point - left_point) * pos; - if (r < min_) r = min_; - if (r > max_) r = max_; - return r; - } - } - return max_; -} - -double Histogram::Average() const { - if (num_ == 0.0) return 0; - return sum_ / num_; -} - -double Histogram::StandardDeviation() const { - if (num_ == 0.0) return 0; - double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_); - return sqrt(variance); -} - -std::string Histogram::ToString() const { - std::string r; - char buf[200]; - snprintf(buf, sizeof(buf), - "Count: %.0f Average: %.4f StdDev: %.2f\n", - num_, Average(), StandardDeviation()); - r.append(buf); - snprintf(buf, sizeof(buf), - "Min: %.4f Median: %.4f Max: %.4f\n", - (num_ == 0.0 ? 0.0 : min_), Median(), max_); - r.append(buf); - r.append("------------------------------------------------------\n"); - const double mult = 100.0 / num_; - double sum = 0; - for (int b = 0; b < kNumBuckets; b++) { - if (buckets_[b] <= 0.0) continue; - sum += buckets_[b]; - snprintf(buf, sizeof(buf), - "[ %7.0f, %7.0f ) %7.0f %7.3f%% %7.3f%% ", - ((b == 0) ? 0.0 : kBucketLimit[b-1]), // left - kBucketLimit[b], // right - buckets_[b], // count - mult * buckets_[b], // percentage - mult * sum); // cumulative percentage - r.append(buf); - - // Add hash marks based on percentage; 20 marks for 100%. - int marks = static_cast(20*(buckets_[b] / num_) + 0.5); - r.append(marks, '#'); - r.push_back('\n'); - } - return r; -} - -} diff --git a/leveldb/util/histogram.h b/leveldb/util/histogram.h deleted file mode 100644 index f72f122..0000000 --- a/leveldb/util/histogram.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ -#define STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ - -#include - -namespace leveldb { - -class Histogram { - public: - Histogram() { } - ~Histogram() { } - - void Clear(); - void Add(double value); - - std::string ToString() const; - - private: - double min_; - double max_; - double num_; - double sum_; - double sum_squares_; - - enum { kNumBuckets = 154 }; - static const double kBucketLimit[kNumBuckets]; - double buckets_[kNumBuckets]; - - double Median() const; - double Percentile(double p) const; - double Average() const; - double StandardDeviation() const; -}; - -} - -#endif // STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ diff --git a/leveldb/util/logging.cc b/leveldb/util/logging.cc deleted file mode 100644 index 760d335..0000000 --- a/leveldb/util/logging.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
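Percentile() above scans the buckets until the cumulative count crosses the threshold, then interpolates linearly inside that bucket. A short worked example under assumed inputs (ten samples, all landing in the bucket [20, 25)):

    #include <stdio.h>

    int main() {
      double num = 10.0, p = 50.0;
      double threshold = num * (p / 100.0);                          // 5.0
      double left_point = 20.0, right_point = 25.0;                  // bucket bounds
      double left_sum = 0.0, right_sum = 10.0;                       // cumulative counts
      double pos = (threshold - left_sum) / (right_sum - left_sum);  // 0.5
      double r = left_point + (right_point - left_point) * pos;      // 22.5
      printf("p50 = %.1f\n", r);  // the real code also clamps to [min_, max_]
      return 0;
    }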
- -#include "util/logging.h" - -#include -#include -#include -#include -#include "leveldb/env.h" -#include "leveldb/slice.h" - -namespace leveldb { - -void AppendNumberTo(std::string* str, uint64_t num) { - char buf[30]; - snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); - str->append(buf); -} - -void AppendEscapedStringTo(std::string* str, const Slice& value) { - for (size_t i = 0; i < value.size(); i++) { - char c = value[i]; - if (c >= ' ' && c <= '~') { - str->push_back(c); - } else { - char buf[10]; - snprintf(buf, sizeof(buf), "\\x%02x", - static_cast(c) & 0xff); - str->append(buf); - } - } -} - -std::string NumberToString(uint64_t num) { - std::string r; - AppendNumberTo(&r, num); - return r; -} - -std::string EscapeString(const Slice& value) { - std::string r; - AppendEscapedStringTo(&r, value); - return r; -} - -bool ConsumeChar(Slice* in, char c) { - if (!in->empty() && (*in)[0] == c) { - in->remove_prefix(1); - return true; - } else { - return false; - } -} - -bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { - uint64_t v = 0; - int digits = 0; - while (!in->empty()) { - char c = (*in)[0]; - if (c >= '0' && c <= '9') { - ++digits; - const int delta = (c - '0'); - static const uint64_t kMaxUint64 = ~static_cast(0); - if (v > kMaxUint64/10 || - (v == kMaxUint64/10 && delta > kMaxUint64%10)) { - // Overflow - return false; - } - v = (v * 10) + delta; - in->remove_prefix(1); - } else { - break; - } - } - *val = v; - return (digits > 0); -} - -} diff --git a/leveldb/util/logging.h b/leveldb/util/logging.h deleted file mode 100644 index 1cd0a4b..0000000 --- a/leveldb/util/logging.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Must not be included from any .h files to avoid polluting the namespace -// with macros. - -#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_ -#define STORAGE_LEVELDB_UTIL_LOGGING_H_ - -#include -#include -#include -#include "port/port.h" - -namespace leveldb { - -class Slice; -class WritableFile; - -// Append a human-readable printout of "num" to *str -extern void AppendNumberTo(std::string* str, uint64_t num); - -// Append a human-readable printout of "value" to *str. -// Escapes any non-printable characters found in "value". -extern void AppendEscapedStringTo(std::string* str, const Slice& value); - -// Return a human-readable printout of "num" -extern std::string NumberToString(uint64_t num); - -// Return a human-readable version of "value". -// Escapes any non-printable characters found in "value". -extern std::string EscapeString(const Slice& value); - -// If *in starts with "c", advances *in past the first character and -// returns true. Otherwise, returns false. -extern bool ConsumeChar(Slice* in, char c); - -// Parse a human-readable number from "*in" into *value. On success, -// advances "*in" past the consumed number and sets "*val" to the -// numeric value. Otherwise, returns false and leaves *in in an -// unspecified state. -extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); - -} - -#endif // STORAGE_LEVELDB_UTIL_LOGGING_H_ diff --git a/leveldb/util/mutexlock.h b/leveldb/util/mutexlock.h deleted file mode 100644 index 05fe279..0000000 --- a/leveldb/util/mutexlock.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ -#define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ - -#include "port/port.h" - -namespace leveldb { - -// Helper class that locks a mutex on construction and unlocks the mutex when -// the destructor of the MutexLock object is invoked. -// -// Typical usage: -// -// void MyClass::MyMethod() { -// MutexLock l(&mu_); // mu_ is an instance variable -// ... some complex code, possibly with multiple return paths ... -// } - -class MutexLock { - public: - explicit MutexLock(port::Mutex *mu) : mu_(mu) { - this->mu_->Lock(); - } - ~MutexLock() { this->mu_->Unlock(); } - - private: - port::Mutex *const mu_; - // No copying allowed - MutexLock(const MutexLock&); - void operator=(const MutexLock&); -}; - -} - - -#endif // STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ diff --git a/leveldb/util/options.cc b/leveldb/util/options.cc deleted file mode 100644 index 0ea5c98..0000000 --- a/leveldb/util/options.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/options.h" - -#include "leveldb/comparator.h" -#include "leveldb/env.h" - -namespace leveldb { - -Options::Options() - : comparator(BytewiseComparator()), - create_if_missing(false), - error_if_exists(false), - paranoid_checks(false), - env(Env::Default()), - info_log(NULL), - write_buffer_size(4<<20), - max_open_files(1000), - block_cache(NULL), - block_size(4096), - block_restart_interval(16), - compression(kSnappyCompression) { -} - - -} diff --git a/leveldb/util/random.h b/leveldb/util/random.h deleted file mode 100644 index d886b4e..0000000 --- a/leveldb/util/random.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_ -#define STORAGE_LEVELDB_UTIL_RANDOM_H_ - -#include - -namespace leveldb { - -// A very simple random number generator. Not especially good at -// generating truly random bits, but good enough for our needs in this -// package. -class Random { - private: - uint32_t seed_; - public: - explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } - uint32_t Next() { - static const uint32_t M = 2147483647L; // 2^31-1 - static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 - // We are computing - // seed_ = (seed_ * A) % M, where M = 2^31-1 - // - // seed_ must not be zero or M, or else all subsequent computed values - // will be zero or M respectively. For all other values, seed_ will end - // up cycling through every number in [1,M-1] - uint64_t product = seed_ * A; - - // Compute (product % M) using the fact that ((x << 31) % M) == x. - seed_ = static_cast((product >> 31) + (product & M)); - // The first reduction may overflow by 1 bit, so we may need to - // repeat. mod == M is not possible; using > allows the faster - // sign-bit-based test. - if (seed_ > M) { - seed_ -= M; - } - return seed_; - } - // Returns a uniformly distributed value in the range [0..n-1] - // REQUIRES: n > 0 - uint32_t Uniform(int n) { return Next() % n; } - - // Randomly returns true ~"1/n" of the time, and false otherwise. 
- // REQUIRES: n > 0 - bool OneIn(int n) { return (Next() % n) == 0; } - - // Skewed: pick "base" uniformly from range [0,max_log] and then - // return "base" random bits. The effect is to pick a number in the - // range [0,2^max_log-1] with exponential bias towards smaller numbers. - uint32_t Skewed(int max_log) { - return Uniform(1 << Uniform(max_log + 1)); - } -}; - -} - -#endif // STORAGE_LEVELDB_UTIL_RANDOM_H_ diff --git a/leveldb/util/status.cc b/leveldb/util/status.cc deleted file mode 100644 index d9b7195..0000000 --- a/leveldb/util/status.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include <stdio.h> -#include "port/port.h" -#include "leveldb/status.h" - -namespace leveldb { - -Status::Status(Code code, const Slice& msg, const Slice& msg2) { - assert(code != kOk); - state_ = new State(make_pair(code, std::string(msg.data(), msg.size()))); - if (!msg2.empty()) { - state_->second.append(": "); - state_->second.append(msg2.data(), msg2.size()); - } -} - -std::string Status::ToString() const { - if (state_ == NULL) { - return "OK"; - } else { - char tmp[30]; - const char* type; - switch (state_->first) { - case kOk: - type = "OK"; - break; - case kNotFound: - type = "NotFound"; - break; - case kCorruption: - type = "Corruption: "; - break; - case kNotSupported: - type = "Not implemented: "; - break; - case kInvalidArgument: - type = "Invalid argument: "; - break; - case kIOError: - type = "IO error: "; - break; - default: - snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", - static_cast<int>(state_->first)); - type = tmp; - break; - } - std::string result(type); - if (!state_->second.empty()) { - result.append(state_->second); - } - return result; - } -} - -} diff --git a/leveldb/util/testharness.cc b/leveldb/util/testharness.cc deleted file mode 100644 index b686ac3..0000000 --- a/leveldb/util/testharness.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/testharness.h" - -#include <string> -#include <stdlib.h> - -namespace leveldb { -namespace test { - -namespace { -struct Test { - const char* base; - const char* name; - void (*func)(); -}; -std::vector<Test>* tests; -} - -bool RegisterTest(const char* base, const char* name, void (*func)()) { - if (tests == NULL) { - tests = new std::vector<Test>; - } - Test t; - t.base = base; - t.name = name; - t.func = func; - tests->push_back(t); - return true; -} - -int RunAllTests() { - int num = 0; - if (tests != NULL) { - for (int i = 0; i < tests->size(); i++) { - const Test& t = (*tests)[i]; - fprintf(stderr, "==== Test %s.%s\n", t.base, t.name); - (*t.func)(); - ++num; - } - } - fprintf(stderr, "==== PASSED %d tests\n", num); - return 0; -} - -std::string TmpDir() { - std::string dir; - Status s = Env::Default()->GetTestDirectory(&dir); - ASSERT_TRUE(s.ok()) << s.ToString(); - return dir; -} - -int RandomSeed() { - const char* env = getenv("TEST_RANDOM_SEED"); - int result = (env != NULL ? 
atoi(env) : 301); - if (result <= 0) { - result = 301; - } - return result; -} - -} -} diff --git a/leveldb/util/testharness.h b/leveldb/util/testharness.h deleted file mode 100644 index 13ab914..0000000 --- a/leveldb/util/testharness.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ -#define STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ - -#include <stdio.h> -#include <stdlib.h> -#include <sstream> -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "util/random.h" - -namespace leveldb { -namespace test { - -// Run all tests registered by the TEST() macro. -// Returns 0 if all tests pass. -// Dies or returns a non-zero value if some test fails. -extern int RunAllTests(); - -// Return the directory to use for temporary storage. -extern std::string TmpDir(); - -// Return a randomization seed for this run. Typically returns the -// same number on repeated invocations of this binary, but automated -// runs may be able to vary the seed. -extern int RandomSeed(); - -// An instance of Tester is allocated to hold temporary state during -// the execution of an assertion. -class Tester { - private: - bool ok_; - const char* fname_; - int line_; - std::stringstream ss_; - - public: - Tester(const char* f, int l) - : ok_(true), fname_(f), line_(l) { - } - - ~Tester() { - if (!ok_) { - fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); - exit(1); - } - } - - Tester& Is(bool b, const char* msg) { - if (!b) { - ss_ << " Assertion failure " << msg; - ok_ = false; - } - return *this; - } - - Tester& IsOk(const Status& s) { - if (!s.ok()) { - ss_ << " " << s.ToString(); - ok_ = false; - } - return *this; - } - -#define BINARY_OP(name,op) \ - template <class X, class Y> \ - Tester& name(const X& x, const Y& y) { \ - if (! (x op y)) { \ - ss_ << " failed: " << x << (" " #op " ") << y; \ - ok_ = false; \ - } \ - return *this; \ - } - - BINARY_OP(IsEq, ==) - BINARY_OP(IsNe, !=) - BINARY_OP(IsGe, >=) - BINARY_OP(IsGt, >) - BINARY_OP(IsLe, <=) - BINARY_OP(IsLt, <) -#undef BINARY_OP - - // Attach the specified value to the error message if an error has occurred - template <class V> - Tester& operator<<(const V& value) { - if (!ok_) { - ss_ << " " << value; - } - return *this; - } -}; - -#define ASSERT_TRUE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is((c), #c) -#define ASSERT_OK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsOk((s)) -#define ASSERT_EQ(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) -#define ASSERT_NE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) -#define ASSERT_GE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) -#define ASSERT_GT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGt((a),(b)) -#define ASSERT_LE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLe((a),(b)) -#define ASSERT_LT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLt((a),(b)) - -#define TCONCAT(a,b) TCONCAT1(a,b) -#define TCONCAT1(a,b) a##b - -#define TEST(base,name) \
class TCONCAT(_Test_,name) : public base { \ - public: \ - void _Run(); \ - static void _RunIt() { \ - TCONCAT(_Test_,name) t; \ - t._Run(); \ - } \ -}; \ -bool TCONCAT(_Test_ignored_,name) = \ - ::leveldb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \ -void TCONCAT(_Test_,name)::_Run() - -// Register the specified test. 
Typically not used directly, but -// invoked via the macro expansion of TEST. -extern bool RegisterTest(const char* base, const char* name, void (*func)()); - - -} -} - -#endif // STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ diff --git a/leveldb/util/testutil.cc b/leveldb/util/testutil.cc deleted file mode 100644 index 8d6cf3c..0000000 --- a/leveldb/util/testutil.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/testutil.h" - -#include "util/random.h" - -namespace leveldb { -namespace test { - -Slice RandomString(Random* rnd, int len, std::string* dst) { - dst->resize(len); - for (int i = 0; i < len; i++) { - (*dst)[i] = static_cast(' ' + rnd->Uniform(95)); // ' ' .. '~' - } - return Slice(*dst); -} - -std::string RandomKey(Random* rnd, int len) { - // Make sure to generate a wide variety of characters so we - // test the boundary conditions for short-key optimizations. - static const char kTestChars[] = { - '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff' - }; - std::string result; - for (int i = 0; i < len; i++) { - result += kTestChars[rnd->Uniform(sizeof(kTestChars))]; - } - return result; -} - - -extern Slice CompressibleString(Random* rnd, double compressed_fraction, - int len, std::string* dst) { - int raw = static_cast(len * compressed_fraction); - if (raw < 1) raw = 1; - std::string raw_data; - RandomString(rnd, raw, &raw_data); - - // Duplicate the random data until we have filled "len" bytes - dst->clear(); - while (dst->size() < len) { - dst->append(raw_data); - } - dst->resize(len); - return Slice(*dst); -} - -} -} diff --git a/leveldb/util/testutil.h b/leveldb/util/testutil.h deleted file mode 100644 index a150c1a..0000000 --- a/leveldb/util/testutil.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_TESTUTIL_H_ -#define STORAGE_LEVELDB_UTIL_TESTUTIL_H_ - -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "util/random.h" - -namespace leveldb { -namespace test { - -// Store in *dst a random string of length "len" and return a Slice that -// references the generated data. -extern Slice RandomString(Random* rnd, int len, std::string* dst); - -// Return a random key with the specified length that may contain interesting -// characters (e.g. \x00, \xff, etc.). -extern std::string RandomKey(Random* rnd, int len); - -// Store in *dst a string of length "len" that will compress to -// "N*compressed_fraction" bytes and return a Slice that references -// the generated data. -extern Slice CompressibleString(Random* rnd, double compressed_fraction, - int len, std::string* dst); - -// A wrapper that allows injection of errors. 
-class ErrorEnv : public EnvWrapper { - public: - bool writable_file_error_; - int num_writable_file_errors_; - - ErrorEnv() : EnvWrapper(Env::Default()), - writable_file_error_(false), - num_writable_file_errors_(0) { } - - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { - if (writable_file_error_) { - ++num_writable_file_errors_; - *result = NULL; - return Status::IOError(fname, "fake error"); - } - return target()->NewWritableFile(fname, result); - } -}; - -} -} - -#endif // STORAGE_LEVELDB_UTIL_TESTUTIL_H_ diff --git a/port/README b/port/README new file mode 100644 index 0000000..422563e --- /dev/null +++ b/port/README @@ -0,0 +1,10 @@ +This directory contains interfaces and implementations that isolate the +rest of the package from platform details. + +Code in the rest of the package includes "port.h" from this directory. +"port.h" in turn includes a platform specific "port_<platform>.h" file +that provides the platform specific implementation. + +See port_posix.h for an example of what must be provided in a platform +specific header file. + diff --git a/port/port.h b/port/port.h new file mode 100644 index 0000000..816826b --- /dev/null +++ b/port/port.h @@ -0,0 +1,21 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_PORT_PORT_H_ +#define STORAGE_LEVELDB_PORT_PORT_H_ + +#include <string.h> + +// Include the appropriate platform specific file below. If you are +// porting to a new platform, see "port_example.h" for documentation +// of what the new port_<platform>.h file must provide. +#if defined(LEVELDB_PLATFORM_POSIX) +# include "port/port_posix.h" +#elif defined(LEVELDB_PLATFORM_CHROMIUM) +# include "port/port_chromium.h" +#elif defined(LEVELDB_PLATFORM_ANDROID) +# include "port/port_android.h" +#endif + +#endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/port/port_android.cc b/port/port_android.cc new file mode 100644 index 0000000..240e9ca --- /dev/null +++ b/port/port_android.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
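As the README above says, a port consists of one header. The minimum surface the rest of the tree uses is small: an endianness constant, Mutex, and CondVar (plus the optional compression, SHA1, and atomic-pointer hooks documented in port_example.h). A skeletal port_<platform>.h following the shape of the POSIX port (the file name and the pthread backing are illustrative assumptions, not a real port in this patch):

    // port/port_myplatform.h -- hypothetical name; a sketch, not a real port.
    #ifndef STORAGE_LEVELDB_PORT_PORT_MYPLATFORM_H_
    #define STORAGE_LEVELDB_PORT_PORT_MYPLATFORM_H_

    #include <pthread.h>
    #include <string>

    namespace leveldb {
    namespace port {

    static const bool kLittleEndian = true;  // adjust for the target

    class Mutex {
     public:
      Mutex()  { pthread_mutex_init(&mu_, NULL); }
      ~Mutex() { pthread_mutex_destroy(&mu_); }
      void Lock()   { pthread_mutex_lock(&mu_); }
      void Unlock() { pthread_mutex_unlock(&mu_); }
      void AssertHeld() { }  // optional debugging hook
     private:
      friend class CondVar;
      pthread_mutex_t mu_;
    };

    class CondVar {
     public:
      explicit CondVar(Mutex* mu) : mu_(mu) { pthread_cond_init(&cv_, NULL); }
      ~CondVar() { pthread_cond_destroy(&cv_); }
      void Wait()      { pthread_cond_wait(&cv_, &mu_->mu_); }
      void Signal()    { pthread_cond_signal(&cv_); }
      void SignalAll() { pthread_cond_broadcast(&cv_); }
     private:
      Mutex* mu_;
      pthread_cond_t cv_;
    };

    }  // namespace port
    }  // namespace leveldb

    #endif  // STORAGE_LEVELDB_PORT_PORT_MYPLATFORM_H_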
+ +#include "port/port_android.h" + +#include + +extern "C" { +size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { + return fread(a, b, c, d); +} + +size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { + return fwrite(a, b, c, d); +} + +int fflush_unlocked(FILE *f) { + return fflush(f); +} + +int fdatasync(int fd) { + return fsync(fd); +} +} + +namespace leveldb { +namespace port { + +static void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } +void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } +void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); +} + +CondVar::~CondVar() { + PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); +} + +void CondVar::Wait() { + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +} + +void CondVar::Signal(){ + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +} +} diff --git a/port/port_android.h b/port/port_android.h new file mode 100644 index 0000000..8680951 --- /dev/null +++ b/port/port_android.h @@ -0,0 +1,158 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ +#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ + +#include +#include +#include +#include +#include +#include +#include + +// Collapse the plethora of ARM flavors available to an easier to manage set +// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto +#if defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7A__) +#define ARMV6_OR_7 1 +#endif + +extern "C" { + size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); + size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); + int fflush_unlocked(FILE *f); + int fdatasync (int fd); +} + +namespace leveldb { +namespace port { + +static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN; + +class CondVar; + +class Mutex { + public: + Mutex(); + ~Mutex(); + + void Lock(); + void Unlock(); + void AssertHeld() { + //TODO(gabor): How can I implement this? 
+  }
+
+ private:
+  friend class CondVar;
+  pthread_mutex_t mu_;
+
+  // No copying
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  void Signal();
+  void SignalAll();
+ private:
+  Mutex* mu_;
+  pthread_cond_t cv_;
+};
+
+#ifndef ARMV6_OR_7
+// On ARM chipsets <V6, 0xffff0fa0 is the hard coded address of the
+// kernel user helper memory barrier
+typedef void (*LinuxKernelMemoryBarrierFunc)(void);
+LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier __attribute__((weak)) =
+    (LinuxKernelMemoryBarrierFunc) 0xffff0fa0;
+#endif
+
+// Storage for a lock-free pointer
+class AtomicPointer {
+ private:
+  void* rep_;
+
+  static inline void MemoryBarrier() {
+#ifdef ARMV6_OR_7
+    __asm__ __volatile__("dmb" : : : "memory");
+#else
+    pLinuxKernelMemoryBarrier();
+#endif
+  }
+
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* v) : rep_(v) { }
+  inline void* Acquire_Load() const {
+    void* r = rep_;
+    MemoryBarrier();
+    return r;
+  }
+  inline void Release_Store(void* v) {
+    MemoryBarrier();
+    rep_ = v;
+  }
+  inline void* NoBarrier_Load() const {
+    void* r = rep_;
+    return r;
+  }
+  inline void NoBarrier_Store(void* v) {
+    rep_ = v;
+  }
+};
+
+// TODO(gabor): Implement compress
+inline bool Snappy_Compress(
+    const char* input,
+    size_t input_length,
+    std::string* output) {
+  return false;
+}
+
+// TODO(gabor): Implement uncompress
+inline bool Snappy_Uncompress(
+    const char* input_data,
+    size_t input_length,
+    std::string* output) {
+  return false;
+}
+
+inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
+  SHA1_CTX sha1_ctx;
+  SHA1Init(&sha1_ctx);
+  SHA1Update(&sha1_ctx, (const u_char*)data, len);
+  SHA1Final((u_char*)hash_array, &sha1_ctx);
+}
+
+inline uint64_t ThreadIdentifier() {
+  pthread_t tid = pthread_self();
+  uint64_t r = 0;
+  memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? sizeof(r) : sizeof(tid));
+  return r;
+}
+
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+  return false;
+}
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
diff --git a/port/port_chromium.cc b/port/port_chromium.cc
new file mode 100644
index 0000000..2ab49b9
--- /dev/null
+++ b/port/port_chromium.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
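+//
+// The functions below are thin forwards onto Chromium's base:: primitives.
+// A round trip through the snappy pair (when USE_SNAPPY is defined) would
+// look something like this sketch, where data/n stand for any input buffer:
+//
+//   std::string compressed, uncompressed;
+//   if (port::Snappy_Compress(data, n, &compressed) &&
+//       port::Snappy_Uncompress(compressed.data(), compressed.size(),
+//                               &uncompressed)) {
+//     assert(uncompressed.size() == n);
+//   }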
+
+#include "port/port_chromium.h"
+
+#include "util/logging.h"
+
+#if defined(USE_SNAPPY)
+# include "third_party/snappy/src/snappy.h"
+#endif
+
+namespace leveldb {
+namespace port {
+
+Mutex::Mutex() {
+}
+
+Mutex::~Mutex() {
+}
+
+void Mutex::Lock() {
+  mu_.Acquire();
+}
+
+void Mutex::Unlock() {
+  mu_.Release();
+}
+
+void Mutex::AssertHeld() {
+  mu_.AssertAcquired();
+}
+
+CondVar::CondVar(Mutex* mu)
+    : cv_(&mu->mu_) {
+}
+
+CondVar::~CondVar() { }
+
+void CondVar::Wait() {
+  cv_.Wait();
+}
+
+void CondVar::Signal() {
+  cv_.Signal();
+}
+
+void CondVar::SignalAll() {
+  cv_.Broadcast();
+}
+
+bool Snappy_Compress(const char* input, size_t input_length,
+                     std::string* output) {
+#if defined(USE_SNAPPY)
+  output->resize(snappy::MaxCompressedLength(input_length));
+  size_t outlen;
+  snappy::RawCompress(input, input_length, &(*output)[0], &outlen);
+  output->resize(outlen);
+  return true;
+#else
+  return false;
+#endif
+}
+
+bool Snappy_Uncompress(const char* input_data, size_t input_length,
+                       std::string* output) {
+#if defined(USE_SNAPPY)
+  size_t ulength;
+  if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) {
+    return false;
+  }
+  output->resize(ulength);
+  return snappy::RawUncompress(input_data, input_length, &(*output)[0]);
+#else
+  return false;
+#endif
+}
+
+}
+}
diff --git a/port/port_chromium.h b/port/port_chromium.h
new file mode 100644
index 0000000..e349f8f
--- /dev/null
+++ b/port/port_chromium.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_
+#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include "base/atomicops.h"
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "base/sha1.h"
+#include "base/synchronization/condition_variable.h"
+#include "base/synchronization/lock.h"
+
+// Linux's ThreadIdentifier() needs this.
+#if defined(OS_LINUX)
+# include <linux/unistd.h>
+#endif
+
+#if defined(OS_WIN)
+#define snprintf _snprintf
+#define va_copy(a, b) do { (a) = (b); } while (0)
+#endif
+
+namespace leveldb {
+namespace port {
+
+// Chromium only supports little endian.
+static const bool kLittleEndian = true;
+
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+  void Lock();
+  void Unlock();
+  void AssertHeld();
+
+ private:
+  base::Lock mu_;
+
+  friend class CondVar;
+  DISALLOW_COPY_AND_ASSIGN(Mutex);
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  void Signal();
+  void SignalAll();
+
+ private:
+  base::ConditionVariable cv_;
+
+  DISALLOW_COPY_AND_ASSIGN(CondVar);
+};
+
+class AtomicPointer {
+ private:
+  typedef base::subtle::AtomicWord Rep;
+  Rep rep_;
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* p) : rep_(reinterpret_cast<Rep>(p)) {}
+  inline void* Acquire_Load() const {
+    return reinterpret_cast<void*>(::base::subtle::Acquire_Load(&rep_));
+  }
+  inline void Release_Store(void* v) {
+    ::base::subtle::Release_Store(&rep_, reinterpret_cast<Rep>(v));
+  }
+  inline void* NoBarrier_Load() const {
+    return reinterpret_cast<void*>(::base::subtle::NoBarrier_Load(&rep_));
+  }
+  inline void NoBarrier_Store(void* v) {
+    ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast<Rep>(v));
+  }
+};
+
+inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
+  return ::base::SHA1HashBytes(reinterpret_cast<const unsigned char*>(data),
+                               len,
+                               reinterpret_cast<unsigned char*>(hash_array));
+}
+
+bool Snappy_Compress(const char* input, size_t input_length,
+                     std::string* output);
+bool Snappy_Uncompress(const char* input_data, size_t input_length,
+                       std::string* output);
+
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+  return false;
+}
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_
diff --git a/port/port_example.h b/port/port_example.h
new file mode 100644
index 0000000..cf72617
--- /dev/null
+++ b/port/port_example.h
@@ -0,0 +1,120 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This file contains the specification, but not the implementations,
+// of the types/operations/etc. that should be defined by a platform
+// specific port_<platform>.h file. Use this file as a reference for
+// how to port this package to a new platform.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
+#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
+
+namespace leveldb {
+namespace port {
+
+// TODO(jorlow): Many of these belong more in the environment class rather than
+//               here. We should try moving them and see if it affects perf.
+
+// The following boolean constant must be true on a little-endian machine
+// and false otherwise.
+static const bool kLittleEndian = true /* or some other expression */;
+
+// ------------------ Threading -------------------
+
+// A Mutex represents an exclusive lock.
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+
+  // Lock the mutex.  Waits until other lockers have exited.
+  // Will deadlock if the mutex is already locked by this thread.
+  void Lock();
+
+  // Unlock the mutex.
+  // REQUIRES: This mutex was locked by this thread.
+  void Unlock();
+
+  // Optionally crash if this thread does not hold this mutex.
+  // The implementation must be fast, especially if NDEBUG is
+  // defined.  The implementation is allowed to skip all checks.
+  void AssertHeld();
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+
+  // Atomically release *mu and block on this condition variable until
+  // either a call to SignalAll(), or a call to Signal() that picks
+  // this thread to wakeup.
+  // REQUIRES: this thread holds *mu
+  void Wait();
+
+  // If there are some threads waiting, wake up at least one of them.
+  void Signal();
+
+  // Wake up all waiting threads.
+  void SignalAll();
+};
+
+// A type that holds a pointer that can be read or written atomically
+// (i.e., without word-tearing.)
+class AtomicPointer {
+ private:
+  intptr_t rep_;
+ public:
+  // Initialize to arbitrary value
+  AtomicPointer();
+
+  // Initialize to hold v
+  explicit AtomicPointer(void* v) : rep_(reinterpret_cast<intptr_t>(v)) { }
+
+  // Read and return the stored pointer with the guarantee that no
+  // later memory access (read or write) by this thread can be
+  // reordered ahead of this read.
+  void* Acquire_Load() const;
+
+  // Set v as the stored pointer with the guarantee that no earlier
+  // memory access (read or write) by this thread can be reordered
+  // after this store.
+  void Release_Store(void* v);
+
+  // Read the stored pointer with no ordering guarantees.
+  void* NoBarrier_Load() const;
+
+  // Set v as the stored pointer with no ordering guarantees.
+  void NoBarrier_Store(void* v);
+};
+
+// ------------------ Checksumming -------------------
+
+// Store a 160-bit hash of "data[0..len-1]" in "hash_array[0]..hash_array[19]"
+extern void SHA1_Hash(const char* data, size_t len, char* hash_array);
+
+// ------------------ Compression -------------------
+
+// Store the snappy compression of "input[0,input_length-1]" in *output.
+// Returns false if snappy is not supported by this port.
+extern bool Snappy_Compress(const char* input, size_t input_length,
+                            std::string* output);
+
+// Attempt to snappy uncompress input[0,input_length-1] into *output.
+// Returns true if successful, false if the input is invalid lightweight
+// compressed data.
+extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
+                              std::string* output);
+
+// ------------------ Miscellaneous -------------------
+
+// If heap profiling is not supported, returns false.
+// Else repeatedly calls (*func)(arg, data, n) and then returns true.
+// The concatenation of all "data[0,n-1]" fragments is the heap profile.
+extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
diff --git a/port/port_posix.cc b/port/port_posix.cc
new file mode 100644
index 0000000..e75da8b
--- /dev/null
+++ b/port/port_posix.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
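+//
+// Usage sketch (illustrative): per the contract in port_example.h, Wait()
+// atomically releases the mutex while blocking, so callers hold the lock
+// and re-check their predicate in a loop, e.g. for some flag `done`:
+//
+//   mu.Lock();
+//   while (!done) {
+//     cv.Wait();  // releases mu while waiting, reacquires before returning
+//   }
+//   mu.Unlock();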
+
+#include "port/port_posix.h"
+
+#include <cstdlib>
+#include <stdio.h>
+#include <string.h>
+#include "util/logging.h"
+
+namespace leveldb {
+namespace port {
+
+static void PthreadCall(const char* label, int result) {
+  if (result != 0) {
+    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+    abort();
+  }
+}
+
+Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }
+
+Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
+
+void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
+
+void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
+
+CondVar::CondVar(Mutex* mu)
+    : mu_(mu) {
+  PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
+}
+
+CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }
+
+void CondVar::Wait() {
+  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
+}
+
+void CondVar::Signal() {
+  PthreadCall("signal", pthread_cond_signal(&cv_));
+}
+
+void CondVar::SignalAll() {
+  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
+}
+
+}
+}
diff --git a/port/port_posix.h b/port/port_posix.h
new file mode 100644
index 0000000..7adbc01
--- /dev/null
+++ b/port/port_posix.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+
+#include <endian.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <string>
+#include <cstdatomic>
+#include <cstring>
+#include "port/sha1_portable.h"
+
+namespace leveldb {
+namespace port {
+
+static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN);
+
+class CondVar;
+
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+
+  void Lock();
+  void Unlock();
+  void AssertHeld() { }
+
+ private:
+  friend class CondVar;
+  pthread_mutex_t mu_;
+
+  // No copying
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  void Signal();
+  void SignalAll();
+ private:
+  pthread_cond_t cv_;
+  Mutex* mu_;
+};
+
+// Storage for a lock-free pointer
+class AtomicPointer {
+ private:
+  std::atomic<void*> rep_;
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* v) : rep_(v) { }
+  inline void* Acquire_Load() const {
+    return rep_.load(std::memory_order_acquire);
+  }
+  inline void Release_Store(void* v) {
+    rep_.store(v, std::memory_order_release);
+  }
+  inline void* NoBarrier_Load() const {
+    return rep_.load(std::memory_order_relaxed);
+  }
+  inline void NoBarrier_Store(void* v) {
+    rep_.store(v, std::memory_order_relaxed);
+  }
+};
+
+inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
+  SHA1_Hash_Portable(data, len, hash_array);
+}
+
+// TODO(gabor): Implement actual compress
+inline bool Snappy_Compress(const char* input, size_t input_length,
+                            std::string* output) {
+  return false;
+}
+
+// TODO(gabor): Implement actual uncompress
+inline bool Snappy_Uncompress(const char* input_data, size_t input_length,
+                              std::string* output) {
+  return false;
+}
+
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+  return false;
+}
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_POSIX_H_
diff --git a/port/sha1_portable.cc b/port/sha1_portable.cc
new file mode 100644
index 0000000..8fa7277
--- /dev/null
+++ b/port/sha1_portable.cc
@@ -0,0 +1,298 @@
+// Portions copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This module provides a slow but portable implementation of
+// the SHA1 hash function.
+//
+// It is adapted from free code written by Paul E. Jones
+// <paulej@packetizer.com>. See http://www.packetizer.com/security/sha1/
+//
+// The license for the original code is:
+/*
+  Copyright (C) 1998, 2009
+  Paul E. Jones <paulej@packetizer.com>
+
+  Freeware Public License (FPL)
+
+  This software is licensed as "freeware."  Permission to distribute
+  this software in source and binary forms, including incorporation
+  into other products, is hereby granted without a fee.  THIS SOFTWARE
+  IS PROVIDED 'AS IS' AND WITHOUT ANY EXPRESSED OR IMPLIED WARRANTIES,
+  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+  AND FITNESS FOR A PARTICULAR PURPOSE.  THE AUTHOR SHALL NOT BE HELD
+  LIABLE FOR ANY DAMAGES RESULTING FROM THE USE OF THIS SOFTWARE, EITHER
+  DIRECTLY OR INDIRECTLY, INCLUDING, BUT NOT LIMITED TO, LOSS OF DATA
+  OR DATA BEING RENDERED INACCURATE.
+*/
+
+#include "port/sha1_portable.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+namespace leveldb {
+namespace port {
+
+/*
+ *  Description:
+ *      This class implements the Secure Hashing Standard as defined
+ *      in FIPS PUB 180-1 published April 17, 1995.
+ */
+
+/*
+ *  This structure will hold context information for the hashing
+ *  operation
+ */
+typedef struct SHA1Context {
+  unsigned Message_Digest[5];       /* Message Digest (output) */
+
+  unsigned Length_Low;              /* Message length in bits */
+  unsigned Length_High;             /* Message length in bits */
+
+  unsigned char Message_Block[64];  /* 512-bit message blocks */
+  int Message_Block_Index;          /* Index into message block array */
+
+  bool Computed;                    /* Is the digest computed? */
+  bool Corrupted;                   /* Is the message digest corrupted? */
+} SHA1Context;
+
+/*
+ *  Portability Issues:
+ *      SHA-1 is defined in terms of 32-bit "words".  This code was
+ *      written with the expectation that the processor has at least
+ *      a 32-bit machine word size.  If the machine word size is larger,
+ *      the code should still function properly.  One caveat to that
+ *      is that the input functions taking characters and character
+ *      arrays assume that only 8 bits of information are stored in each
+ *      character.
+ */
+
+/*
+ *  Define the circular shift macro
+ */
+#define SHA1CircularShift(bits,word) \
+                ((((word) << (bits)) & 0xFFFFFFFF) | \
+                ((word) >> (32-(bits))))
+
+/* Function prototypes */
+static void SHA1ProcessMessageBlock(SHA1Context *);
+static void SHA1PadMessage(SHA1Context *);
+
+// Initialize the SHA1Context in preparation for computing a new
+// message digest.
+static void SHA1Reset(SHA1Context* context) { + context->Length_Low = 0; + context->Length_High = 0; + context->Message_Block_Index = 0; + + context->Message_Digest[0] = 0x67452301; + context->Message_Digest[1] = 0xEFCDAB89; + context->Message_Digest[2] = 0x98BADCFE; + context->Message_Digest[3] = 0x10325476; + context->Message_Digest[4] = 0xC3D2E1F0; + + context->Computed = false; + context->Corrupted = false; +} + +// This function will return the 160-bit message digest into the +// Message_Digest array within the SHA1Context provided +static bool SHA1Result(SHA1Context *context) { + if (context->Corrupted) { + return false; + } + + if (!context->Computed) { + SHA1PadMessage(context); + context->Computed = true; + } + return true; +} + +// This function accepts an array of bytes as the next portion of +// the message. +static void SHA1Input(SHA1Context *context, + const unsigned char *message_array, + unsigned length) { + if (!length) return; + + if (context->Computed || context->Corrupted) { + context->Corrupted = true; + return; + } + + while(length-- && !context->Corrupted) { + context->Message_Block[context->Message_Block_Index++] = + (*message_array & 0xFF); + + context->Length_Low += 8; + /* Force it to 32 bits */ + context->Length_Low &= 0xFFFFFFFF; + if (context->Length_Low == 0) { + context->Length_High++; + /* Force it to 32 bits */ + context->Length_High &= 0xFFFFFFFF; + if (context->Length_High == 0) + { + /* Message is too long */ + context->Corrupted = true; + } + } + + if (context->Message_Block_Index == 64) + { + SHA1ProcessMessageBlock(context); + } + + message_array++; + } +} + +// This function will process the next 512 bits of the message stored +// in the Message_Block array. +static void SHA1ProcessMessageBlock(SHA1Context *context) { + const unsigned K[] = // Constants defined in SHA-1 + { + 0x5A827999, + 0x6ED9EBA1, + 0x8F1BBCDC, + 0xCA62C1D6 + }; + int t; // Loop counter + unsigned temp; // Temporary word value + unsigned W[80]; // Word sequence + unsigned A, B, C, D, E; // Word buffers + + // Initialize the first 16 words in the array W + for(t = 0; t < 16; t++) { + W[t] = ((unsigned) context->Message_Block[t * 4]) << 24; + W[t] |= ((unsigned) context->Message_Block[t * 4 + 1]) << 16; + W[t] |= ((unsigned) context->Message_Block[t * 4 + 2]) << 8; + W[t] |= ((unsigned) context->Message_Block[t * 4 + 3]); + } + + for(t = 16; t < 80; t++) { + W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]); + } + + A = context->Message_Digest[0]; + B = context->Message_Digest[1]; + C = context->Message_Digest[2]; + D = context->Message_Digest[3]; + E = context->Message_Digest[4]; + + for(t = 0; t < 20; t++) { + temp = SHA1CircularShift(5,A) + + ((B & C) | ((~B) & D)) + E + W[t] + K[0]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 20; t < 40; t++) { + temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 40; t < 60; t++) { + temp = SHA1CircularShift(5,A) + + ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 60; t < 80; t++) { + temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3]; + temp &= 0xFFFFFFFF; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + context->Message_Digest[0] = (context->Message_Digest[0] + A) & 0xFFFFFFFF; + 
  context->Message_Digest[1] = (context->Message_Digest[1] + B) & 0xFFFFFFFF;
+  context->Message_Digest[2] = (context->Message_Digest[2] + C) & 0xFFFFFFFF;
+  context->Message_Digest[3] = (context->Message_Digest[3] + D) & 0xFFFFFFFF;
+  context->Message_Digest[4] = (context->Message_Digest[4] + E) & 0xFFFFFFFF;
+
+  context->Message_Block_Index = 0;
+}
+
+// According to the standard, the message must be padded to an even
+// 512 bits.  The first padding bit must be a '1'.  The last 64 bits
+// represent the length of the original message.  All bits in between
+// should be 0.  This function will pad the message according to those
+// rules by filling the Message_Block array accordingly.  It will also
+// call SHA1ProcessMessageBlock() appropriately.  When it returns, it
+// can be assumed that the message digest has been computed.
+static void SHA1PadMessage(SHA1Context *context) {
+  // Check to see if the current message block is too small to hold
+  // the initial padding bits and length.  If so, we will pad the
+  // block, process it, and then continue padding into a second block.
+  if (context->Message_Block_Index > 55) {
+    context->Message_Block[context->Message_Block_Index++] = 0x80;
+    while(context->Message_Block_Index < 64) {
+      context->Message_Block[context->Message_Block_Index++] = 0;
+    }
+
+    SHA1ProcessMessageBlock(context);
+
+    while(context->Message_Block_Index < 56) {
+      context->Message_Block[context->Message_Block_Index++] = 0;
+    }
+  } else {
+    context->Message_Block[context->Message_Block_Index++] = 0x80;
+    while(context->Message_Block_Index < 56) {
+      context->Message_Block[context->Message_Block_Index++] = 0;
+    }
+  }
+
+  // Store the message length as the last 8 octets
+  context->Message_Block[56] = (context->Length_High >> 24) & 0xFF;
+  context->Message_Block[57] = (context->Length_High >> 16) & 0xFF;
+  context->Message_Block[58] = (context->Length_High >> 8) & 0xFF;
+  context->Message_Block[59] = (context->Length_High) & 0xFF;
+  context->Message_Block[60] = (context->Length_Low >> 24) & 0xFF;
+  context->Message_Block[61] = (context->Length_Low >> 16) & 0xFF;
+  context->Message_Block[62] = (context->Length_Low >> 8) & 0xFF;
+  context->Message_Block[63] = (context->Length_Low) & 0xFF;
+
+  SHA1ProcessMessageBlock(context);
+}
+
+
+void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array) {
+  SHA1Context context;
+  SHA1Reset(&context);
+  SHA1Input(&context, reinterpret_cast<const unsigned char*>(data), len);
+  bool ok = SHA1Result(&context);
+  if (!ok) {
+    fprintf(stderr, "Unexpected error in SHA1_Hash_Portable code\n");
+    exit(1);
+  }
+  for (int i = 0; i < 5; i++) {
+    uint32_t value = context.Message_Digest[i];
+    hash_array[i*4 + 0] = (value >> 24) & 0xff;
+    hash_array[i*4 + 1] = (value >> 16) & 0xff;
+    hash_array[i*4 + 2] = (value >> 8) & 0xff;
+    hash_array[i*4 + 3] = value & 0xff;
+  }
+}
+
+}
+}
diff --git a/port/sha1_portable.h b/port/sha1_portable.h
new file mode 100644
index 0000000..31db305
--- /dev/null
+++ b/port/sha1_portable.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
+#define STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
+
+#include <stddef.h>
+
+namespace leveldb {
+namespace port {
+
+// Compute the SHA1 hash value of "data[0..len-1]" and store it in
+// "hash_array[0..19]".  hash_array must have 20 bytes of space available.
+//
+// This function is portable but may not be as fast as a version
+// optimized for your platform.  It is provided as a default method
+// that can be used when porting leveldb to a new platform if no
+// better SHA1 hash implementation is available.
+void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array);
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
diff --git a/port/sha1_test.cc b/port/sha1_test.cc
new file mode 100644
index 0000000..b182e67
--- /dev/null
+++ b/port/sha1_test.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/port.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace port {
+
+class SHA1 { };
+
+static std::string TestSHA1(const char* data, size_t len) {
+  char hash_val[20];
+  SHA1_Hash(data, len, hash_val);
+  char buf[41];
+  for (int i = 0; i < 20; i++) {
+    snprintf(buf + i * 2, 41 - i * 2,
+             "%02x",
+             static_cast<unsigned int>(static_cast<unsigned char>(
+                 hash_val[i])));
+  }
+  return std::string(buf, 40);
+}
+
+TEST(SHA1, Simple) {
+  ASSERT_EQ("da39a3ee5e6b4b0d3255bfef95601890afd80709", TestSHA1("", 0));
+  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", TestSHA1("hello", 5));
+  std::string x(10000, 'x');
+  ASSERT_EQ("f8c5cde791c5056cf515881e701c8a9ecb439a75",
+            TestSHA1(x.data(), x.size()));
+}
+
+}
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/port/win/stdint.h b/port/win/stdint.h
new file mode 100644
index 0000000..39edd0d
--- /dev/null
+++ b/port/win/stdint.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// MSVC didn't ship with this file until the 2010 version.
+
+#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_
+#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_
+
+#if !defined(_MSC_VER)
+#error This file should only be included when compiling with MSVC.
+#endif
+
+// Define C99 equivalent types.
+typedef signed char           int8_t;
+typedef signed short          int16_t;
+typedef signed int            int32_t;
+typedef signed long long      int64_t;
+typedef unsigned char         uint8_t;
+typedef unsigned short        uint16_t;
+typedef unsigned int          uint32_t;
+typedef unsigned long long    uint64_t;
+
+#endif  // STORAGE_LEVELDB_PORT_WIN_STDINT_H_
diff --git a/table/block.cc b/table/block.cc
new file mode 100644
index 0000000..0525d2d
--- /dev/null
+++ b/table/block.cc
@@ -0,0 +1,261 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Decodes the blocks generated by block_builder.cc.
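+//
+// Worked example (illustrative): if the previous key was "apple" and the
+// next entry was added as "applesauce" -> "v2", the encoded entry decoded
+// below carries shared=5, non_shared=5, value_length=2, followed by the
+// delta bytes "sauce" and then "v2"; the full key is rebuilt by appending
+// the delta to the first 5 bytes of the previous key.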
+
+#include "table/block.h"
+
+#include <vector>
+#include <algorithm>
+#include "leveldb/comparator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+inline uint32_t Block::NumRestarts() const {
+  assert(size_ >= 2*sizeof(uint32_t));
+  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+}
+
+Block::Block(const char* data, size_t size)
+    : data_(data),
+      size_(size) {
+  if (size_ < sizeof(uint32_t)) {
+    size_ = 0;  // Error marker
+  } else {
+    restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
+    if (restart_offset_ > size_ - sizeof(uint32_t)) {
+      // The size is too small for NumRestarts() and therefore
+      // restart_offset_ wrapped around.
+      size_ = 0;
+    }
+  }
+}
+
+Block::~Block() {
+  delete[] data_;
+}
+
+// Helper routine: decode the next block entry starting at "p",
+// storing the number of shared key bytes, non_shared key bytes,
+// and the length of the value in "*shared", "*non_shared", and
+// "*value_length", respectively.  Will not dereference past "limit".
+//
+// If any errors are detected, returns NULL.  Otherwise, returns a
+// pointer to the key delta (just past the three decoded values).
+static inline const char* DecodeEntry(const char* p, const char* limit,
+                                      uint32_t* shared,
+                                      uint32_t* non_shared,
+                                      uint32_t* value_length) {
+  if (limit - p < 3) return NULL;
+  *shared = reinterpret_cast<const unsigned char*>(p)[0];
+  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+  if ((*shared | *non_shared | *value_length) < 128) {
+    // Fast path: all three values are encoded in one byte each
+    p += 3;
+  } else {
+    if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL;
+    if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL;
+    if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL;
+  }
+
+  if (limit - p < (*non_shared + *value_length)) return NULL;
+  return p;
+}
+
+class Block::Iter : public Iterator {
+ private:
+  const Comparator* const comparator_;
+  const char* const data_;       // underlying block contents
+  uint32_t const restarts_;      // Offset of restart array (list of fixed32)
+  uint32_t const num_restarts_;  // Number of uint32_t entries in restart array
+
+  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
+  uint32_t current_;
+  uint32_t restart_index_;  // Index of restart block in which current_ falls
+  std::string key_;
+  Slice value_;
+  Status status_;
+
+  inline int Compare(const Slice& a, const Slice& b) const {
+    return comparator_->Compare(a, b);
+  }
+
+  // Return the offset in data_ just past the end of the current entry.
+ inline uint32_t NextEntryOffset() const { + return (value_.data() + value_.size()) - data_; + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + key_.clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + public: + Iter(const Comparator* comparator, + const char* data, + uint32_t restarts, + uint32_t num_restarts) + : comparator_(comparator), + data_(data), + restarts_(restarts), + num_restarts_(num_restarts), + current_(restarts_), + restart_index_(num_restarts_) { + assert(num_restarts_ > 0); + } + + virtual bool Valid() const { return current_ < restarts_; } + virtual Status status() const { return status_; } + virtual Slice key() const { + assert(Valid()); + return key_; + } + virtual Slice value() const { + assert(Valid()); + return value_; + } + + virtual void Next() { + assert(Valid()); + ParseNextKey(); + } + + virtual void Prev() { + assert(Valid()); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + do { + // Loop until end of current entry hits the start of original entry + } while (ParseNextKey() && NextEntryOffset() < original); + } + + virtual void Seek(const Slice& target) { + // Binary search in restart array to find the first restart point + // with a key >= target + uint32_t left = 0; + uint32_t right = num_restarts_ - 1; + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t region_offset = GetRestartPoint(mid); + uint32_t shared, non_shared, value_length; + const char* key_ptr = DecodeEntry(data_ + region_offset, + data_ + restarts_, + &shared, &non_shared, &value_length); + if (key_ptr == NULL || (shared != 0)) { + CorruptionError(); + return; + } + Slice mid_key(key_ptr, non_shared); + if (Compare(mid_key, target) < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } + } + + // Linear search (within restart block) for first key >= target + SeekToRestartPoint(left); + while (true) { + if (!ParseNextKey()) { + return; + } + if (Compare(key_, target) >= 0) { + return; + } + } + } + + virtual void SeekToFirst() { + SeekToRestartPoint(0); + ParseNextKey(); + } + + virtual void SeekToLast() { + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } + } + + private: + void CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + key_.clear(); + value_.clear(); + } + + bool ParseNextKey() { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. 
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return false;
+    }
+
+    // Decode next entry
+    uint32_t shared, non_shared, value_length;
+    p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
+    if (p == NULL || key_.size() < shared) {
+      CorruptionError();
+      return false;
+    } else {
+      key_.resize(shared);
+      key_.append(p, non_shared);
+      value_ = Slice(p + non_shared, value_length);
+      while (restart_index_ + 1 < num_restarts_ &&
+             GetRestartPoint(restart_index_ + 1) < current_) {
+        ++restart_index_;
+      }
+      return true;
+    }
+  }
+};
+
+Iterator* Block::NewIterator(const Comparator* cmp) {
+  if (size_ < 2*sizeof(uint32_t)) {
+    return NewErrorIterator(Status::Corruption("bad block contents"));
+  }
+  const uint32_t num_restarts = NumRestarts();
+  if (num_restarts == 0) {
+    return NewEmptyIterator();
+  } else {
+    return new Iter(cmp, data_, restart_offset_, num_restarts);
+  }
+}
+
+}
diff --git a/table/block.h b/table/block.h
new file mode 100644
index 0000000..cdf0598
--- /dev/null
+++ b/table/block.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_
+#define STORAGE_LEVELDB_TABLE_BLOCK_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include "leveldb/iterator.h"
+
+namespace leveldb {
+
+class Comparator;
+
+class Block {
+ public:
+  // Initialize the block with the specified contents.
+  // Takes ownership of data[] and will delete[] it when done.
+  Block(const char* data, size_t size);
+
+  ~Block();
+
+  size_t size() const { return size_; }
+  Iterator* NewIterator(const Comparator* comparator);
+
+ private:
+  uint32_t NumRestarts() const;
+
+  const char* data_;
+  size_t size_;
+  uint32_t restart_offset_;     // Offset in data_ of restart array
+
+  // No copying allowed
+  Block(const Block&);
+  void operator=(const Block&);
+
+  class Iter;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_BLOCK_H_
diff --git a/table/block_builder.cc b/table/block_builder.cc
new file mode 100644
index 0000000..ae18b36
--- /dev/null
+++ b/table/block_builder.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// BlockBuilder generates blocks where keys are prefix-compressed:
+//
+// When we store a key, we drop the prefix shared with the previous
+// string.  This helps reduce the space requirement significantly.
+// Furthermore, once every K keys, we do not apply the prefix
+// compression and store the entire key.  We call this a "restart
+// point".  The tail end of the block stores the offsets of all of the
+// restart points, and can be used to do a binary search when looking
+// for a particular key.  Values are stored as-is (without compression)
+// immediately following the corresponding key.
+//
+// An entry for a particular key-value pair has the form:
+//     shared_bytes: varint32
+//     unshared_bytes: varint32
+//     value_length: varint32
+//     key_delta: char[unshared_bytes]
+//     value: char[value_length]
+// shared_bytes == 0 for restart points.
+//
+// The trailer of the block has the form:
+//     restarts: uint32[num_restarts]
+//     num_restarts: uint32
+// restarts[i] contains the offset within the block of the ith restart point.
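+//
+// Worked example (illustrative): with block_restart_interval == 16, adding
+// "apple" -> "v1" and then "applesauce" -> "v2" at the start of a block
+// emits
+//     entry 0 (restart): shared=0 non_shared=5 value_length=2 "apple" "v1"
+//     entry 1:           shared=5 non_shared=5 value_length=2 "sauce" "v2"
+// and the trailer holds restarts[0] == 0 followed by num_restarts == 1.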
+
+#include "table/block_builder.h"
+
+#include <algorithm>
+#include <assert.h>
+#include "leveldb/comparator.h"
+#include "leveldb/table_builder.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+BlockBuilder::BlockBuilder(const Options* options)
+    : options_(options),
+      restarts_(),
+      counter_(0),
+      finished_(false) {
+  assert(options->block_restart_interval >= 1);
+  restarts_.push_back(0);       // First restart point is at offset 0
+}
+
+void BlockBuilder::Reset() {
+  buffer_.clear();
+  restarts_.clear();
+  restarts_.push_back(0);       // First restart point is at offset 0
+  counter_ = 0;
+  finished_ = false;
+  last_key_.clear();
+}
+
+size_t BlockBuilder::CurrentSizeEstimate() const {
+  return (buffer_.size() +                       // Raw data buffer
+          restarts_.size() * sizeof(uint32_t) +  // Restart array
+          sizeof(uint32_t));                     // Restart array length
+}
+
+Slice BlockBuilder::Finish() {
+  // Append restart array
+  for (size_t i = 0; i < restarts_.size(); i++) {
+    PutFixed32(&buffer_, restarts_[i]);
+  }
+  PutFixed32(&buffer_, restarts_.size());
+  finished_ = true;
+  return Slice(buffer_);
+}
+
+void BlockBuilder::Add(const Slice& key, const Slice& value) {
+  Slice last_key_piece(last_key_);
+  assert(!finished_);
+  assert(counter_ <= options_->block_restart_interval);
+  assert(buffer_.empty() // No values yet?
+         || options_->comparator->Compare(key, last_key_piece) > 0);
+  size_t shared = 0;
+  if (counter_ < options_->block_restart_interval) {
+    // See how much sharing to do with previous string
+    const size_t min_length = std::min(last_key_piece.size(), key.size());
+    while ((shared < min_length) && (last_key_[shared] == key[shared])) {
+      shared++;
+    }
+  } else {
+    // Restart compression
+    restarts_.push_back(buffer_.size());
+    counter_ = 0;
+  }
+  const size_t non_shared = key.size() - shared;
+
+  // Add "<shared><non_shared><value_size>" to buffer_
+  PutVarint32(&buffer_, shared);
+  PutVarint32(&buffer_, non_shared);
+  PutVarint32(&buffer_, value.size());
+
+  // Add string delta to buffer_ followed by value
+  buffer_.append(key.data() + shared, non_shared);
+  buffer_.append(value.data(), value.size());
+
+  // Update state
+  last_key_.resize(shared);
+  last_key_.append(key.data() + shared, non_shared);
+  assert(Slice(last_key_) == key);
+  counter_++;
+}
+
+}
diff --git a/table/block_builder.h b/table/block_builder.h
new file mode 100644
index 0000000..bf92a0f
--- /dev/null
+++ b/table/block_builder.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
+#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
+
+#include <vector>
+
+#include <stdint.h>
+#include "leveldb/slice.h"
+
+namespace leveldb {
+
+struct Options;
+
+class BlockBuilder {
+ public:
+  explicit BlockBuilder(const Options* options);
+
+  // Reset the contents as if the BlockBuilder was just constructed.
+  void Reset();
+
+  // REQUIRES: Finish() has not been called since the last call to Reset().
+  // REQUIRES: key is larger than any previously added key
+  void Add(const Slice& key, const Slice& value);
+
+  // Finish building the block and return a slice that refers to the
+  // block contents.  The returned slice will remain valid for the
+  // lifetime of this builder or until Reset() is called.
+  Slice Finish();
+
+  // Returns an estimate of the current (uncompressed) size of the block
+  // we are building.
+  size_t CurrentSizeEstimate() const;
+
+  // Return true iff no entries have been added since the last Reset()
+  bool empty() const {
+    return buffer_.empty();
+  }
+
+ private:
+  const Options*        options_;
+  std::string           buffer_;      // Destination buffer
+  std::vector<uint32_t> restarts_;    // Restart points
+  int                   counter_;     // Number of entries emitted since restart
+  bool                  finished_;    // Has Finish() been called?
+  std::string           last_key_;
+
+  // No copying allowed
+  BlockBuilder(const BlockBuilder&);
+  void operator=(const BlockBuilder&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
diff --git a/table/format.cc b/table/format.cc
new file mode 100644
index 0000000..8c6b0f3
--- /dev/null
+++ b/table/format.cc
@@ -0,0 +1,131 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/format.h"
+
+#include "leveldb/env.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+
+void BlockHandle::EncodeTo(std::string* dst) const {
+  // Sanity check that all fields have been set
+  assert(offset_ != ~static_cast<uint64_t>(0));
+  assert(size_ != ~static_cast<uint64_t>(0));
+  PutVarint64(dst, offset_);
+  PutVarint64(dst, size_);
+}
+
+Status BlockHandle::DecodeFrom(Slice* input) {
+  if (GetVarint64(input, &offset_) &&
+      GetVarint64(input, &size_)) {
+    return Status::OK();
+  } else {
+    return Status::Corruption("bad block handle");
+  }
+}
+
+void Footer::EncodeTo(std::string* dst) const {
+#ifndef NDEBUG
+  const size_t original_size = dst->size();
+#endif
+  metaindex_handle_.EncodeTo(dst);
+  index_handle_.EncodeTo(dst);
+  dst->resize(2 * BlockHandle::kMaxEncodedLength);  // Padding
+  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber));
+  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
+  assert(dst->size() == original_size + kEncodedLength);
+}
+
+Status Footer::DecodeFrom(Slice* input) {
+  const char* magic_ptr = input->data() + kEncodedLength - 8;
+  const uint32_t magic_lo = DecodeFixed32(magic_ptr);
+  const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
+  const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
+                          (static_cast<uint64_t>(magic_lo)));
+  if (magic != kTableMagicNumber) {
+    return Status::InvalidArgument("not an sstable (bad magic number)");
+  }
+
+  Status result = metaindex_handle_.DecodeFrom(input);
+  if (result.ok()) {
+    result = index_handle_.DecodeFrom(input);
+  }
+  if (result.ok()) {
+    // We skip over any leftover data (just padding for now) in "input"
+    const char* end = magic_ptr + 8;
+    *input = Slice(end, input->data() + input->size() - end);
+  }
+  return result;
+}
+
+Status ReadBlock(RandomAccessFile* file,
+                 const ReadOptions& options,
+                 const BlockHandle& handle,
+                 Block** block) {
+  *block = NULL;
+
+  // Read the block contents as well as the type/crc footer.
+  // See table_builder.cc for the code that built this structure.
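+  // The layout being read is (see kBlockTrailerSize in format.h):
+  //     block contents   uint8[n]
+  //     type             uint8     (kNoCompression or kSnappyCompression)
+  //     crc              uint32    (masked crc32c of contents + type)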
+  size_t n = handle.size();
+  char* buf = new char[n + kBlockTrailerSize];
+  Slice contents;
+  Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
+  if (!s.ok()) {
+    delete[] buf;
+    return s;
+  }
+  if (contents.size() != n + kBlockTrailerSize) {
+    delete[] buf;
+    return Status::Corruption("truncated block read");
+  }
+
+  // Check the crc of the type and the block contents
+  const char* data = contents.data();    // Pointer to where Read put the data
+  if (options.verify_checksums) {
+    const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
+    const uint32_t actual = crc32c::Value(data, n + 1);
+    if (actual != crc) {
+      delete[] buf;
+      s = Status::Corruption("block checksum mismatch");
+      return s;
+    }
+  }
+
+  switch (data[n]) {
+    case kNoCompression:
+      if (data != buf) {
+        // File implementation gave us pointer to some other data.
+        // Copy into buf[].
+        memcpy(buf, data, n + kBlockTrailerSize);
+      }
+
+      // Ok
+      break;
+    case kSnappyCompression: {
+      std::string decompressed;
+      if (!port::Snappy_Uncompress(data, n, &decompressed)) {
+        delete[] buf;
+        s = Status::Corruption("corrupted compressed block contents");
+        return s;
+      }
+      delete[] buf;  // Done with uncompressed data
+      buf = new char[decompressed.size()];
+      memcpy(buf, decompressed.data(), decompressed.size());
+      n = decompressed.size();
+      break;
+    }
+    default:
+      delete[] buf;
+      return Status::Corruption("bad block type");
+  }
+
+  *block = new Block(buf, n);  // Block takes ownership of buf[]
+  return Status::OK();
+}
+
+}
diff --git a/table/format.h b/table/format.h
new file mode 100644
index 0000000..a6ab964
--- /dev/null
+++ b/table/format.h
@@ -0,0 +1,103 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_
+#define STORAGE_LEVELDB_TABLE_FORMAT_H_
+
+#include <string>
+#include <stdint.h>
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+#include "leveldb/table_builder.h"
+
+namespace leveldb {
+
+class Block;
+class RandomAccessFile;
+struct ReadOptions;
+
+// BlockHandle is a pointer to the extent of a file that stores a data
+// block or a meta block.
+class BlockHandle {
+ public:
+  BlockHandle();
+
+  // The offset of the block in the file.
+  uint64_t offset() const { return offset_; }
+  void set_offset(uint64_t offset) { offset_ = offset; }
+
+  // The size of the stored block
+  uint64_t size() const { return size_; }
+  void set_size(uint64_t size) { size_ = size; }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(Slice* input);
+
+  // Maximum encoding length of a BlockHandle
+  enum { kMaxEncodedLength = 10 + 10 };
+
+ private:
+  uint64_t offset_;
+  uint64_t size_;
+};
+
+// Footer encapsulates the fixed information stored at the tail
+// end of every table file.
+class Footer {
+ public:
+  Footer() { }
+
+  // The block handle for the metaindex block of the table
+  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
+  void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }
+
+  // The block handle for the index block of the table
+  const BlockHandle& index_handle() const {
+    return index_handle_;
+  }
+  void set_index_handle(const BlockHandle& h) {
+    index_handle_ = h;
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(Slice* input);
+
+  // Encoded length of a Footer.  Note that the serialization of a
+  // Footer will always occupy exactly this many bytes.  It consists
+  // of two block handles and a magic number.
+  enum {
+    kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8
+  };
+
+ private:
+  BlockHandle metaindex_handle_;
+  BlockHandle index_handle_;
+};
+
+// kTableMagicNumber was picked by running
+//    echo http://code.google.com/p/leveldb/ | sha1sum
+// and taking the leading 64 bits.
+static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
+
+// 1-byte type + 32-bit crc
+static const size_t kBlockTrailerSize = 5;
+
+// Read the block identified by "handle" from "file".  On success,
+// store a pointer to the heap-allocated result in *block and return
+// OK.  On failure store NULL in *block and return non-OK.
+extern Status ReadBlock(RandomAccessFile* file,
+                        const ReadOptions& options,
+                        const BlockHandle& handle,
+                        Block** block);
+
+// Implementation details follow.  Clients should ignore.
+
+inline BlockHandle::BlockHandle()
+    : offset_(~static_cast<uint64_t>(0)),
+      size_(~static_cast<uint64_t>(0)) {
+}
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_FORMAT_H_
diff --git a/table/iterator.cc b/table/iterator.cc
new file mode 100644
index 0000000..4ddd55f
--- /dev/null
+++ b/table/iterator.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/iterator.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+Iterator::Iterator() {
+  cleanup_.function = NULL;
+  cleanup_.next = NULL;
+}
+
+Iterator::~Iterator() {
+  if (cleanup_.function != NULL) {
+    (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+    for (Cleanup* c = cleanup_.next; c != NULL; ) {
+      (*c->function)(c->arg1, c->arg2);
+      Cleanup* next = c->next;
+      delete c;
+      c = next;
+    }
+  }
+}
+
+void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
+  assert(func != NULL);
+  Cleanup* c;
+  if (cleanup_.function == NULL) {
+    c = &cleanup_;
+  } else {
+    c = new Cleanup;
+    c->next = cleanup_.next;
+    cleanup_.next = c;
+  }
+  c->function = func;
+  c->arg1 = arg1;
+  c->arg2 = arg2;
+}
+
+namespace {
+class EmptyIterator : public Iterator {
+ public:
+  EmptyIterator(const Status& s) : status_(s) { }
+  virtual bool Valid() const { return false; }
+  virtual void Seek(const Slice& target) { }
+  virtual void SeekToFirst() { }
+  virtual void SeekToLast() { }
+  virtual void Next() { assert(false); }
+  virtual void Prev() { assert(false); }
+  virtual Slice key() const { assert(false); return Slice(); }
+  virtual Slice value() const { assert(false); return Slice(); }
+  virtual Status status() const { return status_; }
+ private:
+  Status status_;
+};
+}
+
+Iterator* NewEmptyIterator() {
+  return new EmptyIterator(Status::OK());
+}
+
+Iterator* NewErrorIterator(const Status& status) {
+  return new EmptyIterator(status);
+}
+
+}
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
new file mode 100644
index 0000000..158d3a7
--- /dev/null
+++ b/table/iterator_wrapper.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
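+//
+// Typical use (a sketch; NewSomeIterator stands for any iterator factory):
+//   IteratorWrapper w(NewSomeIterator());
+//   for (w.SeekToFirst(); w.Valid(); w.Next()) {
+//     // w.key() and w.Valid() come from cached fields, not virtual calls
+//   }
+// This is why the merging iterator below stores IteratorWrapper children.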
+
+#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
+#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
+
+namespace leveldb {
+
+// An internal wrapper class with an interface similar to Iterator that
+// caches the valid() and key() results for an underlying iterator.
+// This can help avoid virtual function calls and also gives better
+// cache locality.
+class IteratorWrapper {
+ private:
+  Iterator* iter_;
+  bool valid_;
+  Slice key_;
+ public:
+  IteratorWrapper(): iter_(NULL), valid_(false) { }
+  explicit IteratorWrapper(Iterator* iter): iter_(NULL) {
+    Set(iter);
+  }
+  ~IteratorWrapper() { delete iter_; }
+  Iterator* iter() const { return iter_; }
+
+  // Takes ownership of "iter" and will delete it when destroyed, or
+  // when Set() is invoked again.
+  void Set(Iterator* iter) {
+    delete iter_;
+    iter_ = iter;
+    if (iter_ == NULL) {
+      valid_ = false;
+    } else {
+      Update();
+    }
+  }
+
+  // Iterator interface methods
+  bool Valid() const        { return valid_; }
+  Slice key() const         { assert(Valid()); return key_; }
+  Slice value() const       { assert(Valid()); return iter_->value(); }
+  // Methods below require iter() != NULL
+  Status status() const     { assert(iter_); return iter_->status(); }
+  void Next()               { assert(iter_); iter_->Next(); Update(); }
+  void Prev()               { assert(iter_); iter_->Prev(); Update(); }
+  void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); }
+  void SeekToFirst()        { assert(iter_); iter_->SeekToFirst(); Update(); }
+  void SeekToLast()         { assert(iter_); iter_->SeekToLast(); Update(); }
+
+ private:
+  void Update() {
+    valid_ = iter_->Valid();
+    if (valid_) {
+      key_ = iter_->key();
+    }
+  }
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
diff --git a/table/merger.cc b/table/merger.cc
new file mode 100644
index 0000000..6ce06bb
--- /dev/null
+++ b/table/merger.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/merger.h"
+
+#include "leveldb/comparator.h"
+#include "leveldb/iterator.h"
+#include "table/iterator_wrapper.h"
+
+namespace leveldb {
+
+namespace {
+class MergingIterator : public Iterator {
+ public:
+  MergingIterator(const Comparator* comparator, Iterator** children, int n)
+      : comparator_(comparator),
+        children_(new IteratorWrapper[n]),
+        n_(n),
+        current_(NULL),
+        direction_(kForward) {
+    for (int i = 0; i < n; i++) {
+      children_[i].Set(children[i]);
+    }
+  }
+
+  virtual ~MergingIterator() {
+    delete[] children_;
+  }
+
+  virtual bool Valid() const {
+    return (current_ != NULL);
+  }
+
+  virtual void SeekToFirst() {
+    for (int i = 0; i < n_; i++) {
+      children_[i].SeekToFirst();
+    }
+    FindSmallest();
+    direction_ = kForward;
+  }
+
+  virtual void SeekToLast() {
+    for (int i = 0; i < n_; i++) {
+      children_[i].SeekToLast();
+    }
+    FindLargest();
+    direction_ = kReverse;
+  }
+
+  virtual void Seek(const Slice& target) {
+    for (int i = 0; i < n_; i++) {
+      children_[i].Seek(target);
+    }
+    FindSmallest();
+    direction_ = kForward;
+  }
+
+  virtual void Next() {
+    assert(Valid());
+
+    // Ensure that all children are positioned after key().
+    // If we are moving in the forward direction, it is already
+    // true for all of the non-current_ children since current_ is
+    // the smallest child and key() == current_->key().  Otherwise,
+    // we explicitly position the non-current_ children.
+    if (direction_ != kForward) {
+      for (int i = 0; i < n_; i++) {
+        IteratorWrapper* child = &children_[i];
+        if (child != current_) {
+          child->Seek(key());
+          if (child->Valid() &&
+              comparator_->Compare(key(), child->key()) == 0) {
+            child->Next();
+          }
+        }
+      }
+      direction_ = kForward;
+    }
+
+    current_->Next();
+    FindSmallest();
+  }
+
+  virtual void Prev() {
+    assert(Valid());
+
+    // Ensure that all children are positioned before key().
+    // If we are moving in the reverse direction, it is already
+    // true for all of the non-current_ children since current_ is
+    // the largest child and key() == current_->key().  Otherwise,
+    // we explicitly position the non-current_ children.
+    if (direction_ != kReverse) {
+      for (int i = 0; i < n_; i++) {
+        IteratorWrapper* child = &children_[i];
+        if (child != current_) {
+          child->Seek(key());
+          if (child->Valid()) {
+            // Child is at first entry >= key().  Step back one to be < key()
+            child->Prev();
+          } else {
+            // Child has no entries >= key().  Position at last entry.
+            child->SeekToLast();
+          }
+        }
+      }
+      direction_ = kReverse;
+    }
+
+    current_->Prev();
+    FindLargest();
+  }
+
+  virtual Slice key() const {
+    assert(Valid());
+    return current_->key();
+  }
+
+  virtual Slice value() const {
+    assert(Valid());
+    return current_->value();
+  }
+
+  virtual Status status() const {
+    Status status;
+    for (int i = 0; i < n_; i++) {
+      status = children_[i].status();
+      if (!status.ok()) {
+        break;
+      }
+    }
+    return status;
+  }
+
+ private:
+  void FindSmallest();
+  void FindLargest();
+
+  // We might want to use a heap in case there are lots of children.
+  // For now we use a simple array since we expect a very small number
+  // of children in leveldb.
+  const Comparator* comparator_;
+  IteratorWrapper* children_;
+  int n_;
+  IteratorWrapper* current_;
+
+  // Which direction is the iterator moving?
+  enum Direction {
+    kForward,
+    kReverse
+  };
+  Direction direction_;
+};
+
+void MergingIterator::FindSmallest() {
+  IteratorWrapper* smallest = NULL;
+  for (int i = 0; i < n_; i++) {
+    IteratorWrapper* child = &children_[i];
+    if (child->Valid()) {
+      if (smallest == NULL) {
+        smallest = child;
+      } else if (comparator_->Compare(child->key(), smallest->key()) < 0) {
+        smallest = child;
+      }
+    }
+  }
+  current_ = smallest;
+}
+
+void MergingIterator::FindLargest() {
+  IteratorWrapper* largest = NULL;
+  for (int i = n_-1; i >= 0; i--) {
+    IteratorWrapper* child = &children_[i];
+    if (child->Valid()) {
+      if (largest == NULL) {
+        largest = child;
+      } else if (comparator_->Compare(child->key(), largest->key()) > 0) {
+        largest = child;
+      }
+    }
+  }
+  current_ = largest;
+}
+}
+
+Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
+  assert(n >= 0);
+  if (n == 0) {
+    return NewEmptyIterator();
+  } else if (n == 1) {
+    return list[0];
+  } else {
+    return new MergingIterator(cmp, list, n);
+  }
+}
+
+}
diff --git a/table/merger.h b/table/merger.h
new file mode 100644
index 0000000..71d9dc5
--- /dev/null
+++ b/table/merger.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_
+#define STORAGE_LEVELDB_TABLE_MERGER_H_
+
+namespace leveldb {
+
+class Comparator;
+class Iterator;
+
+// Return an iterator that provides the union of the data in
+// children[0,n-1].  Takes ownership of the child iterators and
+// will delete them when the result iterator is deleted.
+//
+// The result does no duplicate suppression.  I.e., if a particular
+// key is present in K child iterators, it will be yielded K times.
+//
+// REQUIRES: n >= 0
+extern Iterator* NewMergingIterator(
+    const Comparator* comparator, Iterator** children, int n);
+
+}
+
+#endif  // STORAGE_LEVELDB_TABLE_MERGER_H_
diff --git a/table/table.cc b/table/table.cc
new file mode 100644
index 0000000..9820753
--- /dev/null
+++ b/table/table.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/table.h"
+
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
+#include "table/block.h"
+#include "table/format.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+struct Table::Rep {
+  ~Rep() {
+    delete index_block;
+  }
+
+  Options options;
+  Status status;
+  RandomAccessFile* file;
+  uint64_t cache_id;
+
+  BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
+  Block* index_block;
+};
+
+Status Table::Open(const Options& options,
+                   RandomAccessFile* file,
+                   uint64_t size,
+                   Table** table) {
+  *table = NULL;
+  if (size < Footer::kEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  char footer_space[Footer::kEncodedLength];
+  Slice footer_input;
+  Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
+                        &footer_input, footer_space);
+  if (!s.ok()) return s;
+
+  Footer footer;
+  s = footer.DecodeFrom(&footer_input);
+  if (!s.ok()) return s;
+
+  // Read the index block
+  Block* index_block = NULL;
+  if (s.ok()) {
+    s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block);
+  }
+
+  if (s.ok()) {
+    // We've successfully read the footer and the index block: we're
+    // ready to serve requests.
+    Rep* rep = new Table::Rep;
+    rep->options = options;
+    rep->file = file;
+    rep->metaindex_handle = footer.metaindex_handle();
+    rep->index_block = index_block;
+    rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
+    *table = new Table(rep);
+  } else {
+    if (index_block) delete index_block;
+  }
+
+  return s;
+}
+
+Table::~Table() {
+  delete rep_;
+}
+
+static void DeleteBlock(void* arg, void* ignored) {
+  delete reinterpret_cast<Block*>(arg);
+}
+
+static void DeleteCachedBlock(const Slice& key, void* value) {
+  Block* block = reinterpret_cast<Block*>(value);
+  delete block;
+}
+
+static void ReleaseBlock(void* arg, void* h) {
+  Cache* cache = reinterpret_cast<Cache*>(arg);
+  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
+  cache->Release(handle);
+}
+
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+Iterator* Table::BlockReader(void* arg,
+                             const ReadOptions& options,
+                             const Slice& index_value) {
+  Table* table = reinterpret_cast<Table*>(arg);
+  Cache* block_cache = table->rep_->options.block_cache;
+  Block* block = NULL;
+  Cache::Handle* cache_handle = NULL;
+
+  BlockHandle handle;
+  Slice input = index_value;
+  Status s = handle.DecodeFrom(&input);
+  // We intentionally allow extra stuff in index_value so that we
+  // can add more features in the future.
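+
+  // The block cache key built below is 16 bytes: the table's cache_id in
+  // the first 8 bytes and the block's file offset in the second 8, so
+  // blocks cached on behalf of distinct tables never collide.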
+
+  if (s.ok()) {
+    if (block_cache != NULL) {
+      char cache_key_buffer[16];
+      EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
+      EncodeFixed64(cache_key_buffer+8, handle.offset());
+      Slice key(cache_key_buffer, sizeof(cache_key_buffer));
+      cache_handle = block_cache->Lookup(key);
+      if (cache_handle != NULL) {
+        block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
+      } else {
+        s = ReadBlock(table->rep_->file, options, handle, &block);
+        if (s.ok() && options.fill_cache) {
+          cache_handle = block_cache->Insert(
+              key, block, block->size(), &DeleteCachedBlock);
+        }
+      }
+    } else {
+      s = ReadBlock(table->rep_->file, options, handle, &block);
+    }
+  }
+
+  Iterator* iter;
+  if (block != NULL) {
+    iter = block->NewIterator(table->rep_->options.comparator);
+    if (cache_handle == NULL) {
+      iter->RegisterCleanup(&DeleteBlock, block, NULL);
+    } else {
+      iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
+    }
+  } else {
+    iter = NewErrorIterator(s);
+  }
+  return iter;
+}
+
+Iterator* Table::NewIterator(const ReadOptions& options) const {
+  return NewTwoLevelIterator(
+      rep_->index_block->NewIterator(rep_->options.comparator),
+      &Table::BlockReader, const_cast<Table*>(this), options);
+}
+
+uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
+  Iterator* index_iter =
+      rep_->index_block->NewIterator(rep_->options.comparator);
+  index_iter->Seek(key);
+  uint64_t result;
+  if (index_iter->Valid()) {
+    BlockHandle handle;
+    Slice input = index_iter->value();
+    Status s = handle.DecodeFrom(&input);
+    if (s.ok()) {
+      result = handle.offset();
+    } else {
+      // Strange: we can't decode the block handle in the index block.
+      // We'll just return the offset of the metaindex block, which is
+      // close to the whole file size for this case.
+      result = rep_->metaindex_handle.offset();
+    }
+  } else {
+    // key is past the last key in the file.  Approximate the offset
+    // by returning the offset of the metaindex block (which is
+    // right near the end of the file).
+    result = rep_->metaindex_handle.offset();
+  }
+  delete index_iter;
+  return result;
+}
+
+}
diff --git a/table/table_builder.cc b/table/table_builder.cc
new file mode 100644
index 0000000..7ec7ad2
--- /dev/null
+++ b/table/table_builder.cc
@@ -0,0 +1,227 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/table_builder.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include "leveldb/comparator.h"
+#include "leveldb/env.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+struct TableBuilder::Rep {
+  Options options;
+  Options index_block_options;
+  WritableFile* file;
+  uint64_t offset;
+  Status status;
+  BlockBuilder data_block;
+  BlockBuilder index_block;
+  std::string last_key;
+  int64_t num_entries;
+  bool closed;          // Either Finish() or Abandon() has been called.
+
+  // We do not emit the index entry for a block until we have seen the
+  // first key for the next data block.  This allows us to use shorter
+  // keys in the index block.  For example, consider a block boundary
+  // between the keys "the quick brown fox" and "the who".  We can use
+  // "the r" as the key for the index block entry since it is >= all
+  // entries in the first block and < all entries in subsequent
+  // blocks.
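// [Editorial illustration, not part of the original patch: the shortening
//  described above can be reproduced with the default bytewise comparator:
//
//      std::string sep = "the quick brown fox";
//      BytewiseComparator()->FindShortestSeparator(&sep, "the who");
//      // sep is now a short separator S with
//      //   "the quick brown fox" <= S < "the who"   (here "the r")
//
//  It is S, not the full last key of the block, that Add() below writes
//  into the index block.]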
+ // + // Invariant: r->pending_index_entry is true only if data_block is empty. + bool pending_index_entry; + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + + Rep(const Options& opt, WritableFile* f) + : options(opt), + index_block_options(opt), + file(f), + offset(0), + data_block(&options), + index_block(&index_block_options), + num_entries(0), + closed(false), + pending_index_entry(false) { + index_block_options.block_restart_interval = 1; + } +}; + +TableBuilder::TableBuilder(const Options& options, WritableFile* file) + : rep_(new Rep(options, file)) { +} + +TableBuilder::~TableBuilder() { + assert(rep_->closed); // Catch errors where caller forgot to call Finish() + delete rep_; +} + +Status TableBuilder::ChangeOptions(const Options& options) { + // Note: if more fields are added to Options, update + // this function to catch changes that should not be allowed to + // change in the middle of building a Table. + if (options.comparator != rep_->options.comparator) { + return Status::InvalidArgument("changing comparator while building table"); + } + + // Note that any live BlockBuilders point to rep_->options and therefore + // will automatically pick up the updated options. + rep_->options = options; + rep_->index_block_options = options; + rep_->index_block_options.block_restart_interval = 1; + return Status::OK(); +} + +void TableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->num_entries > 0) { + assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); + } + + if (r->pending_index_entry) { + assert(r->data_block.empty()); + r->options.comparator->FindShortestSeparator(&r->last_key, key); + std::string handle_encoding; + r->pending_handle.EncodeTo(&handle_encoding); + r->index_block.Add(r->last_key, Slice(handle_encoding)); + r->pending_index_entry = false; + } + + r->last_key.assign(key.data(), key.size()); + r->num_entries++; + r->data_block.Add(key, value); + + const size_t estimated_block_size = r->data_block.CurrentSizeEstimate(); + if (estimated_block_size >= r->options.block_size) { + Flush(); + } +} + +void TableBuilder::Flush() { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->data_block.empty()) return; + assert(!r->pending_index_entry); + WriteBlock(&r->data_block, &r->pending_handle); + if (ok()) { + r->pending_index_entry = true; + r->status = r->file->Flush(); + } +} + +void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // type: uint8 + // crc: uint32 + assert(ok()); + Rep* r = rep_; + Slice raw = block->Finish(); + + Slice block_contents; + CompressionType type = r->options.compression; + // TODO(postrelease): Support more compression options: zlib? 
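// [Editorial note, not part of the original patch: the kSnappyCompression
//  case below keeps the compressed form only when it is smaller than
//  raw.size() - raw.size()/8, i.e. when compression saves more than 12.5%.
//  For a default 4096-byte block that means the output must be under
//  4096 - 512 = 3584 bytes; otherwise the block is stored verbatim and the
//  trailer's type byte is rewritten to kNoCompression.]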
+ switch (type) { + case kNoCompression: + block_contents = raw; + break; + + case kSnappyCompression: { + std::string* compressed = &r->compressed_output; + if (port::Snappy_Compress(raw.data(), raw.size(), compressed) && + compressed->size() < raw.size() - (raw.size() / 8u)) { + block_contents = *compressed; + } else { + // Snappy not supported, or compressed less than 12.5%, so just + // store uncompressed form + block_contents = raw; + type = kNoCompression; + } + break; + } + } + handle->set_offset(r->offset); + handle->set_size(block_contents.size()); + r->status = r->file->Append(block_contents); + if (r->status.ok()) { + char trailer[kBlockTrailerSize]; + trailer[0] = type; + uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size()); + crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type + EncodeFixed32(trailer+1, crc32c::Mask(crc)); + r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); + if (r->status.ok()) { + r->offset += block_contents.size() + kBlockTrailerSize; + } + } + r->compressed_output.clear(); + block->Reset(); +} + +Status TableBuilder::status() const { + return rep_->status; +} + +Status TableBuilder::Finish() { + Rep* r = rep_; + Flush(); + assert(!r->closed); + r->closed = true; + BlockHandle metaindex_block_handle; + BlockHandle index_block_handle; + if (ok()) { + BlockBuilder meta_index_block(&r->options); + // TODO(postrelease): Add stats and other meta blocks + WriteBlock(&meta_index_block, &metaindex_block_handle); + } + if (ok()) { + if (r->pending_index_entry) { + r->options.comparator->FindShortSuccessor(&r->last_key); + std::string handle_encoding; + r->pending_handle.EncodeTo(&handle_encoding); + r->index_block.Add(r->last_key, Slice(handle_encoding)); + r->pending_index_entry = false; + } + WriteBlock(&r->index_block, &index_block_handle); + } + if (ok()) { + Footer footer; + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(index_block_handle); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + r->status = r->file->Append(footer_encoding); + if (r->status.ok()) { + r->offset += footer_encoding.size(); + } + } + return r->status; +} + +void TableBuilder::Abandon() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; +} + +uint64_t TableBuilder::NumEntries() const { + return rep_->num_entries; +} + +uint64_t TableBuilder::FileSize() const { + return rep_->offset; +} + +} diff --git a/table/table_test.cc b/table/table_test.cc new file mode 100644 index 0000000..4b3e85e --- /dev/null +++ b/table/table_test.cc @@ -0,0 +1,841 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/table.h" + +#include +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "leveldb/table_builder.h" +#include "table/block.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +// Return reverse of "key". +// Used to test non-lexicographic comparators. 
+static std::string Reverse(const Slice& key) { + std::string str(key.ToString()); + std::string rev(str.rbegin(), str.rend()); + return rev; +} + +namespace { +class ReverseKeyComparator : public Comparator { + public: + virtual const char* Name() const { + return "leveldb.ReverseBytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + std::string s = Reverse(*start); + std::string l = Reverse(limit); + BytewiseComparator()->FindShortestSeparator(&s, l); + *start = Reverse(s); + } + + virtual void FindShortSuccessor(std::string* key) const { + std::string s = Reverse(*key); + BytewiseComparator()->FindShortSuccessor(&s); + *key = Reverse(s); + } +}; +} +static ReverseKeyComparator reverse_key_comparator; + +static void Increment(const Comparator* cmp, std::string* key) { + if (cmp == BytewiseComparator()) { + key->push_back('\0'); + } else { + assert(cmp == &reverse_key_comparator); + std::string rev = Reverse(*key); + rev.push_back('\0'); + *key = Reverse(rev); + } +} + +// An STL comparator that uses a Comparator +namespace { +struct STLLessThan { + const Comparator* cmp; + + STLLessThan() : cmp(BytewiseComparator()) { } + STLLessThan(const Comparator* c) : cmp(c) { } + bool operator()(const std::string& a, const std::string& b) const { + return cmp->Compare(Slice(a), Slice(b)) < 0; + } +}; +} + +class StringSink: public WritableFile { + public: + ~StringSink() { } + + const std::string& contents() const { return contents_; } + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + + virtual Status Append(const Slice& data) { + contents_.append(data.data(), data.size()); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class StringSource: public RandomAccessFile { + public: + StringSource(const Slice& contents) + : contents_(contents.data(), contents.size()) { + } + + virtual ~StringSource() { } + + uint64_t Size() const { return contents_.size(); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (offset > contents_.size()) { + return Status::InvalidArgument("invalid Read offset"); + } + if (offset + n > contents_.size()) { + n = contents_.size() - offset; + } + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + return Status::OK(); + } + + private: + std::string contents_; +}; + +typedef std::map KVMap; + +// Helper class for tests to unify the interface between +// BlockBuilder/TableBuilder and Block/Table. +class Constructor { + public: + explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { } + virtual ~Constructor() { } + + void Add(const std::string& key, const Slice& value) { + data_[key] = value.ToString(); + } + + // Finish constructing the data structure with all the keys that have + // been added so far. 
Returns the keys in sorted order in "*keys" + // and stores the key/value pairs in "*kvmap" + void Finish(const Options& options, + std::vector* keys, + KVMap* kvmap) { + *kvmap = data_; + keys->clear(); + for (KVMap::const_iterator it = data_.begin(); + it != data_.end(); + ++it) { + keys->push_back(it->first); + } + data_.clear(); + Status s = FinishImpl(options, *kvmap); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + // Construct the data structure from the data in "data" + virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; + + virtual size_t NumBytes() const = 0; + + virtual Iterator* NewIterator() const = 0; + + virtual const KVMap& data() { return data_; } + + virtual DB* db() const { return NULL; } // Overridden in DBConstructor + + private: + KVMap data_; +}; + +class BlockConstructor: public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp), + block_size_(-1), + block_(NULL) { } + ~BlockConstructor() { + delete block_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete block_; + block_ = NULL; + BlockBuilder builder(&options); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + builder.Add(it->first, it->second); + } + // Open the block + Slice block_data = builder.Finish(); + block_size_ = block_data.size(); + char* block_data_copy = new char[block_size_]; + memcpy(block_data_copy, block_data.data(), block_size_); + block_ = new Block(block_data_copy, block_size_); + return Status::OK(); + } + virtual size_t NumBytes() const { return block_size_; } + + virtual Iterator* NewIterator() const { + return block_->NewIterator(comparator_); + } + + private: + const Comparator* comparator_; + int block_size_; + Block* block_; + + BlockConstructor(); +}; + +class TableConstructor: public Constructor { + public: + TableConstructor(const Comparator* cmp) + : Constructor(cmp), + source_(NULL), table_(NULL) { + } + ~TableConstructor() { + Reset(); + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + Reset(); + StringSink sink; + TableBuilder builder(options, &sink); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + builder.Add(it->first, it->second); + ASSERT_TRUE(builder.status().ok()); + } + Status s = builder.Finish(); + ASSERT_TRUE(s.ok()) << s.ToString(); + + ASSERT_EQ(sink.contents().size(), builder.FileSize()); + + // Open the table + source_ = new StringSource(sink.contents()); + Options table_options; + table_options.comparator = options.comparator; + return Table::Open(table_options, source_, sink.contents().size(), &table_); + } + virtual size_t NumBytes() const { return source_->Size(); } + + virtual Iterator* NewIterator() const { + return table_->NewIterator(ReadOptions()); + } + + uint64_t ApproximateOffsetOf(const Slice& key) const { + return table_->ApproximateOffsetOf(key); + } + + private: + void Reset() { + delete table_; + delete source_; + table_ = NULL; + source_ = NULL; + } + + StringSource* source_; + Table* table_; + + TableConstructor(); +}; + +// A helper class that converts internal format keys into user keys +class KeyConvertingIterator: public Iterator { + public: + explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } + virtual ~KeyConvertingIterator() { delete iter_; } + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& target) { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + 
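// [Editorial note, not part of the original patch: AppendInternalKey below
//  emits user_key followed by an 8-byte trailer packing
//  (sequence << 8) | type. Internal keys with the same user key sort by
//  decreasing sequence number, so seeking with kMaxSequenceNumber positions
//  the iterator at the newest entry whose user key is >= target.]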
std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + + virtual Slice key() const { + assert(Valid()); + ParsedInternalKey key; + if (!ParseInternalKey(iter_->key(), &key)) { + status_ = Status::Corruption("malformed internal key"); + return Slice("corrupted key"); + } + return key.user_key; + } + + virtual Slice value() const { return iter_->value(); } + virtual Status status() const { + return status_.ok() ? iter_->status() : status_; + } + + private: + mutable Status status_; + Iterator* iter_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +class MemTableConstructor: public Constructor { + public: + explicit MemTableConstructor(const Comparator* cmp) + : Constructor(cmp), + internal_comparator_(cmp) { + memtable_ = new MemTable(internal_comparator_); + } + ~MemTableConstructor() { + delete memtable_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete memtable_; + memtable_ = new MemTable(internal_comparator_); + int seq = 1; + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + memtable_->Add(seq, kTypeValue, it->first, it->second); + seq++; + } + return Status::OK(); + } + virtual size_t NumBytes() const { + return memtable_->ApproximateMemoryUsage(); + } + + virtual Iterator* NewIterator() const { + return new KeyConvertingIterator(memtable_->NewIterator()); + } + + private: + InternalKeyComparator internal_comparator_; + MemTable* memtable_; +}; + +class DBConstructor: public Constructor { + public: + explicit DBConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp) { + db_ = NULL; + NewDB(); + } + ~DBConstructor() { + delete db_; + } + virtual Status FinishImpl(const Options& options, const KVMap& data) { + delete db_; + db_ = NULL; + NewDB(); + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + WriteBatch batch; + batch.Put(it->first, it->second); + ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); + } + return Status::OK(); + } + virtual size_t NumBytes() const { + Range r("", "\xff\xff"); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + virtual Iterator* NewIterator() const { + return db_->NewIterator(ReadOptions()); + } + + virtual DB* db() const { return db_; } + + private: + void NewDB() { + std::string name = test::TmpDir() + "/table_testdb"; + + Options options; + options.comparator = comparator_; + Status status = DestroyDB(name, options); + ASSERT_TRUE(status.ok()) << status.ToString(); + + options.create_if_missing = true; + options.error_if_exists = true; + options.write_buffer_size = 10000; // Something small to force merging + status = DB::Open(options, name, &db_); + ASSERT_TRUE(status.ok()) << status.ToString(); + } + + const Comparator* comparator_; + DB* db_; +}; + +enum TestType { + TABLE_TEST, + BLOCK_TEST, + MEMTABLE_TEST, + DB_TEST, +}; + +struct TestArgs { + TestType type; + bool reverse_compare; + int restart_interval; +}; + +static const TestArgs kTestArgList[] = { + { TABLE_TEST, false, 16 }, + { TABLE_TEST, false, 1 }, + { TABLE_TEST, false, 1024 }, + { TABLE_TEST, true, 16 }, + { TABLE_TEST, true, 1 }, + { TABLE_TEST, true, 1024 }, + + { BLOCK_TEST, false, 16 }, + { BLOCK_TEST, false, 1 }, + { BLOCK_TEST, 
false, 1024 }, + { BLOCK_TEST, true, 16 }, + { BLOCK_TEST, true, 1 }, + { BLOCK_TEST, true, 1024 }, + + // Restart interval does not matter for memtables + { MEMTABLE_TEST, false, 16 }, + { MEMTABLE_TEST, true, 16 }, + + // Do not bother with restart interval variations for DB + { DB_TEST, false, 16 }, + { DB_TEST, true, 16 }, +}; +static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]); + +class Harness { + public: + Harness() : constructor_(NULL) { } + + void Init(const TestArgs& args) { + delete constructor_; + constructor_ = NULL; + options_ = Options(); + + options_.block_restart_interval = args.restart_interval; + // Use shorter block size for tests to exercise block boundary + // conditions more. + options_.block_size = 256; + if (args.reverse_compare) { + options_.comparator = &reverse_key_comparator; + } + switch (args.type) { + case TABLE_TEST: + constructor_ = new TableConstructor(options_.comparator); + break; + case BLOCK_TEST: + constructor_ = new BlockConstructor(options_.comparator); + break; + case MEMTABLE_TEST: + constructor_ = new MemTableConstructor(options_.comparator); + break; + case DB_TEST: + constructor_ = new DBConstructor(options_.comparator); + break; + } + } + + ~Harness() { + delete constructor_; + } + + void Add(const std::string& key, const std::string& value) { + constructor_->Add(key, value); + } + + void Test(Random* rnd) { + std::vector keys; + KVMap data; + constructor_->Finish(options_, &keys, &data); + + TestForwardScan(keys, data); + TestBackwardScan(keys, data); + TestRandomAccess(rnd, keys, data); + } + + void TestForwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToFirst(); + for (KVMap::const_iterator model_iter = data.begin(); + model_iter != data.end(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Next(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestBackwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToLast(); + for (KVMap::const_reverse_iterator model_iter = data.rbegin(); + model_iter != data.rend(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Prev(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestRandomAccess(Random* rnd, + const std::vector& keys, + const KVMap& data) { + static const bool kVerbose = false; + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + KVMap::const_iterator model_iter = data.begin(); + if (kVerbose) fprintf(stderr, "---\n"); + for (int i = 0; i < 200; i++) { + const int toss = rnd->Uniform(5); + switch (toss) { + case 0: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Next\n"); + iter->Next(); + ++model_iter; + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 1: { + if (kVerbose) fprintf(stderr, "SeekToFirst\n"); + iter->SeekToFirst(); + model_iter = data.begin(); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 2: { + std::string key = PickRandomKey(rnd, keys); + model_iter = data.lower_bound(key); + if (kVerbose) fprintf(stderr, "Seek '%s'\n", + EscapeString(key).c_str()); + iter->Seek(Slice(key)); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 3: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Prev\n"); + iter->Prev(); + if (model_iter 
== data.begin()) { + model_iter = data.end(); // Wrap around to invalid value + } else { + --model_iter; + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 4: { + if (kVerbose) fprintf(stderr, "SeekToLast\n"); + iter->SeekToLast(); + if (keys.empty()) { + model_iter = data.end(); + } else { + std::string last = data.rbegin()->first; + model_iter = data.lower_bound(last); + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + } + } + delete iter; + } + + std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { + if (it == data.end()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const KVMap& data, + const KVMap::const_reverse_iterator& it) { + if (it == data.rend()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const Iterator* it) { + if (!it->Valid()) { + return "END"; + } else { + return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; + } + } + + std::string PickRandomKey(Random* rnd, const std::vector& keys) { + if (keys.empty()) { + return "foo"; + } else { + const int index = rnd->Uniform(keys.size()); + std::string result = keys[index]; + switch (rnd->Uniform(3)) { + case 0: + // Return an existing key + break; + case 1: { + // Attempt to return something smaller than an existing key + if (result.size() > 0 && result[result.size()-1] > '\0') { + result[result.size()-1]--; + } + break; + } + case 2: { + // Return something larger than an existing key + Increment(options_.comparator, &result); + break; + } + } + return result; + } + } + + // Returns NULL if not running against a DB + DB* db() const { return constructor_->db(); } + + private: + Options options_; + Constructor* constructor_; +}; + +// Test the empty key +TEST(Harness, SimpleEmptyKey) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSingle) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleMulti) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSpecialKey) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); + } +} + +TEST(Harness, Randomized) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ? 
1 : 200)) { + if ((num_entries % 10) == 0) { + fprintf(stderr, "case %d of %d: num_entries = %d\n", + (i + 1), int(kNumTestArgs), num_entries); + } + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + } + } +} + +TEST(Harness, RandomizedLongDB) { + Random rnd(test::RandomSeed()); + TestArgs args = { DB_TEST, false, 16 }; + Init(args); + int num_entries = 100000; + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + + // We must have created enough data to force merging + std::string l0_files, l1_files; + ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level0", &l0_files)); + ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level1", &l1_files)); + ASSERT_GT(atoi(l0_files.c_str()) + atoi(l1_files.c_str()), 0); + +} + +class MemTableTest { }; + +TEST(MemTableTest, Simple) { + InternalKeyComparator cmp(BytewiseComparator()); + MemTable memtable(cmp); + WriteBatch batch; + WriteBatchInternal::SetSequence(&batch, 100); + batch.Put(std::string("k1"), std::string("v1")); + batch.Put(std::string("k2"), std::string("v2")); + batch.Put(std::string("k3"), std::string("v3")); + batch.Put(std::string("largekey"), std::string("vlarge")); + ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok()); + + Iterator* iter = memtable.NewIterator(); + iter->SeekToFirst(); + while (iter->Valid()) { + fprintf(stderr, "key: '%s' -> '%s'\n", + iter->key().ToString().c_str(), + iter->value().ToString().c_str()); + iter->Next(); + } + + delete iter; +} + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +class TableTest { }; + +TEST(TableTest, ApproximateOffsetOfPlain) { + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + Options options; + options.block_size = 1024; + options.compression = kNoCompression; + c.Finish(options, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000)); + +} + +static bool SnappyCompressionSupported() { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(in.data(), in.size(), &out); +} + +TEST(TableTest, ApproximateOffsetOfCompressed) { + if (!SnappyCompressionSupported()) { + 
fprintf(stderr, "skipping compression tests\n"); + return; + } + + Random rnd(301); + TableConstructor c(BytewiseComparator()); + std::string tmp; + c.Add("k01", "hello"); + c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + c.Add("k03", "hello3"); + c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + std::vector keys; + KVMap kvmap; + Options options; + options.block_size = 1024; + options.compression = kSnappyCompression; + c.Finish(options, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc new file mode 100644 index 0000000..24a1241 --- /dev/null +++ b/table/two_level_iterator.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/two_level_iterator.h" + +#include "leveldb/table.h" +#include "table/block.h" +#include "table/format.h" +#include "table/iterator_wrapper.h" + +namespace leveldb { + +namespace { + +typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&); + +class TwoLevelIterator: public Iterator { + public: + TwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options); + + virtual ~TwoLevelIterator(); + + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + virtual void Next(); + virtual void Prev(); + + virtual bool Valid() const { + return data_iter_.Valid(); + } + virtual Slice key() const { + assert(Valid()); + return data_iter_.key(); + } + virtual Slice value() const { + assert(Valid()); + return data_iter_.value(); + } + virtual Status status() const { + // It'd be nice if status() returned a const Status& instead of a Status + if (!index_iter_.status().ok()) { + return index_iter_.status(); + } else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) { + return data_iter_.status(); + } else { + return status_; + } + } + + private: + void SaveError(const Status& s) { + if (status_.ok() && !s.ok()) status_ = s; + } + void SkipEmptyDataBlocksForward(); + void SkipEmptyDataBlocksBackward(); + void SetDataIterator(Iterator* data_iter); + void InitDataBlock(); + + BlockFunction block_function_; + void* arg_; + const ReadOptions options_; + Status status_; + IteratorWrapper index_iter_; + IteratorWrapper data_iter_; // May be NULL + // If data_iter_ is non-NULL, then "data_block_handle_" holds the + // "index_value" passed to block_function_ to create the data_iter_. 
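// [Editorial note, not part of the original patch: InitDataBlock() compares
//  the current index value against data_block_handle_, so a reposition that
//  lands in the block that is already open reuses data_iter_ instead of
//  calling block_function_ (and possibly rereading the block) again.]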
+ std::string data_block_handle_; +}; + +TwoLevelIterator::TwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options) + : block_function_(block_function), + arg_(arg), + options_(options), + index_iter_(index_iter), + data_iter_(NULL) { +} + +TwoLevelIterator::~TwoLevelIterator() { +} + +void TwoLevelIterator::Seek(const Slice& target) { + index_iter_.Seek(target); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.Seek(target); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToFirst() { + index_iter_.SeekToFirst(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToLast() { + index_iter_.SeekToLast(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); + SkipEmptyDataBlocksBackward(); +} + +void TwoLevelIterator::Next() { + assert(Valid()); + data_iter_.Next(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::Prev() { + assert(Valid()); + data_iter_.Prev(); + SkipEmptyDataBlocksBackward(); +} + + +void TwoLevelIterator::SkipEmptyDataBlocksForward() { + while (data_iter_.iter() == NULL || !data_iter_.Valid()) { + // Move to next block + if (!index_iter_.Valid()) { + SetDataIterator(NULL); + return; + } + index_iter_.Next(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToFirst(); + } +} + +void TwoLevelIterator::SkipEmptyDataBlocksBackward() { + while (data_iter_.iter() == NULL || !data_iter_.Valid()) { + // Move to next block + if (!index_iter_.Valid()) { + SetDataIterator(NULL); + return; + } + index_iter_.Prev(); + InitDataBlock(); + if (data_iter_.iter() != NULL) data_iter_.SeekToLast(); + } +} + +void TwoLevelIterator::SetDataIterator(Iterator* data_iter) { + if (data_iter_.iter() != NULL) SaveError(data_iter_.status()); + data_iter_.Set(data_iter); +} + +void TwoLevelIterator::InitDataBlock() { + if (!index_iter_.Valid()) { + SetDataIterator(NULL); + } else { + Slice handle = index_iter_.value(); + if (data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0) { + // data_iter_ is already constructed with this iterator, so + // no need to change anything + } else { + Iterator* iter = (*block_function_)(arg_, options_, handle); + data_block_handle_.assign(handle.data(), handle.size()); + SetDataIterator(iter); + } + } +} + +} + +Iterator* NewTwoLevelIterator( + Iterator* index_iter, + BlockFunction block_function, + void* arg, + const ReadOptions& options) { + return new TwoLevelIterator(index_iter, block_function, arg, options); +} + +} diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h new file mode 100644 index 0000000..5909e2b --- /dev/null +++ b/table/two_level_iterator.h @@ -0,0 +1,34 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ +#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ + +#include "leveldb/iterator.h" + +namespace leveldb { + +struct ReadOptions; + +// Return a new two level iterator. A two-level iterator contains an +// index iterator whose values point to a sequence of blocks where +// each block is itself a sequence of key,value pairs. The returned +// two-level iterator yields the concatenation of all key/value pairs +// in the sequence of blocks. 
Takes ownership of "index_iter" and +// will delete it when no longer needed. +// +// Uses a supplied function to convert an index_iter value into +// an iterator over the contents of the corresponding block. +extern Iterator* NewTwoLevelIterator( + Iterator* index_iter, + Iterator* (*block_function)( + void* arg, + const ReadOptions& options, + const Slice& index_value), + void* arg, + const ReadOptions& options); + +} + +#endif // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ diff --git a/util/arena.cc b/util/arena.cc new file mode 100644 index 0000000..4bf6e36 --- /dev/null +++ b/util/arena.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" +#include + +namespace leveldb { + +static const int kBlockSize = 4096; + +Arena::Arena() { + blocks_memory_ = 0; + alloc_ptr_ = NULL; // First allocation will allocate a block + alloc_bytes_remaining_ = 0; +} + +Arena::~Arena() { + for (int i = 0; i < blocks_.size(); i++) { + delete[] blocks_[i]; + } +} + +char* Arena::AllocateFallback(size_t bytes) { + if (bytes > kBlockSize / 4) { + // Object is more than a quarter of our block size. Allocate it separately + // to avoid wasting too much space in leftover bytes. + char* result = AllocateNewBlock(bytes); + return result; + } + + // We waste the remaining space in the current block. + alloc_ptr_ = AllocateNewBlock(kBlockSize); + alloc_bytes_remaining_ = kBlockSize; + + char* result = alloc_ptr_; + alloc_ptr_ += bytes; + alloc_bytes_remaining_ -= bytes; + return result; +} + +char* Arena::AllocateAligned(size_t bytes) { + const int align = sizeof(void*); // We'll align to pointer size + assert((align & (align-1)) == 0); // Pointer size should be a power of 2 + size_t current_mod = reinterpret_cast(alloc_ptr_) & (align-1); + size_t slop = (current_mod == 0 ? 0 : align - current_mod); + size_t needed = bytes + slop; + char* result; + if (needed <= alloc_bytes_remaining_) { + result = alloc_ptr_ + slop; + alloc_ptr_ += needed; + alloc_bytes_remaining_ -= needed; + } else { + // AllocateFallback always returned aligned memory + result = AllocateFallback(bytes); + } + assert((reinterpret_cast(result) & (align-1)) == 0); + return result; +} + +char* Arena::AllocateNewBlock(size_t block_bytes) { + char* result = new char[block_bytes]; + blocks_memory_ += block_bytes; + blocks_.push_back(result); + return result; +} + +} diff --git a/util/arena.h b/util/arena.h new file mode 100644 index 0000000..fcb5d5b --- /dev/null +++ b/util/arena.h @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ +#define STORAGE_LEVELDB_UTIL_ARENA_H_ + +#include +#include +#include +#include + +namespace leveldb { + +class Arena { + public: + Arena(); + ~Arena(); + + // Return a pointer to a newly allocated memory block of "bytes" bytes. + char* Allocate(size_t bytes); + + // Allocate memory with the normal alignment guarantees provided by malloc + char* AllocateAligned(size_t bytes); + + // Returns an estimate of the total memory usage of data allocated + // by the arena (including space allocated but not yet used for user + // allocations). 
+ size_t MemoryUsage() const { + return blocks_memory_ + blocks_.capacity() * sizeof(char*); + } + + private: + char* AllocateFallback(size_t bytes); + char* AllocateNewBlock(size_t block_bytes); + + // Allocation state + char* alloc_ptr_; + size_t alloc_bytes_remaining_; + + // Array of new[] allocated memory blocks + std::vector blocks_; + + // Bytes of memory in blocks allocated so far + size_t blocks_memory_; + + // No copying allowed + Arena(const Arena&); + void operator=(const Arena&); +}; + +inline char* Arena::Allocate(size_t bytes) { + // The semantics of what to return are a bit messy if we allow + // 0-byte allocations, so we disallow them here (we don't need + // them for our internal use). + assert(bytes > 0); + if (bytes <= alloc_bytes_remaining_) { + char* result = alloc_ptr_; + alloc_ptr_ += bytes; + alloc_bytes_remaining_ -= bytes; + return result; + } + return AllocateFallback(bytes); +} + +} + +#endif // STORAGE_LEVELDB_UTIL_ARENA_H_ diff --git a/util/arena_test.cc b/util/arena_test.cc new file mode 100644 index 0000000..c33b552 --- /dev/null +++ b/util/arena_test.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" + +#include "util/random.h" +#include "util/testharness.h" + +namespace leveldb { + +class ArenaTest { }; + +TEST(ArenaTest, Empty) { + Arena arena; +} + +TEST(ArenaTest, Simple) { + std::vector > allocated; + Arena arena; + const int N = 100000; + size_t bytes = 0; + Random rnd(301); + for (int i = 0; i < N; i++) { + size_t s; + if (i % (N / 10) == 0) { + s = i; + } else { + s = rnd.OneIn(4000) ? rnd.Uniform(6000) : + (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + } + if (s == 0) { + // Our arena disallows size 0 allocations. + s = 1; + } + char* r; + if (rnd.OneIn(10)) { + r = arena.AllocateAligned(s); + } else { + r = arena.Allocate(s); + } + + for (int b = 0; b < s; b++) { + // Fill the "i"th allocation with a known bit pattern + r[b] = i % 256; + } + bytes += s; + allocated.push_back(std::make_pair(s, r)); + ASSERT_GE(arena.MemoryUsage(), bytes); + if (i > N/10) { + ASSERT_LE(arena.MemoryUsage(), bytes * 1.10); + } + } + for (int i = 0; i < allocated.size(); i++) { + size_t num_bytes = allocated[i].first; + const char* p = allocated[i].second; + for (int b = 0; b < num_bytes; b++) { + // Check the "i"th allocation for the known bit pattern + ASSERT_EQ(int(p[b]) & 0xff, i % 256); + } + } +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/cache.cc b/util/cache.cc new file mode 100644 index 0000000..d8a4426 --- /dev/null +++ b/util/cache.cc @@ -0,0 +1,253 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) +#include +#elif defined(LEVELDB_PLATFORM_CHROMIUM) +#include "base/hash_tables.h" +#else +#include // TODO(sanjay): Switch to unordered_set when possible. +#endif + +#include + +#include "leveldb/cache.h" +#include "port/port.h" +#include "util/hash.h" +#include "util/mutexlock.h" + +namespace leveldb { + +Cache::~Cache() { +} + +namespace { + +// LRU cache implementation + +// An entry is a variable length heap-allocated structure. 
Entries
+// are kept in a circular doubly linked list ordered by access time.
+struct LRUHandle {
+  void* value;
+  void (*deleter)(const Slice&, void* value);
+  LRUHandle* next;
+  LRUHandle* prev;
+  size_t charge;      // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  size_t refs;        // TODO(opt): Pack with "key_length"?
+  char key_data[1];   // Beginning of key
+
+  Slice key() const {
+    // For cheaper lookups, we allow a temporary Handle object
+    // to store a pointer to a key in "value".
+    if (next == this) {
+      return *(reinterpret_cast<Slice*>(value));
+    } else {
+      return Slice(key_data, key_length);
+    }
+  }
+};
+
+// Pick a platform specific hash_set instantiation
+#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN)
+  // Microsoft's hash_set deviates from the standard. See
+  // http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx
+  // for details. Basically the 2 param () operator is a less than and
+  // the 1 param () operator is a hash function.
+  struct HandleHashCompare : public stdext::hash_compare<LRUHandle*> {
+    size_t operator() (LRUHandle* h) const {
+      Slice k = h->key();
+      return Hash(k.data(), k.size(), 0);
+    }
+    bool operator() (LRUHandle* a, LRUHandle* b) const {
+      return a->key().compare(b->key()) < 0;
+    }
+  };
+  typedef base::hash_set<LRUHandle*, HandleHashCompare> HandleTable;
+#else
+  struct HandleHash {
+    inline size_t operator()(LRUHandle* h) const {
+      Slice k = h->key();
+      return Hash(k.data(), k.size(), 0);
+    }
+  };
+
+  struct HandleEq {
+    inline bool operator()(LRUHandle* a, LRUHandle* b) const {
+      return a->key() == b->key();
+    }
+  };
+# if defined(LEVELDB_PLATFORM_CHROMIUM)
+  typedef base::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID)
+  typedef std::unordered_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# else
+  typedef __gnu_cxx::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# endif
+#endif
+
+class LRUCache : public Cache {
+ public:
+  explicit LRUCache(size_t capacity);
+  virtual ~LRUCache();
+
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value));
+  virtual Handle* Lookup(const Slice& key);
+  virtual void Release(Handle* handle);
+  virtual void* Value(Handle* handle);
+  virtual void Erase(const Slice& key);
+  virtual uint64_t NewId();
+
+ private:
+  void LRU_Remove(LRUHandle* e);
+  void LRU_Append(LRUHandle* e);
+  void Unref(LRUHandle* e);
+
+  // Constructor parameters
+  const size_t capacity_;
+
+  // mutex_ protects the following state.
+  port::Mutex mutex_;
+  size_t usage_;
+  uint64_t last_id_;
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
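// [Editorial example, not part of the original patch: after Insert(A),
//  Insert(B), Insert(C) with no intervening lookups, the ring is
//      lru_.next == &A (oldest, evicted first)  ...  lru_.prev == &C (newest)
//  and a later Lookup(A) re-appends A at the lru_.prev end.]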
+ LRUHandle lru_; + + HandleTable table_; +}; + +LRUCache::LRUCache(size_t capacity) + : capacity_(capacity), + usage_(0), + last_id_(0) { + // Make empty circular linked list + lru_.next = &lru_; + lru_.prev = &lru_; +} + +LRUCache::~LRUCache() { + table_.clear(); + for (LRUHandle* e = lru_.next; e != &lru_; ) { + LRUHandle* next = e->next; + assert(e->refs == 1); // Error if caller has an unreleased handle + Unref(e); + e = next; + } +} + +void LRUCache::Unref(LRUHandle* e) { + assert(e->refs > 0); + e->refs--; + if (e->refs <= 0) { + usage_ -= e->charge; + (*e->deleter)(e->key(), e->value); + free(e); + } +} + +void LRUCache::LRU_Remove(LRUHandle* e) { + e->next->prev = e->prev; + e->prev->next = e->next; +} + +void LRUCache::LRU_Append(LRUHandle* e) { + // Make "e" newest entry by inserting just before lru_ + e->next = &lru_; + e->prev = lru_.prev; + e->prev->next = e; + e->next->prev = e; +} + +Cache::Handle* LRUCache::Lookup(const Slice& key) { + MutexLock l(&mutex_); + + LRUHandle dummy; + dummy.next = &dummy; + dummy.value = const_cast(&key); + HandleTable::iterator iter = table_.find(&dummy); + if (iter == table_.end()) { + return NULL; + } else { + LRUHandle* e = const_cast(*iter); + e->refs++; + LRU_Remove(e); + LRU_Append(e); + return reinterpret_cast(e); + } +} + +void* LRUCache::Value(Handle* handle) { + return reinterpret_cast(handle)->value; +} + +void LRUCache::Release(Handle* handle) { + MutexLock l(&mutex_); + Unref(reinterpret_cast(handle)); +} + +Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) { + MutexLock l(&mutex_); + + LRUHandle* e = reinterpret_cast( + malloc(sizeof(LRUHandle)-1 + key.size())); + e->value = value; + e->deleter = deleter; + e->charge = charge; + e->key_length = key.size(); + e->refs = 2; // One from LRUCache, one for the returned handle + memcpy(e->key_data, key.data(), key.size()); + LRU_Append(e); + usage_ += charge; + + std::pair p = table_.insert(e); + if (!p.second) { + // Kill existing entry + LRUHandle* old = const_cast(*(p.first)); + LRU_Remove(old); + table_.erase(p.first); + table_.insert(e); + Unref(old); + } + + while (usage_ > capacity_ && lru_.next != &lru_) { + LRUHandle* old = lru_.next; + LRU_Remove(old); + table_.erase(old); + Unref(old); + } + + return reinterpret_cast(e); +} + +void LRUCache::Erase(const Slice& key) { + MutexLock l(&mutex_); + + LRUHandle dummy; + dummy.next = &dummy; + dummy.value = const_cast(&key); + HandleTable::iterator iter = table_.find(&dummy); + if (iter != table_.end()) { + LRUHandle* e = const_cast(*iter); + LRU_Remove(e); + table_.erase(iter); + Unref(e); + } +} + +uint64_t LRUCache::NewId() { + MutexLock l(&mutex_); + return ++(last_id_); +} + +} // end anonymous namespace + +Cache* NewLRUCache(size_t capacity) { + return new LRUCache(capacity); +} + +} diff --git a/util/cache_test.cc b/util/cache_test.cc new file mode 100644 index 0000000..dbab988 --- /dev/null +++ b/util/cache_test.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/cache.h" + +#include +#include "util/coding.h" +#include "util/testharness.h" + +namespace leveldb { + +// Conversions between numeric keys/values and the types expected by Cache. 
+static std::string EncodeKey(int k) { + std::string result; + PutFixed32(&result, k); + return result; +} +static int DecodeKey(const Slice& k) { + assert(k.size() == 4); + return DecodeFixed32(k.data()); +} +static void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } +static int DecodeValue(void* v) { return reinterpret_cast(v); } + +class CacheTest { + public: + static CacheTest* current_; + + static void Deleter(const Slice& key, void* v) { + current_->deleted_keys_.push_back(DecodeKey(key)); + current_->deleted_values_.push_back(DecodeValue(v)); + } + + static const int kCacheSize = 100; + std::vector deleted_keys_; + std::vector deleted_values_; + Cache* cache_; + + CacheTest() : cache_(NewLRUCache(kCacheSize)) { + current_ = this; + } + + ~CacheTest() { + delete cache_; + } + + int Lookup(int key) { + Cache::Handle* handle = cache_->Lookup(EncodeKey(key)); + const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle)); + if (handle != NULL) { + cache_->Release(handle); + } + return r; + } + + void Insert(int key, int value, int charge = 1) { + cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter)); + } + + void Erase(int key) { + cache_->Erase(EncodeKey(key)); + } +}; +CacheTest* CacheTest::current_; + +TEST(CacheTest, HitAndMiss) { + ASSERT_EQ(-1, Lookup(100)); + + Insert(100, 101); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(200, 201); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(100, 102); + ASSERT_EQ(102, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + +TEST(CacheTest, Erase) { + Erase(200); + ASSERT_EQ(0, deleted_keys_.size()); + + Insert(100, 101); + Insert(200, 201); + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1, deleted_keys_.size()); +} + +TEST(CacheTest, EntriesArePinned) { + Insert(100, 101); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + + Insert(100, 102); + Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); + ASSERT_EQ(0, deleted_keys_.size()); + + cache_->Release(h1); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(1, deleted_keys_.size()); + + cache_->Release(h2); + ASSERT_EQ(2, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[1]); + ASSERT_EQ(102, deleted_values_[1]); +} + +TEST(CacheTest, EvictionPolicy) { + Insert(100, 101); + Insert(200, 201); + + // Frequently used entry must be kept around + for (int i = 0; i < kCacheSize; i++) { + Insert(1000+i, 2000+i); + ASSERT_EQ(2000+i, Lookup(1000+i)); + ASSERT_EQ(101, Lookup(100)); + } + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(2, deleted_keys_.size()); + ASSERT_EQ(200, deleted_keys_[0]); + ASSERT_EQ(201, deleted_values_[0]); +} + +TEST(CacheTest, HeavyEntry) { + Insert(100, 101); + Insert(200, 201, kCacheSize); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + 
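[Editorial sketch, not part of the original patch: a minimal standalone use
of the public Cache interface these tests exercise. "DeleteInt",
"CacheSketch", and the literal values are invented for the example; the API
calls are the ones declared in the cache.h header above.]

    #include "leveldb/cache.h"
    #include "leveldb/slice.h"

    // Deleter invoked by the cache once the entry is both evicted/erased
    // and no longer pinned by any outstanding handle.
    static void DeleteInt(const leveldb::Slice& key, void* v) {
      delete reinterpret_cast<int*>(v);
    }

    static void CacheSketch() {
      leveldb::Cache* cache = leveldb::NewLRUCache(100);  // capacity: 100 charge units
      // Insert() charges 1 unit, pins the entry, and returns a handle.
      leveldb::Cache::Handle* h =
          cache->Insert(leveldb::Slice("k"), new int(42), 1, &DeleteInt);
      int got = *reinterpret_cast<int*>(cache->Value(h));  // got == 42
      cache->Release(h);    // unpin; the entry stays cached until evicted
      cache->Erase(leveldb::Slice("k"));  // refs drop to 0: DeleteInt runs
      delete cache;
      (void)got;
    }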
+TEST(CacheTest, NewId) { + uint64_t a = cache_->NewId(); + uint64_t b = cache_->NewId(); + ASSERT_NE(a, b); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/coding.cc b/util/coding.cc new file mode 100644 index 0000000..680e2ad --- /dev/null +++ b/util/coding.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/coding.h" + +namespace leveldb { + +void EncodeFixed32(char* buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +void EncodeFixed64(char* buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + +void PutFixed32(std::string* dst, uint32_t value) { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +void PutFixed64(std::string* dst, uint64_t value) { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1<<7)) { + *(ptr++) = v; + } else if (v < (1<<14)) { + *(ptr++) = v | B; + *(ptr++) = v>>7; + } else if (v < (1<<21)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = v>>14; + } else if (v < (1<<28)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = v>>21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = (v>>21) | B; + *(ptr++) = v>>28; + } + return reinterpret_cast(ptr); +} + +void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +char* EncodeVarint64(char* dst, uint64_t v) { + static const int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B-1)) | B; + v >>= 7; + } + *(ptr++) = v; + return reinterpret_cast(ptr); +} + +void PutVarint64(std::string* dst, uint64_t v) { + char buf[10]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, value.size()); + dst->append(value.data(), value.size()); +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return NULL; +} + +bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + 
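// [Editorial refresher, not part of the original patch: a varint stores 7
//  payload bits per byte, least-significant group first, with the high bit
//  of each byte flagging "more bytes follow". For example 300 (binary
//  100101100) encodes as two bytes:
//      0xAC = 1_0101100   (low 7 bits, continuation bit set)
//      0x02 = 0_0000010   (remaining high bits)
//  GetVarint32Ptr below parses exactly this format, returning NULL on
//  truncated input or encodings longer than 5 bytes.]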
const char* q = GetVarint32Ptr(p, limit, value); + if (q == NULL) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return NULL; +} + +bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == NULL) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const char* GetLengthPrefixedSlice(const char* p, const char* limit, + Slice* result) { + uint32_t len; + p = GetVarint32Ptr(p, limit, &len); + if (p == NULL) return NULL; + if (p + len > limit) return NULL; + *result = Slice(p, len); + return p + len; +} + +bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len; + if (GetVarint32(input, &len) && + input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +} diff --git a/util/coding.h b/util/coding.h new file mode 100644 index 0000000..8755968 --- /dev/null +++ b/util/coding.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#ifndef STORAGE_LEVELDB_UTIL_CODING_H_ +#define STORAGE_LEVELDB_UTIL_CODING_H_ + +#include +#include +#include +#include "leveldb/slice.h" +#include "port/port.h" + +namespace leveldb { + +// Standard Put... routines append to a string +extern void PutFixed32(std::string* dst, uint32_t value); +extern void PutFixed64(std::string* dst, uint64_t value); +extern void PutVarint32(std::string* dst, uint32_t value); +extern void PutVarint64(std::string* dst, uint64_t value); +extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. +extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// NULL on error. These routines only look at bytes in the range +// [p..limit-1] +extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); +extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... 
diff --git a/util/coding_test.cc b/util/coding_test.cc
new file mode 100644
index 0000000..a8dba04
--- /dev/null
+++ b/util/coding_test.cc
@@ -0,0 +1,173 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class Coding { };
+
+TEST(Coding, Fixed32) {
+  std::string s;
+  for (uint32_t v = 0; v < 100000; v++) {
+    PutFixed32(&s, v);
+  }
+
+  const char* p = s.data();
+  for (uint32_t v = 0; v < 100000; v++) {
+    uint32_t actual = DecodeFixed32(p);
+    ASSERT_EQ(v, actual);
+    p += sizeof(uint32_t);
+  }
+}
+
+TEST(Coding, Fixed64) {
+  std::string s;
+  for (int power = 0; power <= 63; power++) {
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    PutFixed64(&s, v - 1);
+    PutFixed64(&s, v + 0);
+    PutFixed64(&s, v + 1);
+  }
+
+  const char* p = s.data();
+  for (int power = 0; power <= 63; power++) {
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    uint64_t actual;
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v-1, actual);
+    p += sizeof(uint64_t);
+
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v+0, actual);
+    p += sizeof(uint64_t);
+
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v+1, actual);
+    p += sizeof(uint64_t);
+  }
+}
+
+TEST(Coding, Varint32) {
+  std::string s;
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t v = (i / 32) << (i % 32);
+    PutVarint32(&s, v);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t expected = (i / 32) << (i % 32);
+    uint32_t actual;
+    const char* start = p;
+    p = GetVarint32Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != NULL);
+    ASSERT_EQ(expected, actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, s.data() + s.size());
+}
+
+TEST(Coding, Varint64) {
+  // Construct the list of values to check
+  std::vector<uint64_t> values;
+  // Some special values
+  values.push_back(0);
+  values.push_back(100);
+  values.push_back(~static_cast<uint64_t>(0));
+  values.push_back(~static_cast<uint64_t>(0) - 1);
+  for (uint32_t k = 0; k < 64; k++) {
+    // Test values near powers of two
+    const uint64_t power = 1ull << k;
+    values.push_back(power);
+    values.push_back(power-1);
+    values.push_back(power+1);
+  }
+
+  std::string s;
+  for (int i = 0; i < values.size(); i++) {
+    PutVarint64(&s, values[i]);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (int i = 0; i < values.size(); i++) {
+    ASSERT_TRUE(p < limit);
+    uint64_t actual;
+    const char* start = p;
+    p = GetVarint64Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != NULL);
+    ASSERT_EQ(values[i], actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, limit);
+}
+
+TEST(Coding, Varint32Overflow) {
+  uint32_t result;
+  std::string input("\x81\x82\x83\x84\x85\x11");
+  ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result)
+              == NULL);
+}
+
+TEST(Coding, Varint32Truncation) {
+  uint32_t large_value = (1u << 31) + 100;
+  std::string s;
+  PutVarint32(&s, large_value);
+  uint32_t result;
+  for (int len = 0; len < s.size() - 1; len++) {
+    ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL);
+  }
+  ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL);
+  ASSERT_EQ(large_value, result);
+}
+
+TEST(Coding, Varint64Overflow) {
+  uint64_t result;
+  std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
+  ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result)
+              == NULL);
+}
+
+TEST(Coding, Varint64Truncation) {
+  uint64_t large_value = (1ull << 63) + 100ull;
+  std::string s;
+  PutVarint64(&s, large_value);
+  uint64_t result;
+  for (int len = 0; len < s.size() - 1; len++) {
+    ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL);
+  }
+  ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL);
+  ASSERT_EQ(large_value, result);
+}
+
+TEST(Coding, Strings) {
+  std::string s;
+  PutLengthPrefixedSlice(&s, Slice(""));
+  PutLengthPrefixedSlice(&s, Slice("foo"));
+  PutLengthPrefixedSlice(&s, Slice("bar"));
+  PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x')));
+
+  Slice input(s);
+  Slice v;
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("foo", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("bar", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ(std::string(200, 'x'), v.ToString());
+  ASSERT_EQ("", input.ToString());
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
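As the Strings test above exercises, length-prefixed slices compose into simple record formats: each field is a varint length followed by that many raw bytes. A sketch of typical usage (the two-field record layout here is illustrative, not something the patch defines):

    #include <cassert>
    #include <string>
    #include "util/coding.h"

    int main() {
      // Pack two fields, each as varint length + bytes.
      std::string rec;
      leveldb::PutLengthPrefixedSlice(&rec, leveldb::Slice("key1"));
      leveldb::PutLengthPrefixedSlice(&rec, leveldb::Slice("value1"));

      // Unpack; each Get advances the input slice past the parsed field.
      leveldb::Slice in(rec);
      leveldb::Slice k, v;
      assert(leveldb::GetLengthPrefixedSlice(&in, &k));
      assert(leveldb::GetLengthPrefixedSlice(&in, &v));
      assert(k.ToString() == "key1" && v.ToString() == "value1");
      assert(in.empty());
      return 0;
    }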
diff --git a/util/comparator.cc b/util/comparator.cc
new file mode 100644
index 0000000..e2b27e3
--- /dev/null
+++ b/util/comparator.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include "leveldb/comparator.h"
+#include "leveldb/slice.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+Comparator::~Comparator() { }
+
+namespace {
+class BytewiseComparatorImpl : public Comparator {
+ public:
+  BytewiseComparatorImpl() { }
+
+  virtual const char* Name() const {
+    return "leveldb.BytewiseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return a.compare(b);
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    // Find length of common prefix
+    size_t min_length = std::min(start->size(), limit.size());
+    size_t diff_index = 0;
+    while ((diff_index < min_length) &&
+           ((*start)[diff_index] == limit[diff_index])) {
+      diff_index++;
+    }
+
+    if (diff_index >= min_length) {
+      // Do not shorten if one string is a prefix of the other
+    } else {
+      uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]);
+      if (diff_byte < static_cast<uint8_t>(0xff) &&
+          diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) {
+        (*start)[diff_index]++;
+        start->resize(diff_index + 1);
+        assert(Compare(*start, limit) < 0);
+      }
+    }
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {
+    // Find first character that can be incremented
+    size_t n = key->size();
+    for (int i = 0; i < n; i++) {
+      const uint8_t byte = (*key)[i];
+      if (byte != static_cast<uint8_t>(0xff)) {
+        (*key)[i] = byte + 1;
+        key->resize(i+1);
+        return;
+      }
+    }
+    // *key is a run of 0xffs.  Leave it alone.
+  }
+};
+}
+static const BytewiseComparatorImpl bytewise;
+
+const Comparator* BytewiseComparator() {
+  return &bytewise;
+}
+
+}
diff --git a/util/crc32c.cc b/util/crc32c.cc
new file mode 100644
index 0000000..28c2401
--- /dev/null
+++ b/util/crc32c.cc
@@ -0,0 +1,332 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A portable implementation of crc32c, optimized to handle
+// four bytes at a time.
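For orientation before the tables: CRC-32C is the Castagnoli CRC, reflected polynomial 0x82f63b78, and the four 256-entry tables below are the standard slicing-by-4 construction. As a reference point for what they compute, here is an equivalent bit-at-a-time version (a sketch for exposition only, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    // Reference CRC-32C, one bit per step. The table-driven Extend()
    // below computes the same function four bytes per step.
    uint32_t SlowExtend(uint32_t crc, const char* buf, size_t n) {
      uint32_t l = crc ^ 0xffffffffu;
      for (size_t i = 0; i < n; i++) {
        l ^= static_cast<uint8_t>(buf[i]);
        for (int b = 0; b < 8; b++) {
          // Shift one bit; xor in the polynomial if a 1 fell off the end.
          l = (l >> 1) ^ (0x82f63b78u & (-(l & 1)));
        }
      }
      return l ^ 0xffffffffu;
    }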
+ +#include "util/crc32c.h" + +#include +#include "util/coding.h" + +namespace leveldb { +namespace crc32c { + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 
0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 
0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 
0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + return DecodeFixed32(reinterpret_cast(p)); +} + +uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + const uint8_t *p = reinterpret_cast(buf); + const uint8_t *e = p + size; + uint32_t l = crc ^ 0xffffffffu; + +#define STEP1 do { \ + int c = (l & 0xff) ^ *p++; \ + l = table0_[c] ^ (l >> 8); \ +} while (0) +#define STEP4 do { \ + uint32_t c = l ^ LE_LOAD32(p); \ + p += 4; \ + l = table3_[c & 0xff] ^ \ + table2_[(c >> 8) & 0xff] ^ \ + table1_[(c >> 16) & 0xff] ^ \ + table0_[c >> 24]; \ +} while (0) + + // Point x at first 4-byte aligned byte in string. This might be + // just past the end of the string. 
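+  // Rounding up works by adding 3 and clearing the two low bits:
+  // ((pval + 3) >> 2) << 2, so e.g. addresses 5, 6, 7, and 8 all yield 8.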
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t* x = reinterpret_cast<const uint8_t*>(((pval + 3) >> 2) << 2);
+  if (x <= e) {
+    // Process bytes until finished or p is 4-byte aligned
+    while (p != x) {
+      STEP1;
+    }
+  }
+  // Process bytes 16 at a time
+  while ((e-p) >= 16) {
+    STEP4; STEP4; STEP4; STEP4;
+  }
+  // Process bytes 4 at a time
+  while ((e-p) >= 4) {
+    STEP4;
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP4
+#undef STEP1
+  return l ^ 0xffffffffu;
+}
+
+}
+}
diff --git a/util/crc32c.h b/util/crc32c.h
new file mode 100644
index 0000000..938d8ff
--- /dev/null
+++ b/util/crc32c.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_
+#define STORAGE_LEVELDB_UTIL_CRC32C_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace leveldb {
+namespace crc32c {
+
+// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
+// crc32c of some string A.  Extend() is often used to maintain the
+// crc32c of a stream of data.
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
+
+// Return the crc32c of data[0,n-1]
+inline uint32_t Value(const char* data, size_t n) {
+  return Extend(0, data, n);
+}
+
+static const uint32_t kMaskDelta = 0xa282ead8ul;
+
+// Return a masked representation of crc.
+//
+// Motivation: it is problematic to compute the CRC of a string that
+// contains embedded CRCs.  Therefore we recommend that CRCs stored
+// somewhere (e.g., in files) should be masked before being stored.
+inline uint32_t Mask(uint32_t crc) {
+  // Rotate right by 15 bits and add a constant.
+  return ((crc >> 15) | (crc << 17)) + kMaskDelta;
+}
+
+// Return the crc whose masked representation is masked_crc.
+inline uint32_t Unmask(uint32_t masked_crc) {
+  uint32_t rot = masked_crc - kMaskDelta;
+  return ((rot >> 17) | (rot << 15));
+}
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_UTIL_CRC32C_H_
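The masking above matters whenever a checksummed record can itself contain checksums, as the log format does. A short usage sketch (the payload string is arbitrary; only Value/Mask/Unmask are from the patch):

    #include <cassert>
    #include "util/crc32c.h"

    int main() {
      const char data[] = "some record payload";
      uint32_t crc = leveldb::crc32c::Value(data, sizeof(data) - 1);
      // Store the masked form so raw CRC bytes embedded in later data do
      // not weaken the outer CRC; unmask when verifying.
      uint32_t stored = leveldb::crc32c::Mask(crc);
      assert(leveldb::crc32c::Unmask(stored) == crc);
      return 0;
    }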
diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc
new file mode 100644
index 0000000..ba9e804
--- /dev/null
+++ b/util/crc32c_test.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/crc32c.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace crc32c {
+
+class CRC { };
+
+TEST(CRC, StandardResults) {
+  // From rfc3720 section B.4.
+  char buf[32];
+
+  memset(buf, 0, sizeof(buf));
+  ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf)));
+
+  memset(buf, 0xff, sizeof(buf));
+  ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = i;
+  }
+  ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = 31 - i;
+  }
+  ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf)));
+
+  unsigned char data[48] = {
+    0x01, 0xc0, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x04, 0x00,
+    0x00, 0x00, 0x00, 0x14,
+    0x00, 0x00, 0x00, 0x18,
+    0x28, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+  };
+  ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
+}
+
+TEST(CRC, Values) {
+  ASSERT_NE(Value("a", 1), Value("foo", 3));
+}
+
+TEST(CRC, Extend) {
+  ASSERT_EQ(Value("hello world", 11),
+            Extend(Value("hello ", 6), "world", 5));
+}
+
+TEST(CRC, Mask) {
+  uint32_t crc = Value("foo", 3);
+  ASSERT_NE(crc, Mask(crc));
+  ASSERT_NE(crc, Mask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
+}
+
+}
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/util/env.cc b/util/env.cc
new file mode 100644
index 0000000..e5297e7
--- /dev/null
+++ b/util/env.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/env.h"
+
+namespace leveldb {
+
+Env::~Env() {
+}
+
+SequentialFile::~SequentialFile() {
+}
+
+RandomAccessFile::~RandomAccessFile() {
+}
+
+WritableFile::~WritableFile() {
+}
+
+FileLock::~FileLock() {
+}
+
+void Log(Env* env, WritableFile* info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  env->Logv(info_log, format, ap);
+  va_end(ap);
+}
+
+Status WriteStringToFile(Env* env, const Slice& data,
+                         const std::string& fname) {
+  WritableFile* file;
+  Status s = env->NewWritableFile(fname, &file);
+  if (!s.ok()) {
+    return s;
+  }
+  s = file->Append(data);
+  if (s.ok()) {
+    s = file->Close();
+  }
+  delete file;  // Will auto-close if we did not close above
+  if (!s.ok()) {
+    env->DeleteFile(fname);
+  }
+  return s;
+}
+
+Status ReadFileToString(Env* env, const std::string& fname, std::string* data) {
+  data->clear();
+  SequentialFile* file;
+  Status s = env->NewSequentialFile(fname, &file);
+  if (!s.ok()) {
+    return s;
+  }
+  static const int kBufferSize = 8192;
+  char* space = new char[kBufferSize];
+  while (true) {
+    Slice fragment;
+    s = file->Read(kBufferSize, &fragment, space);
+    if (!s.ok()) {
+      break;
+    }
+    data->append(fragment.data(), fragment.size());
+    if (fragment.empty()) {
+      break;
+    }
+  }
+  delete[] space;
+  delete file;
+  return s;
+}
+
+EnvWrapper::~EnvWrapper() {
+}
+
+}
diff --git a/util/env_chromium.cc b/util/env_chromium.cc
new file mode 100644
index 0000000..7edc7a9
--- /dev/null
+++ b/util/env_chromium.cc
@@ -0,0 +1,603 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include +#include +#include +#include "base/at_exit.h" +#include "base/file_path.h" +#include "base/file_util.h" +#include "base/lazy_instance.h" +#include "base/memory/ref_counted.h" +#include "base/message_loop.h" +#include "base/platform_file.h" +#include "base/process_util.h" +#include "base/synchronization/lock.h" +#include "base/sys_info.h" +#include "base/task.h" +#include "base/threading/platform_thread.h" +#include "base/threading/thread.h" +#include "base/utf_string_conversions.h" +#include "leveldb/env.h" +#include "leveldb/slice.h" +#include "port/port.h" +#include "util/logging.h" + +#if defined(OS_WIN) +#include +#include "base/win/win_util.h" +#endif + +#if defined(OS_MACOSX) || defined(OS_WIN) +// The following are glibc-specific +extern "C" { +size_t fread_unlocked(void *ptr, size_t size, size_t n, FILE *file) { + return fread(ptr, size, n, file); +} + +size_t fwrite_unlocked(const void *ptr, size_t size, size_t n, FILE *file) { + return fwrite(ptr, size, n, file); +} + +int fflush_unlocked(FILE *file) { + return fflush(file); +} + +int fdatasync(int fildes) { +#if defined(OS_WIN) + return _commit(fildes); +#else + return fsync(fildes); +#endif +} +} +#endif + +namespace leveldb { + +namespace { + +class Thread; + +static const ::FilePath::CharType kLevelDBTestDirectoryPrefix[] + = FILE_PATH_LITERAL("leveldb-test-"); + +::FilePath CreateFilePath(const std::string& file_path) { +#if defined(OS_WIN) + return FilePath(UTF8ToUTF16(file_path)); +#else + return FilePath(file_path); +#endif +} + +std::string FilePathToString(const ::FilePath& file_path) { +#if defined(OS_WIN) + return UTF16ToUTF8(file_path.value()); +#else + return file_path.value(); +#endif +} + +// TODO(jorlow): This should be moved into Chromium's base. +const char* PlatformFileErrorString(const ::base::PlatformFileError& error) { + switch (error) { + case ::base::PLATFORM_FILE_ERROR_FAILED: + return "Opening file failed."; + case ::base::PLATFORM_FILE_ERROR_IN_USE: + return "File currently in use."; + case ::base::PLATFORM_FILE_ERROR_EXISTS: + return "File already exists."; + case ::base::PLATFORM_FILE_ERROR_NOT_FOUND: + return "File not found."; + case ::base::PLATFORM_FILE_ERROR_ACCESS_DENIED: + return "Access denied."; + case ::base::PLATFORM_FILE_ERROR_TOO_MANY_OPENED: + return "Too many files open."; + case ::base::PLATFORM_FILE_ERROR_NO_MEMORY: + return "Out of memory."; + case ::base::PLATFORM_FILE_ERROR_NO_SPACE: + return "No space left on drive."; + case ::base::PLATFORM_FILE_ERROR_NOT_A_DIRECTORY: + return "Not a directory."; + case ::base::PLATFORM_FILE_ERROR_INVALID_OPERATION: + return "Invalid operation."; + case ::base::PLATFORM_FILE_ERROR_SECURITY: + return "Security error."; + case ::base::PLATFORM_FILE_ERROR_ABORT: + return "File operation aborted."; + case ::base::PLATFORM_FILE_ERROR_NOT_A_FILE: + return "The supplied path was not a file."; + case ::base::PLATFORM_FILE_ERROR_NOT_EMPTY: + return "The file was not empty."; + } + NOTIMPLEMENTED(); + return "Unknown error."; +} + +class ChromiumSequentialFile: public SequentialFile { + private: + std::string filename_; + FILE* file_; + + public: + ChromiumSequentialFile(const std::string& fname, FILE* f) + : filename_(fname), file_(f) { } + virtual ~ChromiumSequentialFile() { fclose(file_); } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = fread_unlocked(scratch, 1, n, file_); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file 
+ } else { + // A partial read with an error: return a non-ok status + s = Status::IOError(filename_, strerror(errno)); + } + } + return s; + } +}; + +class ChromiumRandomAccessFile: public RandomAccessFile { + private: + std::string filename_; + ::base::PlatformFile file_; + + public: + ChromiumRandomAccessFile(const std::string& fname, ::base::PlatformFile file) + : filename_(fname), file_(file) { } + virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + int r = ::base::ReadPlatformFile(file_, offset, scratch, n); + *result = Slice(scratch, (r < 0) ? 0 : r); + if (r < 0) { + // An error: return a non-ok status + s = Status::IOError(filename_, "Could not preform read"); + } + return s; + } +}; + +class ChromiumWritableFile : public WritableFile { + private: + std::string filename_; + FILE* file_; + + public: + ChromiumWritableFile(const std::string& fname, FILE* f) + : filename_(fname), file_(f) { } + + ~ChromiumWritableFile() { + if (file_ != NULL) { + // Ignoring any potential errors + fclose(file_); + } + } + + virtual Status Append(const Slice& data) { + size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_); + Status result; + if (r != data.size()) { + result = Status::IOError(filename_, strerror(errno)); + } + return result; + } + + virtual Status Close() { + Status result; + if (fclose(file_) != 0) { + result = Status::IOError(filename_, strerror(errno)); + } + file_ = NULL; + return result; + } + + virtual Status Flush() { + Status result; + if (fflush_unlocked(file_) != 0) { + result = Status::IOError(filename_, strerror(errno)); + } + return result; + } + + virtual Status Sync() { + Status result; + if ((fflush_unlocked(file_) != 0) || + (fdatasync(fileno(file_)) != 0)) { + result = Status::IOError(filename_, strerror(errno)); + } + return result; + } +}; + +class ChromiumFileLock : public FileLock { + public: + ::base::PlatformFile file_; +}; + +class ChromiumEnv : public Env { + public: + ChromiumEnv(); + virtual ~ChromiumEnv() { + fprintf(stderr, "Destroying Env::Default()\n"); + exit(1); + } + + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result) { + FILE* f = fopen(fname.c_str(), "rb"); + if (f == NULL) { + *result = NULL; + return Status::IOError(fname, strerror(errno)); + } else { + *result = new ChromiumSequentialFile(fname, f); + return Status::OK(); + } + } + + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) { + int flags = ::base::PLATFORM_FILE_READ | ::base::PLATFORM_FILE_OPEN; + bool created; + ::base::PlatformFileError error_code; + ::base::PlatformFile file = ::base::CreatePlatformFile( + CreateFilePath(fname), flags, &created, &error_code); + if (error_code != ::base::PLATFORM_FILE_OK) { + *result = NULL; + return Status::IOError(fname, PlatformFileErrorString(error_code)); + } + *result = new ChromiumRandomAccessFile(fname, file); + return Status::OK(); + } + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) { + *result = NULL; + FILE* f = fopen(fname.c_str(), "wb"); + if (f == NULL) { + return Status::IOError(fname, strerror(errno)); + } else { + *result = new ChromiumWritableFile(fname, f); + return Status::OK(); + } + } + + virtual bool FileExists(const std::string& fname) { + return ::file_util::PathExists(CreateFilePath(fname)); + } + + virtual Status GetChildren(const std::string& dir, + std::vector* result) { + 
result->clear(); + ::file_util::FileEnumerator iter( + CreateFilePath(dir), false, ::file_util::FileEnumerator::FILES); + ::FilePath current = iter.Next(); + while (!current.empty()) { + result->push_back(FilePathToString(current.BaseName())); + current = iter.Next(); + } + // TODO(jorlow): Unfortunately, the FileEnumerator swallows errors, so + // we'll always return OK. Maybe manually check for error + // conditions like the file not existing? + return Status::OK(); + } + + virtual Status DeleteFile(const std::string& fname) { + Status result; + // TODO(jorlow): Should we assert this is a file? + if (!::file_util::Delete(CreateFilePath(fname), false)) { + result = Status::IOError(fname, "Could not delete file."); + } + return result; + }; + + virtual Status CreateDir(const std::string& name) { + Status result; + if (!::file_util::CreateDirectory(CreateFilePath(name))) { + result = Status::IOError(name, "Could not create directory."); + } + return result; + }; + + virtual Status DeleteDir(const std::string& name) { + Status result; + // TODO(jorlow): Should we assert this is a directory? + if (!::file_util::Delete(CreateFilePath(name), false)) { + result = Status::IOError(name, "Could not delete directory."); + } + return result; + }; + + virtual Status GetFileSize(const std::string& fname, uint64_t* size) { + Status s; + int64_t signed_size; + if (!::file_util::GetFileSize(CreateFilePath(fname), &signed_size)) { + *size = 0; + s = Status::IOError(fname, "Could not determine file size."); + } else { + *size = static_cast(signed_size); + } + return s; + } + + virtual Status RenameFile(const std::string& src, const std::string& dst) { + Status result; + if (!::file_util::ReplaceFile(CreateFilePath(src), CreateFilePath(dst))) { + result = Status::IOError(src, "Could not rename file."); + } + return result; + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) { + *lock = NULL; + Status result; + int flags = ::base::PLATFORM_FILE_OPEN_ALWAYS | + ::base::PLATFORM_FILE_READ | + ::base::PLATFORM_FILE_WRITE | + ::base::PLATFORM_FILE_EXCLUSIVE_READ | + ::base::PLATFORM_FILE_EXCLUSIVE_WRITE; + bool created; + ::base::PlatformFileError error_code; + ::base::PlatformFile file = ::base::CreatePlatformFile( + CreateFilePath(fname), flags, &created, &error_code); + if (error_code != ::base::PLATFORM_FILE_OK) { + result = Status::IOError(fname, PlatformFileErrorString(error_code)); + } else { + ChromiumFileLock* my_lock = new ChromiumFileLock; + my_lock->file_ = file; + *lock = my_lock; + } + return result; + } + + virtual Status UnlockFile(FileLock* lock) { + ChromiumFileLock* my_lock = reinterpret_cast(lock); + Status result; + if (!::base::ClosePlatformFile(my_lock->file_)) { + result = Status::IOError("Could not close lock file."); + } + delete my_lock; + return result; + } + + virtual void Schedule(void (*function)(void*), void* arg); + + virtual void StartThread(void (*function)(void* arg), void* arg); + + virtual std::string UserIdentifier() { +#if defined(OS_WIN) + std::wstring user_sid; + bool ret = ::base::win::GetUserSidString(&user_sid); + DCHECK(ret); + return UTF16ToUTF8(user_sid); +#else + char buf[100]; + snprintf(buf, sizeof(buf), "%d", int(geteuid())); + return buf; +#endif + } + + virtual Status GetTestDirectory(std::string* path) { + mu_.Acquire(); + if (test_directory_.empty()) { + if (!::file_util::CreateNewTempDirectory(kLevelDBTestDirectoryPrefix, + &test_directory_)) { + mu_.Release(); + return Status::IOError("Could not create temp directory."); + } + } + 
*path = FilePathToString(test_directory_); + mu_.Release(); + return Status::OK(); + } + + virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { + // TODO(jorlow): We may want to just use Chromium's built in logging. + + uint64_t thread_id = 0; + // Coppied from base/logging.cc. +#if defined(OS_WIN) + thread_id = GetCurrentThreadId(); +#elif defined(OS_MACOSX) + thread_id = mach_thread_self(); +#elif defined(OS_LINUX) + thread_id = syscall(__NR_gettid); +#elif defined(OS_FREEBSD) || defined(OS_NACL) + // TODO(BSD): find a better thread ID + pthread_t tid = pthread_self(); + memcpy(&thread_id, &tid, min(sizeof(r), sizeof(tid))); +#endif + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + ::base::Time::Exploded t; + ::base::Time::Now().LocalExplode(&t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.year, + t.month, + t.day_of_month, + t.hour, + t.minute, + t.second, + static_cast(t.millisecond) * 1000, + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + info_log->Append(Slice(base, p - base)); + info_log->Flush(); + if (base != buffer) { + delete[] base; + } + break; + } + } + + virtual int AppendLocalTimeToBuffer(char* buffer, size_t size) { + ::base::Time::Exploded t; + ::base::Time::Now().LocalExplode(&t); + return snprintf(buffer, size, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d", + t.year, + t.month, + t.day_of_month, + t.hour, + t.minute, + t.second, + static_cast(t.millisecond) * 1000); + } + + virtual uint64_t NowMicros() { + return ::base::TimeTicks::HighResNow().ToInternalValue(); + } + + virtual void SleepForMicroseconds(int micros) { + // Round up to the next millisecond. 
+ ::base::PlatformThread::Sleep((micros + 999) / 1000); + } + + private: + // BGThread() is the body of the background thread + void BGThread(); + static void BGThreadWrapper(void* arg) { + reinterpret_cast(arg)->BGThread(); + } + + FilePath test_directory_; + + size_t page_size_; + ::base::Lock mu_; + ::base::ConditionVariable bgsignal_; + bool started_bgthread_; + + // Entry per Schedule() call + struct BGItem { void* arg; void (*function)(void*); }; + typedef std::deque BGQueue; + BGQueue queue_; +}; + +ChromiumEnv::ChromiumEnv() + : page_size_(::base::SysInfo::VMAllocationGranularity()), + bgsignal_(&mu_), + started_bgthread_(false) { +#if defined(OS_MACOSX) + ::base::EnableTerminationOnHeapCorruption(); + ::base::EnableTerminationOnOutOfMemory(); +#endif // OS_MACOSX +} + +class Thread : public ::base::PlatformThread::Delegate { + public: + Thread(void (*function)(void* arg), void* arg) + : function_(function), arg_(arg) { + ::base::PlatformThreadHandle handle; + bool success = ::base::PlatformThread::Create(0, this, &handle); + DCHECK(success); + } + virtual ~Thread() {} + virtual void ThreadMain() { + (*function_)(arg_); + delete this; + } + + private: + void (*function_)(void* arg); + void* arg_; +}; + +void ChromiumEnv::Schedule(void (*function)(void*), void* arg) { + mu_.Acquire(); + + // Start background thread if necessary + if (!started_bgthread_) { + started_bgthread_ = true; + StartThread(&ChromiumEnv::BGThreadWrapper, this); + } + + // If the queue is currently empty, the background thread may currently be + // waiting. + if (queue_.empty()) { + bgsignal_.Signal(); + } + + // Add to priority queue + queue_.push_back(BGItem()); + queue_.back().function = function; + queue_.back().arg = arg; + + mu_.Release(); +} + +void ChromiumEnv::BGThread() { + while (true) { + // Wait until there is an item that is ready to run + mu_.Acquire(); + while (queue_.empty()) { + bgsignal_.Wait(); + } + + void (*function)(void*) = queue_.front().function; + void* arg = queue_.front().arg; + queue_.pop_front(); + + mu_.Release(); + (*function)(arg); + } +} + +void ChromiumEnv::StartThread(void (*function)(void* arg), void* arg) { + new Thread(function, arg); // Will self-delete. +} + +::base::LazyInstance > + default_env(::base::LINKER_INITIALIZED); + +} + +Env* Env::Default() { + return default_env.Pointer(); +} + +} diff --git a/util/env_posix.cc b/util/env_posix.cc new file mode 100644 index 0000000..5cddb0c --- /dev/null +++ b/util/env_posix.cc @@ -0,0 +1,599 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include <deque>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#if defined(LEVELDB_PLATFORM_ANDROID)
+#include <sys/stat.h>
+#endif
+#include "leveldb/env.h"
+#include "leveldb/slice.h"
+#include "port/port.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+namespace {
+
+class PosixSequentialFile: public SequentialFile {
+ private:
+  std::string filename_;
+  FILE* file_;
+
+ public:
+  PosixSequentialFile(const std::string& fname, FILE* f)
+      : filename_(fname), file_(f) { }
+  virtual ~PosixSequentialFile() { fclose(file_); }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s;
+    size_t r = fread_unlocked(scratch, 1, n, file_);
+    *result = Slice(scratch, r);
+    if (r < n) {
+      if (feof(file_)) {
+        // We leave status as ok if we hit the end of the file
+      } else {
+        // A partial read with an error: return a non-ok status
+        s = Status::IOError(filename_, strerror(errno));
+      }
+    }
+    return s;
+  }
+};
+
+class PosixRandomAccessFile: public RandomAccessFile {
+ private:
+  std::string filename_;
+  int fd_;
+
+ public:
+  PosixRandomAccessFile(const std::string& fname, int fd)
+      : filename_(fname), fd_(fd) { }
+  virtual ~PosixRandomAccessFile() { close(fd_); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
+    *result = Slice(scratch, (r < 0) ? 0 : r);
+    if (r < 0) {
+      // An error: return a non-ok status
+      s = Status::IOError(filename_, strerror(errno));
+    }
+    return s;
+  }
+};
+
+// We preallocate up to an extra megabyte and use memcpy to append new
+// data to the file.  This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
+class PosixMmapFile : public WritableFile {
+ private:
+  std::string filename_;
+  int fd_;
+  size_t page_size_;
+  size_t map_size_;       // How much extra memory to map at a time
+  char* base_;            // The mapped region
+  char* limit_;           // Limit of the mapped region
+  char* dst_;             // Where to write next (in range [base_,limit_])
+  char* last_sync_;       // Where have we synced up to
+  uint64_t file_offset_;  // Offset of base_ in file
+
+  // Have we done an munmap of unsynced data?
+ bool pending_sync_; + + // Roundup x to a multiple of y + static size_t Roundup(size_t x, size_t y) { + return ((x + y - 1) / y) * y; + } + + size_t TruncateToPageBoundary(size_t s) { + s -= (s & (page_size_ - 1)); + assert((s % page_size_) == 0); + return s; + } + + void UnmapCurrentRegion() { + if (base_ != NULL) { + if (last_sync_ < limit_) { + // Defer syncing this data until next Sync() call, if any + pending_sync_ = true; + } + munmap(base_, limit_ - base_); + file_offset_ += limit_ - base_; + base_ = NULL; + limit_ = NULL; + last_sync_ = NULL; + dst_ = NULL; + + // Increase the amount we map the next time, but capped at 1MB + if (map_size_ < (1<<20)) { + map_size_ *= 2; + } + } + } + + bool MapNewRegion() { + assert(base_ == NULL); + if (ftruncate(fd_, file_offset_ + map_size_) < 0) { + return false; + } + void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_, file_offset_); + if (ptr == MAP_FAILED) { + return false; + } + base_ = reinterpret_cast(ptr); + limit_ = base_ + map_size_; + dst_ = base_; + last_sync_ = base_; + return true; + } + + public: + PosixMmapFile(const std::string& fname, int fd, size_t page_size) + : filename_(fname), + fd_(fd), + page_size_(page_size), + map_size_(Roundup(65536, page_size)), + base_(NULL), + limit_(NULL), + dst_(NULL), + last_sync_(NULL), + file_offset_(0), + pending_sync_(false) { + assert((page_size & (page_size - 1)) == 0); + } + + + ~PosixMmapFile() { + if (fd_ >= 0) { + PosixMmapFile::Close(); + } + } + + virtual Status Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + while (left > 0) { + assert(base_ <= dst_); + assert(dst_ <= limit_); + size_t avail = limit_ - dst_; + if (avail == 0) { + UnmapCurrentRegion(); + MapNewRegion(); + } + + size_t n = (left <= avail) ? left : avail; + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + } + return Status::OK(); + } + + virtual Status Close() { + Status s; + size_t unused = limit_ - dst_; + UnmapCurrentRegion(); + if (unused > 0) { + // Trim the extra space at the end of the file + if (ftruncate(fd_, file_offset_ - unused) < 0) { + s = Status::IOError(filename_, strerror(errno)); + } + } + + if (close(fd_) < 0) { + if (s.ok()) { + s = Status::IOError(filename_, strerror(errno)); + } + } + + fd_ = -1; + base_ = NULL; + limit_ = NULL; + return s; + } + + virtual Status Flush() { + return Status::OK(); + } + + virtual Status Sync() { + Status s; + + if (pending_sync_) { + // Some unmapped data was not synced + pending_sync_ = false; + if (fdatasync(fd_) < 0) { + s = Status::IOError(filename_, strerror(errno)); + } + } + + if (dst_ > last_sync_) { + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. + size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + s = Status::IOError(filename_, strerror(errno)); + } + } + + return s; + } +}; + +static int LockOrUnlock(int fd, bool lock) { + errno = 0; + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (lock ? 
F_WRLCK : F_UNLCK); + f.l_whence = SEEK_SET; + f.l_start = 0; + f.l_len = 0; // Lock/unlock entire file + return fcntl(fd, F_SETLK, &f); +} + +class PosixFileLock : public FileLock { + public: + int fd_; +}; + +class PosixEnv : public Env { + public: + PosixEnv(); + virtual ~PosixEnv() { + fprintf(stderr, "Destroying Env::Default()\n"); + exit(1); + } + + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result) { + FILE* f = fopen(fname.c_str(), "r"); + if (f == NULL) { + *result = NULL; + return Status::IOError(fname, strerror(errno)); + } else { + *result = new PosixSequentialFile(fname, f); + return Status::OK(); + } + } + + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) { + int fd = open(fname.c_str(), O_RDONLY); + if (fd < 0) { + *result = NULL; + return Status::IOError(fname, strerror(errno)); + } + *result = new PosixRandomAccessFile(fname, fd); + return Status::OK(); + } + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) { + Status s; + const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + if (fd < 0) { + *result = NULL; + s = Status::IOError(fname, strerror(errno)); + } else { + *result = new PosixMmapFile(fname, fd, page_size_); + } + return s; + } + + virtual bool FileExists(const std::string& fname) { + return access(fname.c_str(), F_OK) == 0; + } + + virtual Status GetChildren(const std::string& dir, + std::vector* result) { + result->clear(); + DIR* d = opendir(dir.c_str()); + if (d == NULL) { + return Status::IOError(dir, strerror(errno)); + } + struct dirent* entry; + while ((entry = readdir(d)) != NULL) { + result->push_back(entry->d_name); + } + closedir(d); + return Status::OK(); + } + + virtual Status DeleteFile(const std::string& fname) { + Status result; + if (unlink(fname.c_str()) != 0) { + result = Status::IOError(fname, strerror(errno)); + } + return result; + }; + + virtual Status CreateDir(const std::string& name) { + Status result; + if (mkdir(name.c_str(), 0755) != 0) { + result = Status::IOError(name, strerror(errno)); + } + return result; + }; + + virtual Status DeleteDir(const std::string& name) { + Status result; + if (rmdir(name.c_str()) != 0) { + result = Status::IOError(name, strerror(errno)); + } + return result; + }; + + virtual Status GetFileSize(const std::string& fname, uint64_t* size) { + Status s; + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + *size = 0; + s = Status::IOError(fname, strerror(errno)); + } else { + *size = sbuf.st_size; + } + return s; + } + + virtual Status RenameFile(const std::string& src, const std::string& target) { + Status result; + if (rename(src.c_str(), target.c_str()) != 0) { + result = Status::IOError(src, strerror(errno)); + } + return result; + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) { + *lock = NULL; + Status result; + int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + if (fd < 0) { + result = Status::IOError(fname, strerror(errno)); + } else if (LockOrUnlock(fd, true) == -1) { + result = Status::IOError("lock " + fname, strerror(errno)); + close(fd); + } else { + PosixFileLock* my_lock = new PosixFileLock; + my_lock->fd_ = fd; + *lock = my_lock; + } + return result; + } + + virtual Status UnlockFile(FileLock* lock) { + PosixFileLock* my_lock = reinterpret_cast(lock); + Status result; + if (LockOrUnlock(my_lock->fd_, false) == -1) { + result = Status::IOError(strerror(errno)); + } + close(my_lock->fd_); + delete my_lock; + return result; + } 
+ + virtual void Schedule(void (*function)(void*), void* arg); + + virtual void StartThread(void (*function)(void* arg), void* arg); + + virtual Status GetTestDirectory(std::string* result) { + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + *result = env; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "/tmp/leveldbtest-%d", int(geteuid())); + *result = buf; + } + // Directory may already exist + CreateDir(*result); + return Status::OK(); + } + + virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { + pthread_t tid = pthread_self(); + uint64_t thread_id = 0; + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, NULL); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + info_log->Append(Slice(base, p - base)); + info_log->Flush(); + if (base != buffer) { + delete[] base; + } + break; + } + } + + virtual uint64_t NowMicros() { + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + virtual void SleepForMicroseconds(int micros) { + usleep(micros); + } + + private: + void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + exit(1); + } + } + + // BGThread() is the body of the background thread + void BGThread(); + static void* BGThreadWrapper(void* arg) { + reinterpret_cast(arg)->BGThread(); + return NULL; + } + + size_t page_size_; + pthread_mutex_t mu_; + pthread_cond_t bgsignal_; + pthread_t bgthread_; + bool started_bgthread_; + + // Entry per Schedule() call + struct BGItem { void* arg; void (*function)(void*); }; + typedef std::deque BGQueue; + BGQueue queue_; +}; + +PosixEnv::PosixEnv() : page_size_(getpagesize()), + started_bgthread_(false) { + PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); + PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); +} + +void PosixEnv::Schedule(void (*function)(void*), void* arg) { + PthreadCall("lock", pthread_mutex_lock(&mu_)); + + // Start background thread if necessary + if (!started_bgthread_) { + started_bgthread_ = true; + PthreadCall( + "create thread", + pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this)); + } + + // If the queue is currently empty, the background thread may currently be + // waiting. 
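+  // (Signaling while holding the mutex cannot be missed: the background
+  // thread only waits after rechecking, under this same mutex, that the
+  // queue is still empty.)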
+ if (queue_.empty()) { + PthreadCall("signal", pthread_cond_signal(&bgsignal_)); + } + + // Add to priority queue + queue_.push_back(BGItem()); + queue_.back().function = function; + queue_.back().arg = arg; + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void PosixEnv::BGThread() { + while (true) { + // Wait until there is an item that is ready to run + PthreadCall("lock", pthread_mutex_lock(&mu_)); + while (queue_.empty()) { + PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); + } + + void (*function)(void*) = queue_.front().function; + void* arg = queue_.front().arg; + queue_.pop_front(); + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + (*function)(arg); + } +} + +namespace { +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; +} +static void* StartThreadWrapper(void* arg) { + StartThreadState* state = reinterpret_cast<StartThreadState*>(arg); + state->user_function(state->arg); + delete state; + return NULL; +} + +void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { + pthread_t t; + StartThreadState* state = new StartThreadState; + state->user_function = function; + state->arg = arg; + PthreadCall("start thread", + pthread_create(&t, NULL, &StartThreadWrapper, state)); +} + +} + +static pthread_once_t once = PTHREAD_ONCE_INIT; +static Env* default_env; +static void InitDefaultEnv() { default_env = new PosixEnv; } + +Env* Env::Default() { + pthread_once(&once, InitDefaultEnv); + return default_env; +} + +} diff --git a/util/env_test.cc b/util/env_test.cc new file mode 100644 index 0000000..3c253be --- /dev/null +++ b/util/env_test.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
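// (Editorial note) The tests below exercise the PosixEnv defined above.
// Env::Default() builds a single PosixEnv via pthread_once and never
// destroys it -- hence the aborting ~PosixEnv() -- so env_ and
// Env::Default() refer to the same object, and one background thread
// services every Schedule() call in this binary.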
+ +#include "leveldb/env.h" + +#include "port/port.h" +#include "util/testharness.h" + +namespace leveldb { + +static const int kDelayMicros = 100000; + +class EnvPosixTest { + private: + port::Mutex mu_; + std::string events_; + + public: + Env* env_; + EnvPosixTest() : env_(Env::Default()) { } +}; + +static void SetBool(void* ptr) { + *(reinterpret_cast(ptr)) = true; +} + +TEST(EnvPosixTest, RunImmediately) { + bool called = false; + env_->Schedule(&SetBool, &called); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_TRUE(called); +} + +TEST(EnvPosixTest, RunMany) { + int last_id = 0; + + struct CB { + int* last_id_ptr; // Pointer to shared slot + int id; // Order# for the execution of this callback + + CB(int* p, int i) : last_id_ptr(p), id(i) { } + + static void Run(void* v) { + CB* cb = reinterpret_cast(v); + ASSERT_EQ(cb->id-1, *cb->last_id_ptr); + *cb->last_id_ptr = cb->id; + } + }; + + // Schedule in different order than start time + CB cb1(&last_id, 1); + CB cb2(&last_id, 2); + CB cb3(&last_id, 3); + CB cb4(&last_id, 4); + env_->Schedule(&CB::Run, &cb1); + env_->Schedule(&CB::Run, &cb2); + env_->Schedule(&CB::Run, &cb3); + env_->Schedule(&CB::Run, &cb4); + + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ(4, last_id); +} + +struct State { + port::Mutex mu; + int val; + int num_running; +}; + +static void ThreadBody(void* arg) { + State* s = reinterpret_cast(arg); + s->mu.Lock(); + s->val += 1; + s->num_running -= 1; + s->mu.Unlock(); +} + +TEST(EnvPosixTest, StartThread) { + State state; + state.val = 0; + state.num_running = 3; + for (int i = 0; i < 3; i++) { + env_->StartThread(&ThreadBody, &state); + } + while (true) { + state.mu.Lock(); + int num = state.num_running; + state.mu.Unlock(); + if (num == 0) { + break; + } + Env::Default()->SleepForMicroseconds(kDelayMicros); + } + ASSERT_EQ(state.val, 3); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/hash.cc b/util/hash.cc new file mode 100644 index 0000000..d19afd1 --- /dev/null +++ b/util/hash.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "util/coding.h" +#include "util/hash.h" + +namespace leveldb { + +uint32_t Hash(const char* data, size_t n, uint32_t seed) { + // Similar to murmur hash + const uint32_t m = 0xc6a4a793; + const uint32_t r = 24; + const char* limit = data + n; + uint32_t h = seed ^ (n * m); + + // Pick up four bytes at a time + while (data + 4 <= limit) { + uint32_t w = DecodeFixed32(data); + data += 4; + h += w; + h *= m; + h ^= (h >> 16); + } + + // Pick up remaining bytes + switch (limit - data) { + case 3: + h += data[2] << 16; + // fall through + case 2: + h += data[1] << 8; + // fall through + case 1: + h += data[0]; + h *= m; + h ^= (h >> r); + break; + } + return h; +} + + +} diff --git a/util/hash.h b/util/hash.h new file mode 100644 index 0000000..8889d56 --- /dev/null +++ b/util/hash.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// Simple hash function used for internal data structures + +#ifndef STORAGE_LEVELDB_UTIL_HASH_H_ +#define STORAGE_LEVELDB_UTIL_HASH_H_ + +#include +#include + +namespace leveldb { + +extern uint32_t Hash(const char* data, size_t n, uint32_t seed); + +} + +#endif // STORAGE_LEVELDB_UTIL_HASH_H_ diff --git a/util/histogram.cc b/util/histogram.cc new file mode 100644 index 0000000..c5178ef --- /dev/null +++ b/util/histogram.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include "port/port.h" +#include "util/histogram.h" + +namespace leveldb { + +const double Histogram::kBucketLimit[kNumBuckets] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, + 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, + 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, + 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, + 16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, + 70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000, + 250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000, + 900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, + 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000, + 9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000, + 25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000, + 70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000, + 180000000, 200000000, 250000000, 300000000, 350000000, 400000000, + 450000000, 500000000, 600000000, 700000000, 800000000, 900000000, + 1000000000, 1200000000, 1400000000, 1600000000, 1800000000, 2000000000, + 2500000000.0, 3000000000.0, 3500000000.0, 4000000000.0, 4500000000.0, + 5000000000.0, 6000000000.0, 7000000000.0, 8000000000.0, 9000000000.0, + 1e200, +}; + +void Histogram::Clear() { + min_ = kBucketLimit[kNumBuckets-1]; + max_ = 0; + num_ = 0; + sum_ = 0; + sum_squares_ = 0; + for (int i = 0; i < kNumBuckets; i++) { + buckets_[i] = 0; + } +} + +void Histogram::Add(double value) { + // Linear search is fast enough for our usage in db_bench + int b = 0; + while (b < kNumBuckets - 1 && kBucketLimit[b] <= value) { + b++; + } + buckets_[b] += 1.0; + if (min_ > value) min_ = value; + if (max_ < value) max_ = value; + num_++; + sum_ += value; + sum_squares_ += (value * value); +} + +double Histogram::Median() const { + return Percentile(50.0); +} + +double Histogram::Percentile(double p) const { + double threshold = num_ * (p / 100.0); + double sum = 0; + for (int b = 0; b < kNumBuckets; b++) { + sum += buckets_[b]; + if (sum >= threshold) { + // Scale linearly within this bucket + double left_point = (b == 0) ? 
0 : kBucketLimit[b-1]; + double right_point = kBucketLimit[b]; + double left_sum = sum - buckets_[b]; + double right_sum = sum; + double pos = (threshold - left_sum) / (right_sum - left_sum); + double r = left_point + (right_point - left_point) * pos; + if (r < min_) r = min_; + if (r > max_) r = max_; + return r; + } + } + return max_; +} + +double Histogram::Average() const { + if (num_ == 0.0) return 0; + return sum_ / num_; +} + +double Histogram::StandardDeviation() const { + if (num_ == 0.0) return 0; + double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_); + return sqrt(variance); +} + +std::string Histogram::ToString() const { + std::string r; + char buf[200]; + snprintf(buf, sizeof(buf), + "Count: %.0f Average: %.4f StdDev: %.2f\n", + num_, Average(), StandardDeviation()); + r.append(buf); + snprintf(buf, sizeof(buf), + "Min: %.4f Median: %.4f Max: %.4f\n", + (num_ == 0.0 ? 0.0 : min_), Median(), max_); + r.append(buf); + r.append("------------------------------------------------------\n"); + const double mult = 100.0 / num_; + double sum = 0; + for (int b = 0; b < kNumBuckets; b++) { + if (buckets_[b] <= 0.0) continue; + sum += buckets_[b]; + snprintf(buf, sizeof(buf), + "[ %7.0f, %7.0f ) %7.0f %7.3f%% %7.3f%% ", + ((b == 0) ? 0.0 : kBucketLimit[b-1]), // left + kBucketLimit[b], // right + buckets_[b], // count + mult * buckets_[b], // percentage + mult * sum); // cumulative percentage + r.append(buf); + + // Add hash marks based on percentage; 20 marks for 100%. + int marks = static_cast(20*(buckets_[b] / num_) + 0.5); + r.append(marks, '#'); + r.push_back('\n'); + } + return r; +} + +} diff --git a/util/histogram.h b/util/histogram.h new file mode 100644 index 0000000..f72f122 --- /dev/null +++ b/util/histogram.h @@ -0,0 +1,41 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ +#define STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ + +#include + +namespace leveldb { + +class Histogram { + public: + Histogram() { } + ~Histogram() { } + + void Clear(); + void Add(double value); + + std::string ToString() const; + + private: + double min_; + double max_; + double num_; + double sum_; + double sum_squares_; + + enum { kNumBuckets = 154 }; + static const double kBucketLimit[kNumBuckets]; + double buckets_[kNumBuckets]; + + double Median() const; + double Percentile(double p) const; + double Average() const; + double StandardDeviation() const; +}; + +} + +#endif // STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ diff --git a/util/logging.cc b/util/logging.cc new file mode 100644 index 0000000..5c9bd4a --- /dev/null +++ b/util/logging.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
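// (Editorial note on util/histogram.cc above) Percentile() walks cumulative
// bucket counts and interpolates linearly inside the first bucket whose
// running sum reaches p% of num_, then clamps the result to [min_, max_].
// StandardDeviation() uses the E[x^2] - E[x]^2 form; floating-point
// cancellation can make that variance fractionally negative, so a hardened
// version would clamp it to zero before calling sqrt().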
+ +#include "util/logging.h" + +#include +#include +#include +#include +#include "leveldb/env.h" +#include "leveldb/slice.h" + +namespace leveldb { + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (int i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToString(uint64_t num) { + std::string r; + AppendNumberTo(&r, num); + return r; +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeChar(Slice* in, char c) { + if (!in->empty() && (*in)[0] == c) { + in->remove_prefix(1); + return true; + } else { + return false; + } +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast(0); + if (v > kMaxUint64/10 || + (v == kMaxUint64/10 && delta > kMaxUint64%10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +} diff --git a/util/logging.h b/util/logging.h new file mode 100644 index 0000000..1cd0a4b --- /dev/null +++ b/util/logging.h @@ -0,0 +1,47 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Must not be included from any .h files to avoid polluting the namespace +// with macros. + +#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_ +#define STORAGE_LEVELDB_UTIL_LOGGING_H_ + +#include +#include +#include +#include "port/port.h" + +namespace leveldb { + +class Slice; +class WritableFile; + +// Append a human-readable printout of "num" to *str +extern void AppendNumberTo(std::string* str, uint64_t num); + +// Append a human-readable printout of "value" to *str. +// Escapes any non-printable characters found in "value". +extern void AppendEscapedStringTo(std::string* str, const Slice& value); + +// Return a human-readable printout of "num" +extern std::string NumberToString(uint64_t num); + +// Return a human-readable version of "value". +// Escapes any non-printable characters found in "value". +extern std::string EscapeString(const Slice& value); + +// If *in starts with "c", advances *in past the first character and +// returns true. Otherwise, returns false. +extern bool ConsumeChar(Slice* in, char c); + +// Parse a human-readable number from "*in" into *value. On success, +// advances "*in" past the consumed number and sets "*val" to the +// numeric value. Otherwise, returns false and leaves *in in an +// unspecified state. +extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); + +} + +#endif // STORAGE_LEVELDB_UTIL_LOGGING_H_ diff --git a/util/mutexlock.h b/util/mutexlock.h new file mode 100644 index 0000000..05fe279 --- /dev/null +++ b/util/mutexlock.h @@ -0,0 +1,39 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ +#define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ + +#include "port/port.h" + +namespace leveldb { + +// Helper class that locks a mutex on construction and unlocks the mutex when +// the destructor of the MutexLock object is invoked. +// +// Typical usage: +// +// void MyClass::MyMethod() { +// MutexLock l(&mu_); // mu_ is an instance variable +// ... some complex code, possibly with multiple return paths ... +// } + +class MutexLock { + public: + explicit MutexLock(port::Mutex *mu) : mu_(mu) { + this->mu_->Lock(); + } + ~MutexLock() { this->mu_->Unlock(); } + + private: + port::Mutex *const mu_; + // No copying allowed + MutexLock(const MutexLock&); + void operator=(const MutexLock&); +}; + +} + + +#endif // STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ diff --git a/util/options.cc b/util/options.cc new file mode 100644 index 0000000..29272fe --- /dev/null +++ b/util/options.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/options.h" + +#include "leveldb/comparator.h" +#include "leveldb/env.h" + +namespace leveldb { + +Options::Options() + : comparator(BytewiseComparator()), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(false), + env(Env::Default()), + info_log(NULL), + write_buffer_size(4<<20), + max_open_files(1000), + large_value_threshold(65536), + block_cache(NULL), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression) { +} + + +} diff --git a/util/random.h b/util/random.h new file mode 100644 index 0000000..2d458e8 --- /dev/null +++ b/util/random.h @@ -0,0 +1,59 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_ +#define STORAGE_LEVELDB_UTIL_RANDOM_H_ + +#include + +namespace leveldb { + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. +class Random { + private: + uint32_t seed_; + public: + explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } + uint32_t Next() { + static const uint32_t M = 2147483647L; // 2^31-1 + static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = (product >> 31) + (product & M); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > M) { + seed_ -= M; + } + return seed_; + } + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(int n) { return Next() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. 
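  // (Editorial note: like Uniform(), this relies on Next() % n, which is
  // slightly biased unless n evenly divides 2^31 - 2, the number of
  // distinct values Next() can produce; the bias is negligible for tests.)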
+ // REQUIRES: n > 0 + bool OneIn(int n) { return (Next() % n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } +}; + +} + +#endif // STORAGE_LEVELDB_UTIL_RANDOM_H_ diff --git a/util/status.cc b/util/status.cc new file mode 100644 index 0000000..d9b7195 --- /dev/null +++ b/util/status.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "port/port.h" +#include "leveldb/status.h" + +namespace leveldb { + +Status::Status(Code code, const Slice& msg, const Slice& msg2) { + assert(code != kOk); + state_ = new State(make_pair(code, std::string(msg.data(), msg.size()))); + if (!msg2.empty()) { + state_->second.append(": "); + state_->second.append(msg2.data(), msg2.size()); + } +} + +std::string Status::ToString() const { + if (state_ == NULL) { + return "OK"; + } else { + char tmp[30]; + const char* type; + switch (state_->first) { + case kOk: + type = "OK"; + break; + case kNotFound: + type = "NotFound"; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast(state_->first)); + type = tmp; + break; + } + std::string result(type); + if (!state_->second.empty()) { + result.append(state_->second); + } + return result; + } +} + +} diff --git a/util/testharness.cc b/util/testharness.cc new file mode 100644 index 0000000..b686ac3 --- /dev/null +++ b/util/testharness.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/testharness.h" + +#include +#include + +namespace leveldb { +namespace test { + +namespace { +struct Test { + const char* base; + const char* name; + void (*func)(); +}; +std::vector* tests; +} + +bool RegisterTest(const char* base, const char* name, void (*func)()) { + if (tests == NULL) { + tests = new std::vector; + } + Test t; + t.base = base; + t.name = name; + t.func = func; + tests->push_back(t); + return true; +} + +int RunAllTests() { + int num = 0; + if (tests != NULL) { + for (int i = 0; i < tests->size(); i++) { + const Test& t = (*tests)[i]; + fprintf(stderr, "==== Test %s.%s\n", t.base, t.name); + (*t.func)(); + ++num; + } + } + fprintf(stderr, "==== PASSED %d tests\n", num); + return 0; +} + +std::string TmpDir() { + std::string dir; + Status s = Env::Default()->GetTestDirectory(&dir); + ASSERT_TRUE(s.ok()) << s.ToString(); + return dir; +} + +int RandomSeed() { + const char* env = getenv("TEST_RANDOM_SEED"); + int result = (env != NULL ? atoi(env) : 301); + if (result <= 0) { + result = 301; + } + return result; +} + +} +} diff --git a/util/testharness.h b/util/testharness.h new file mode 100644 index 0000000..13ab914 --- /dev/null +++ b/util/testharness.h @@ -0,0 +1,129 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ +#define STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ + +#include +#include +#include +#include "leveldb/env.h" +#include "leveldb/slice.h" +#include "util/random.h" + +namespace leveldb { +namespace test { + +// Run all tests registered by the TEST() macro. +// Returns 0 if all tests pass. +// Dies or returns a non-zero value if some test fails. +extern int RunAllTests(); + +// Return the directory to use for temporary storage. +extern std::string TmpDir(); + +// Return a randomization seed for this run. Typically returns the +// same number on repeated invocations of this binary, but automated +// runs may be able to vary the seed. +extern int RandomSeed(); + +// An instance of Tester is allocated to hold temporary state during +// the execution of an assertion. +class Tester { + private: + bool ok_; + const char* fname_; + int line_; + std::stringstream ss_; + + public: + Tester(const char* f, int l) + : ok_(true), fname_(f), line_(l) { + } + + ~Tester() { + if (!ok_) { + fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); + exit(1); + } + } + + Tester& Is(bool b, const char* msg) { + if (!b) { + ss_ << " Assertion failure " << msg; + ok_ = false; + } + return *this; + } + + Tester& IsOk(const Status& s) { + if (!s.ok()) { + ss_ << " " << s.ToString(); + ok_ = false; + } + return *this; + } + +#define BINARY_OP(name,op) \ + template \ + Tester& name(const X& x, const Y& y) { \ + if (! (x op y)) { \ + ss_ << " failed: " << x << (" " #op " ") << y; \ + ok_ = false; \ + } \ + return *this; \ + } + + BINARY_OP(IsEq, ==) + BINARY_OP(IsNe, !=) + BINARY_OP(IsGe, >=) + BINARY_OP(IsGt, >) + BINARY_OP(IsLe, <=) + BINARY_OP(IsLt, <) +#undef BINARY_OP + + // Attach the specified value to the error message if an error has occurred + template + Tester& operator<<(const V& value) { + if (!ok_) { + ss_ << " " << value; + } + return *this; + } +}; + +#define ASSERT_TRUE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is((c), #c) +#define ASSERT_OK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsOk((s)) +#define ASSERT_EQ(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) +#define ASSERT_NE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) +#define ASSERT_GE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) +#define ASSERT_GT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGt((a),(b)) +#define ASSERT_LE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLe((a),(b)) +#define ASSERT_LT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLt((a),(b)) + +#define TCONCAT(a,b) TCONCAT1(a,b) +#define TCONCAT1(a,b) a##b + +#define TEST(base,name) \ +class TCONCAT(_Test_,name) : public base { \ + public: \ + void _Run(); \ + static void _RunIt() { \ + TCONCAT(_Test_,name) t; \ + t._Run(); \ + } \ +}; \ +bool TCONCAT(_Test_ignored_,name) = \ + ::leveldb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \ +void TCONCAT(_Test_,name)::_Run() + +// Register the specified test. Typically not used directly, but +// invoked via the macro expansion of TEST. 
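// (Editorial note) Registration happens through static initialization: each
// TEST(base,name) expands to a subclass of "base" plus a global bool whose
// initializer calls RegisterTest() before main() runs, so RunAllTests()
// sees every test without a hand-maintained list. Registration order across
// translation units is unspecified, so tests must not rely on it.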
+extern bool RegisterTest(const char* base, const char* name, void (*func)()); + + +} +} + +#endif // STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ diff --git a/util/testutil.cc b/util/testutil.cc new file mode 100644 index 0000000..8d6cf3c --- /dev/null +++ b/util/testutil.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/testutil.h" + +#include "util/random.h" + +namespace leveldb { +namespace test { + +Slice RandomString(Random* rnd, int len, std::string* dst) { + dst->resize(len); + for (int i = 0; i < len; i++) { + (*dst)[i] = static_cast(' ' + rnd->Uniform(95)); // ' ' .. '~' + } + return Slice(*dst); +} + +std::string RandomKey(Random* rnd, int len) { + // Make sure to generate a wide variety of characters so we + // test the boundary conditions for short-key optimizations. + static const char kTestChars[] = { + '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff' + }; + std::string result; + for (int i = 0; i < len; i++) { + result += kTestChars[rnd->Uniform(sizeof(kTestChars))]; + } + return result; +} + + +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst) { + int raw = static_cast(len * compressed_fraction); + if (raw < 1) raw = 1; + std::string raw_data; + RandomString(rnd, raw, &raw_data); + + // Duplicate the random data until we have filled "len" bytes + dst->clear(); + while (dst->size() < len) { + dst->append(raw_data); + } + dst->resize(len); + return Slice(*dst); +} + +} +} diff --git a/util/testutil.h b/util/testutil.h new file mode 100644 index 0000000..a150c1a --- /dev/null +++ b/util/testutil.h @@ -0,0 +1,53 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_TESTUTIL_H_ +#define STORAGE_LEVELDB_UTIL_TESTUTIL_H_ + +#include "leveldb/env.h" +#include "leveldb/slice.h" +#include "util/random.h" + +namespace leveldb { +namespace test { + +// Store in *dst a random string of length "len" and return a Slice that +// references the generated data. +extern Slice RandomString(Random* rnd, int len, std::string* dst); + +// Return a random key with the specified length that may contain interesting +// characters (e.g. \x00, \xff, etc.). +extern std::string RandomKey(Random* rnd, int len); + +// Store in *dst a string of length "len" that will compress to +// "N*compressed_fraction" bytes and return a Slice that references +// the generated data. +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst); + +// A wrapper that allows injection of errors. 
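// (Editorial note) ErrorEnv below overrides only NewWritableFile(); every
// other call is forwarded to the wrapped Env::Default() through EnvWrapper,
// so a test can set writable_file_error_ to make the next log or table file
// creation fail while reads continue to work.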
+class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + + ErrorEnv() : EnvWrapper(Env::Default()), + writable_file_error_(false), + num_writable_file_errors_(0) { } + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result) { + if (writable_file_error_) { + ++num_writable_file_errors_; + *result = NULL; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result); + } +}; + +} +} + +#endif // STORAGE_LEVELDB_UTIL_TESTUTIL_H_ -- cgit v1.2.3 From 1511be6edb54b6ade2bfad94256f76bc191e92ec Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Wed, 20 Apr 2011 22:48:11 +0000 Subject: @20776309 * env_chromium.cc should not export symbols. * Fix MSVC warnings. * Removed large value support. * Fix broken reference to documentation file git-svn-id: http://leveldb.googlecode.com/svn/trunk@24 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 5 - README | 4 +- TODO | 4 +- db/builder.cc | 9 -- db/builder.h | 6 +- db/corruption_test.cc | 26 +---- db/db_bench.cc | 22 ---- db/db_impl.cc | 217 ++++++--------------------------------- db/db_impl.h | 23 ----- db/db_iter.cc | 101 +------------------ db/db_test.cc | 251 +++++++--------------------------------------- db/dbformat.cc | 65 ------------ db/dbformat.h | 53 +--------- db/dbformat_test.cc | 15 --- db/filename.cc | 19 ---- db/filename.h | 16 +-- db/filename_test.cc | 76 ++++---------- db/log_writer.cc | 4 +- db/repair.cc | 40 +++----- db/version_edit.cc | 43 +------- db/version_edit.h | 18 ---- db/version_edit_test.cc | 6 +- db/version_set.cc | 129 ++++-------------------- db/version_set.h | 28 +----- db/write_batch.cc | 16 --- db/write_batch_internal.h | 4 - db/write_batch_test.cc | 23 ----- doc/impl.html | 13 +-- doc/index.html | 11 -- include/leveldb/options.h | 12 +-- leveldb.gyp | 12 --- port/port_android.h | 8 -- port/port_chromium.h | 7 -- port/port_example.h | 5 - port/port_posix.h | 5 - table/block.cc | 4 +- table/block_builder.cc | 2 +- table/format.cc | 4 +- util/arena.cc | 2 +- util/coding.cc | 2 +- util/comparator.cc | 2 +- util/logging.cc | 2 +- util/options.cc | 1 - util/random.h | 2 +- 44 files changed, 152 insertions(+), 1165 deletions(-) diff --git a/Makefile b/Makefile index 7569701..43ac23d 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,6 @@ LIBOBJECTS = \ ./db/version_set.o \ ./db/write_batch.o \ ./port/port_posix.o \ - ./port/sha1_portable.o \ ./table/block.o \ ./table/block_builder.o \ ./table/format.o \ @@ -63,7 +62,6 @@ TESTS = \ env_test \ filename_test \ log_test \ - sha1_test \ skiplist_test \ table_test \ version_edit_test \ @@ -115,9 +113,6 @@ log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ -sha1_test: port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ - skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ diff --git a/README b/README index c97e43c..3618ade 100644 --- a/README +++ b/README @@ -2,10 +2,10 @@ leveldb: A key-value store Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) The code under this directory implements a system for maintaining a -persistent key/value store. +persistent key/value store. See doc/index.html for more explanation. 
-See doc/db_layout.txt for a brief overview of the implementation. +See doc/impl.html for a brief overview of the implementation. The public interface is in include/*.h. Callers should not include or rely on the details of any other header files in this package. Those diff --git a/TODO b/TODO index 2f848b8..ce81439 100644 --- a/TODO +++ b/TODO @@ -8,7 +8,7 @@ db object stores, etc. can be done in the background anyway, so probably not that important. -api changes? -- Efficient large value reading and writing +api changes: +- Make it wrappable Faster Get implementation diff --git a/db/builder.cc b/db/builder.cc index 6c8e6b8..9f132d7 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -38,15 +38,6 @@ Status BuildTable(const std::string& dbname, for (; iter->Valid(); iter->Next()) { Slice key = iter->key(); meta->largest.DecodeFrom(key); - if (ExtractValueType(key) == kTypeLargeValueRef) { - if (iter->value().size() != LargeValueRef::ByteSize()) { - s = Status::Corruption("invalid indirect reference hash value (L0)"); - break; - } - edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()), - meta->number, - iter->key()); - } builder->Add(key, iter->value()); } diff --git a/db/builder.h b/db/builder.h index 4efcb04..5dd17b6 100644 --- a/db/builder.h +++ b/db/builder.h @@ -20,9 +20,9 @@ class VersionEdit; // Build a Table file from the contents of *iter. The generated file // will be named according to meta->number. On success, the rest of // *meta will be filled with metadata about the generated table, and -// large value refs and the added file information will be added to -// *edit. If no data is present in *iter, meta->file_size will be set -// to zero, and no Table file will be produced. +// the file information will be added to *edit. If no data is present +// in *iter, meta->file_size will be set to zero, and no Table file +// will be produced. 
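// (Editorial note) With large-value support removed, the only bookkeeping
// left in BuildTable is the added-file record in *edit plus the metadata in
// *meta; the per-key large-value registration deleted above is gone, which
// is why the copy loop shrank to DecodeFrom() followed by Add().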
extern Status BuildTable(const std::string& dbname, Env* env, const Options& options, diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 63d8d8b..12d176e 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -121,11 +121,10 @@ class CorruptionTest { std::vector filenames; ASSERT_OK(env_.GetChildren(dbname_, &filenames)); uint64_t number; - LargeValueRef large_ref; FileType type; std::vector candidates; for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &large_ref, &type) && + if (ParseFileName(filenames[i], &number, &type) && type == filetype) { candidates.push_back(dbname_ + "/" + filenames[i]); } @@ -276,29 +275,6 @@ TEST(CorruptionTest, SequenceNumberRecovery) { ASSERT_EQ("v6", v); } -TEST(CorruptionTest, LargeValueRecovery) { - Options options; - options.large_value_threshold = 10000; - Reopen(&options); - - Random rnd(301); - std::string big; - ASSERT_OK(db_->Put(WriteOptions(), - "foo", test::RandomString(&rnd, 100000, &big))); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ(big, v); - - RepairDB(); - Reopen(); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ(big, v); - - Reopen(); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ(big, v); -} - TEST(CorruptionTest, CorruptedDescriptor) { ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); DBImpl* dbi = reinterpret_cast(db_); diff --git a/db/db_bench.cc b/db/db_bench.cc index 849ebfa..d1cbdc0 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -28,7 +28,6 @@ // readreverse -- read N values in reverse order // readrandom -- read N values in random order // crc32c -- repeated crc32c of 4K of data -// sha1 -- repeated SHA1 computation over 4K of data // Meta operations: // compact -- Compact the entire DB // stats -- Print DB stats @@ -48,7 +47,6 @@ static const char* FLAGS_benchmarks = "readreverse," "fill100K," "crc32c," - "sha1," "snappycomp," "snappyuncomp," ; @@ -366,8 +364,6 @@ class Benchmark { Compact(); } else if (name == Slice("crc32c")) { Crc32c(4096, "(4K per op)"); - } else if (name == Slice("sha1")) { - SHA1(4096, "(4K per op)"); } else if (name == Slice("snappycomp")) { SnappyCompress(); } else if (name == Slice("snappyuncomp")) { @@ -406,24 +402,6 @@ class Benchmark { message_ = label; } - void SHA1(int size, const char* label) { - // SHA1 about 100MB of data total - std::string data(size, 'x'); - int64_t bytes = 0; - char sha1[20]; - while (bytes < 100 * 1048576) { - port::SHA1_Hash(data.data(), size, sha1); - FinishedSingleOp(); - bytes += size; - } - - // Print so result is not dead - fprintf(stderr, "... 
sha1=%02x...\r", static_cast(sha1[0])); - - bytes_ = bytes; - message_ = label; - } - void SnappyCompress() { Slice input = gen_.Generate(Options().block_size); int64_t bytes = 0; diff --git a/db/db_impl.cc b/db/db_impl.cc index d012236..3b9e04e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -81,8 +81,8 @@ class NullWritableFile : public WritableFile { // Fix user-supplied options to be reasonable template static void ClipToRange(T* ptr, V minvalue, V maxvalue) { - if (*ptr > maxvalue) *ptr = maxvalue; - if (*ptr < minvalue) *ptr = minvalue; + if (static_cast(*ptr) > maxvalue) *ptr = maxvalue; + if (static_cast(*ptr) < minvalue) *ptr = minvalue; } Options SanitizeOptions(const std::string& dbname, const InternalKeyComparator* icmp, @@ -91,7 +91,6 @@ Options SanitizeOptions(const std::string& dbname, result.comparator = icmp; ClipToRange(&result.max_open_files, 20, 50000); ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); - ClipToRange(&result.large_value_threshold, 16<<10, 1<<30); ClipToRange(&result.block_size, 1<<10, 4<<20); if (result.info_log == NULL) { // Open a log file in the same directory as the db @@ -213,15 +212,12 @@ void DBImpl::DeleteObsoleteFiles() { std::set live = pending_outputs_; versions_->AddLiveFiles(&live); - versions_->CleanupLargeValueRefs(live); - std::vector filenames; env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose uint64_t number; - LargeValueRef large_ref; FileType type; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &large_ref, &type)) { + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { bool keep = true; switch (type) { case kLogFile: @@ -241,9 +237,6 @@ void DBImpl::DeleteObsoleteFiles() { // be recorded in pending_outputs_, which is inserted into "live" keep = (live.find(number) != live.end()); break; - case kLargeValueFile: - keep = versions_->LargeValueIsLive(large_ref); - break; case kCurrentFile: case kDBLockFile: case kInfoLogFile: @@ -599,7 +592,7 @@ void DBImpl::CleanupCompaction(CompactionState* compact) { assert(compact->outfile == NULL); } delete compact->outfile; - for (int i = 0; i < compact->outputs.size(); i++) { + for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; pending_outputs_.erase(out.number); } @@ -695,7 +688,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { // Add compaction outputs compact->compaction->AddInputDeletions(compact->compaction->edit()); const int level = compact->compaction->level(); - for (int i = 0; i < compact->outputs.size(); i++) { + for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; compact->compaction->edit()->AddFile( level + 1, @@ -710,7 +703,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { DeleteObsoleteFiles(); } else { // Discard any files we may have created during this failed compaction - for (int i = 0; i < compact->outputs.size(); i++) { + for (size_t i = 0; i < compact->outputs.size(); i++) { env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number)); } } @@ -811,7 +804,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " "%d smallest_snapshot: %d", ikey.user_key.ToString().c_str(), - (int)ikey.sequence, ikey.type, kTypeLargeValueRef, drop, + (int)ikey.sequence, ikey.type, kTypeValue, drop, 
compact->compaction->IsBaseLevelForKey(ikey.user_key), (int)last_sequence_for_key, (int)compact->smallest_snapshot); #endif @@ -828,26 +821,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { compact->current_output()->smallest.DecodeFrom(key); } compact->current_output()->largest.DecodeFrom(key); - - if (ikey.type == kTypeLargeValueRef) { - if (input->value().size() != LargeValueRef::ByteSize()) { - if (options_.paranoid_checks) { - status = Status::Corruption("invalid large value ref"); - break; - } else { - Log(env_, options_.info_log, - "compaction found invalid large value ref"); - } - } else { - compact->compaction->edit()->AddLargeValueRef( - LargeValueRef::FromRef(input->value()), - compact->current_output()->number, - input->key()); - compact->builder->Add(key, input->value()); - } - } else { - compact->builder->Add(key, input->value()); - } + compact->builder->Add(key, input->value()); // Close output file if it is big enough if (compact->builder->FileSize() >= @@ -881,7 +855,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { stats.bytes_read += compact->compaction->input(which, i)->file_size; } } - for (int i = 0; i < compact->outputs.size(); i++) { + for (size_t i = 0; i < compact->outputs.size(); i++) { stats.bytes_written += compact->outputs[i].file_size; } @@ -985,40 +959,27 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { Status status; - - WriteBatch* final = NULL; - { - MutexLock l(&mutex_); - status = MakeRoomForWrite(false); // May temporarily release lock and wait - - uint64_t last_sequence = versions_->LastSequence(); - if (status.ok()) { - status = HandleLargeValues(last_sequence + 1, updates, &final); + MutexLock l(&mutex_); + status = MakeRoomForWrite(false); // May temporarily release lock and wait + uint64_t last_sequence = versions_->LastSequence(); + if (status.ok()) { + WriteBatchInternal::SetSequence(updates, last_sequence + 1); + last_sequence += WriteBatchInternal::Count(updates); + versions_->SetLastSequence(last_sequence); + + // Add to log and apply to memtable + status = log_->AddRecord(WriteBatchInternal::Contents(updates)); + if (status.ok() && options.sync) { + status = logfile_->Sync(); } if (status.ok()) { - WriteBatchInternal::SetSequence(final, last_sequence + 1); - last_sequence += WriteBatchInternal::Count(final); - versions_->SetLastSequence(last_sequence); - - // Add to log and apply to memtable - status = log_->AddRecord(WriteBatchInternal::Contents(final)); - if (status.ok() && options.sync) { - status = logfile_->Sync(); - } - if (status.ok()) { - status = WriteBatchInternal::InsertInto(final, mem_); - } - } - - if (options.post_write_snapshot != NULL) { - *options.post_write_snapshot = - status.ok() ? snapshots_.New(last_sequence) : NULL; + status = WriteBatchInternal::InsertInto(updates, mem_); } } - if (final != updates) { - delete final; + if (options.post_write_snapshot != NULL) { + *options.post_write_snapshot = + status.ok() ? 
snapshots_.New(last_sequence) : NULL; } - return status; } @@ -1070,124 +1031,6 @@ Status DBImpl::MakeRoomForWrite(bool force) { return s; } -bool DBImpl::HasLargeValues(const WriteBatch& batch) const { - if (WriteBatchInternal::ByteSize(&batch) >= options_.large_value_threshold) { - for (WriteBatchInternal::Iterator it(batch); !it.Done(); it.Next()) { - if (it.op() == kTypeValue && - it.value().size() >= options_.large_value_threshold) { - return true; - } - } - } - return false; -} - -// Given "raw_value", determines the appropriate compression format to use -// and stores the data that should be written to the large value file in -// "*file_bytes", and sets "*ref" to the appropriate large value reference. -// May use "*scratch" as backing store for "*file_bytes". -void DBImpl::MaybeCompressLargeValue( - const Slice& raw_value, - Slice* file_bytes, - std::string* scratch, - LargeValueRef* ref) { - switch (options_.compression) { - case kSnappyCompression: { - if (port::Snappy_Compress(raw_value.data(), raw_value.size(), scratch) && - (scratch->size() < (raw_value.size() / 8) * 7)) { - *file_bytes = *scratch; - *ref = LargeValueRef::Make(raw_value, kSnappyCompression); - return; - } - - // Less than 12.5% compression: just leave as uncompressed data - break; - } - case kNoCompression: - // Use default code outside of switch - break; - } - // Store as uncompressed data - *file_bytes = raw_value; - *ref = LargeValueRef::Make(raw_value, kNoCompression); -} - -Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq, - WriteBatch* updates, - WriteBatch** final) { - if (!HasLargeValues(*updates)) { - // Fast path: no large values found - *final = updates; - } else { - // Copy *updates to a new WriteBatch, replacing the references to - *final = new WriteBatch; - SequenceNumber seq = assigned_seq; - for (WriteBatchInternal::Iterator it(*updates); !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeValue: - if (it.value().size() < options_.large_value_threshold) { - (*final)->Put(it.key(), it.value()); - } else { - std::string scratch; - Slice file_bytes; - LargeValueRef large_ref; - MaybeCompressLargeValue( - it.value(), &file_bytes, &scratch, &large_ref); - InternalKey ikey(it.key(), seq, kTypeLargeValueRef); - if (versions_->RegisterLargeValueRef( - large_ref, versions_->LogNumber(), ikey)) { - // TODO(opt): avoid holding the lock here (but be careful about - // another thread doing a Write and switching logs or - // having us get a different "assigned_seq" value). 
- - uint64_t tmp_number = versions_->NewFileNumber(); - pending_outputs_.insert(tmp_number); - std::string tmp = TempFileName(dbname_, tmp_number); - WritableFile* file; - Status s = env_->NewWritableFile(tmp, &file); - if (!s.ok()) { - return s; // Caller will delete *final - } - - file->Append(file_bytes); - - s = file->Close(); - delete file; - - if (s.ok()) { - const std::string fname = - LargeValueFileName(dbname_, large_ref); - s = env_->RenameFile(tmp, fname); - } else { - Log(env_, options_.info_log, "Write large value: %s", - s.ToString().c_str()); - } - pending_outputs_.erase(tmp_number); - - if (!s.ok()) { - env_->DeleteFile(tmp); // Cleanup; intentionally ignoring error - return s; // Caller will delete *final - } - } - - // Put an indirect reference in the write batch in place - // of large value - WriteBatchInternal::PutLargeValueRef(*final, it.key(), large_ref); - } - break; - case kTypeLargeValueRef: - return Status::Corruption("Corrupted write batch"); - break; - case kTypeDeletion: - (*final)->Delete(it.key()); - break; - } - seq = seq + 1; - } - } - return Status::OK(); -} - bool DBImpl::GetProperty(const Slice& property, std::string* value) { value->clear(); @@ -1205,7 +1048,8 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { return false; } else { char buf[100]; - snprintf(buf, sizeof(buf), "%d", versions_->NumLevelFiles(level)); + snprintf(buf, sizeof(buf), "%d", + versions_->NumLevelFiles(static_cast(level))); *value = buf; return true; } @@ -1325,10 +1169,9 @@ Status DestroyDB(const std::string& dbname, const Options& options) { Status result = env->LockFile(LockFileName(dbname), &lock); if (result.ok()) { uint64_t number; - LargeValueRef large_ref; FileType type; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &large_ref, &type)) { + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { Status del = env->DeleteFile(dbname + "/" + filenames[i]); if (result.ok() && !del.ok()) { result = del; diff --git a/db/db_impl.h b/db/db_impl.h index 1f685f0..7699d8c 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -92,29 +92,6 @@ class DBImpl : public DB { Status WriteLevel0Table(MemTable* mem, VersionEdit* edit); Status MakeRoomForWrite(bool force /* compact even if there is room? */); - bool HasLargeValues(const WriteBatch& batch) const; - - // Process data in "*updates" and return a status. "assigned_seq" - // is the sequence number assigned to the first mod in "*updates". - // If no large values are encountered, "*final" is set to "updates". - // If large values were encountered, registers the references of the - // large values with the VersionSet, writes the large values to - // files (if appropriate), and allocates a new WriteBatch with the - // large values replaced with indirect references and stores a - // pointer to the new WriteBatch in *final. If *final != updates on - // return, then the client should delete *final when no longer - // needed. Returns OK on success, and an appropriate error - // otherwise. 
- Status HandleLargeValues(SequenceNumber assigned_seq, - WriteBatch* updates, - WriteBatch** final); - - // Helper routine for HandleLargeValues - void MaybeCompressLargeValue( - const Slice& raw_value, - Slice* file_bytes, - std::string* scratch, - LargeValueRef* ref); struct CompactionState; diff --git a/db/db_iter.cc b/db/db_iter.cc index 31c2a38..0be18ff 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -53,13 +53,11 @@ class DBIter: public Iterator { user_comparator_(cmp), iter_(iter), sequence_(s), - large_(NULL), direction_(kForward), valid_(false) { } virtual ~DBIter() { delete iter_; - delete large_; } virtual bool Valid() const { return valid_; } virtual Slice key() const { @@ -68,20 +66,10 @@ class DBIter: public Iterator { } virtual Slice value() const { assert(valid_); - Slice raw_value = (direction_ == kForward) ? iter_->value() : saved_value_; - if (large_ == NULL) { - return raw_value; - } else { - MutexLock l(&large_->mutex); - if (!large_->produced) { - ReadIndirectValue(raw_value); - } - return large_->value; - } + return (direction_ == kForward) ? iter_->value() : saved_value_; } virtual Status status() const { if (status_.ok()) { - if (large_ != NULL && !large_->status.ok()) return large_->status; return iter_->status(); } else { return status_; @@ -95,29 +83,14 @@ class DBIter: public Iterator { virtual void SeekToLast(); private: - struct Large { - port::Mutex mutex; - std::string value; - bool produced; - Status status; - }; - void FindNextUserEntry(bool skipping, std::string* skip); void FindPrevUserEntry(); bool ParseKey(ParsedInternalKey* key); - void ReadIndirectValue(Slice ref) const; inline void SaveKey(const Slice& k, std::string* dst) { dst->assign(k.data(), k.size()); } - inline void ForgetLargeValue() { - if (large_ != NULL) { - delete large_; - large_ = NULL; - } - } - inline void ClearSavedValue() { if (saved_value_.capacity() > 1048576) { std::string empty; @@ -136,7 +109,6 @@ class DBIter: public Iterator { Status status_; std::string saved_key_; // == current key when direction_==kReverse std::string saved_value_; // == current raw value when direction_==kReverse - Large* large_; // Non-NULL if value is an indirect reference Direction direction_; bool valid_; @@ -156,7 +128,6 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { void DBIter::Next() { assert(valid_); - ForgetLargeValue(); if (direction_ == kReverse) { // Switch directions? direction_ = kForward; @@ -185,7 +156,6 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { // Loop until we hit an acceptable entry to yield assert(iter_->Valid()); assert(direction_ == kForward); - assert(large_ == NULL); do { ParsedInternalKey ikey; if (ParseKey(&ikey) && ikey.sequence <= sequence_) { @@ -197,17 +167,12 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { skipping = true; break; case kTypeValue: - case kTypeLargeValueRef: if (skipping && user_comparator_->Compare(ikey.user_key, *skip) <= 0) { // Entry hidden } else { valid_ = true; saved_key_.clear(); - if (ikey.type == kTypeLargeValueRef) { - large_ = new Large; - large_->produced = false; - } return; } break; @@ -221,7 +186,6 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { void DBIter::Prev() { assert(valid_); - ForgetLargeValue(); if (direction_ == kForward) { // Switch directions? // iter_ is pointing at the current entry. 
Scan backwards until @@ -249,7 +213,6 @@ void DBIter::Prev() { void DBIter::FindPrevUserEntry() { assert(direction_ == kReverse); - assert(large_ == NULL); ValueType value_type = kTypeDeletion; if (iter_->Valid()) { @@ -286,16 +249,11 @@ void DBIter::FindPrevUserEntry() { direction_ = kForward; } else { valid_ = true; - if (value_type == kTypeLargeValueRef) { - large_ = new Large; - large_->produced = false; - } } } void DBIter::Seek(const Slice& target) { direction_ = kForward; - ForgetLargeValue(); ClearSavedValue(); saved_key_.clear(); AppendInternalKey( @@ -310,7 +268,6 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { direction_ = kForward; - ForgetLargeValue(); ClearSavedValue(); iter_->SeekToFirst(); if (iter_->Valid()) { @@ -322,67 +279,11 @@ void DBIter::SeekToFirst() { void DBIter::SeekToLast() { direction_ = kReverse; - ForgetLargeValue(); ClearSavedValue(); iter_->SeekToLast(); FindPrevUserEntry(); } -void DBIter::ReadIndirectValue(Slice ref) const { - assert(!large_->produced); - large_->produced = true; - LargeValueRef large_ref; - if (ref.size() != LargeValueRef::ByteSize()) { - large_->status = Status::Corruption("malformed large value reference"); - return; - } - memcpy(large_ref.data, ref.data(), LargeValueRef::ByteSize()); - std::string fname = LargeValueFileName(*dbname_, large_ref); - RandomAccessFile* file; - Status s = env_->NewRandomAccessFile(fname, &file); - uint64_t file_size = 0; - if (s.ok()) { - s = env_->GetFileSize(fname, &file_size); - } - if (s.ok()) { - uint64_t value_size = large_ref.ValueSize(); - large_->value.resize(value_size); - Slice result; - s = file->Read(0, file_size, &result, - const_cast(large_->value.data())); - if (s.ok()) { - if (result.size() == file_size) { - switch (large_ref.compression_type()) { - case kNoCompression: { - if (result.data() != large_->value.data()) { - large_->value.assign(result.data(), result.size()); - } - break; - } - case kSnappyCompression: { - std::string uncompressed; - if (port::Snappy_Uncompress(result.data(), result.size(), - &uncompressed) && - uncompressed.size() == large_ref.ValueSize()) { - swap(uncompressed, large_->value); - } else { - s = Status::Corruption( - "Unable to read entire compressed large value file"); - } - } - } - } else { - s = Status::Corruption("Unable to read entire large value file"); - } - } - delete file; // Ignore errors on closing - } - if (!s.ok()) { - large_->value.clear(); - large_->status = s; - } -} - } // anonymous namespace Iterator* NewDBIterator( diff --git a/db/db_test.cc b/db/db_test.cc index 04de331..f828e3d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -119,9 +119,6 @@ class DBTest { case kTypeValue: result += iter->value().ToString(); break; - case kTypeLargeValueRef: - result += "LARGEVALUE(" + EscapeString(iter->value()) + ")"; - break; case kTypeDeletion: result += "DEL"; break; @@ -153,26 +150,6 @@ class DBTest { return size; } - std::set LargeValueFiles() const { - // Return the set of large value files that exist in the database - std::vector filenames; - env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose - uint64_t number; - LargeValueRef large_ref; - FileType type; - std::set live; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &large_ref, &type) && - type == kLargeValueFile) { - fprintf(stderr, " live: %s\n", - LargeValueRefToFilenameString(large_ref).c_str()); - live.insert(large_ref); - } - } - fprintf(stderr, "Found %d live large value files\n", (int)live.size()); - 
return live; - } - void Compact(const Slice& start, const Slice& limit) { dbfull()->TEST_CompactMemTable(); int max_level_with_files = 1; @@ -471,7 +448,6 @@ TEST(DBTest, MinorCompactionsHappen) { TEST(DBTest, RecoverWithLargeLog) { { Options options; - options.large_value_threshold = 1048576; Reopen(&options); ASSERT_OK(Put("big1", std::string(200000, '1'))); ASSERT_OK(Put("big2", std::string(200000, '2'))); @@ -484,7 +460,6 @@ TEST(DBTest, RecoverWithLargeLog) { // we flush table files in the middle of a large log file. Options options; options.write_buffer_size = 100000; - options.large_value_threshold = 1048576; Reopen(&options); ASSERT_EQ(NumTableFilesAtLevel(0), 3); ASSERT_EQ(std::string(200000, '1'), Get("big1")); @@ -497,7 +472,6 @@ TEST(DBTest, RecoverWithLargeLog) { TEST(DBTest, CompactionsGenerateMultipleFiles) { Options options; options.write_buffer_size = 100000000; // Large write buffer - options.large_value_threshold = 1048576; Reopen(&options); Random rnd(301); @@ -570,65 +544,53 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) { } TEST(DBTest, ApproximateSizes) { - for (int test = 0; test < 2; test++) { - // test==0: default large_value_threshold - // test==1: 1 MB large_value_threshold - Options options; - options.large_value_threshold = (test == 0) ? 65536 : 1048576; - options.write_buffer_size = 100000000; // Large write buffer - options.compression = kNoCompression; - DestroyAndReopen(); + Options options; + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + DestroyAndReopen(); - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - Reopen(&options); - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + Reopen(&options); + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - const int N = 80; - Random rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); - } - if (test == 1) { - // 0 because GetApproximateSizes() does not account for memtable space for - // non-large values - ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); - } else { - ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000)); - ASSERT_TRUE(Between(Size(Key(20), Key(30)), - 100000*10, 100000*10 + 10000)); - } + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + const int N = 80; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); + } - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); - - for (int compact_start = 0; compact_start < N; compact_start += 10) { - for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); - ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), - 100000 * (i+1), 100000 * (i+1) + 10000)); - ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), - 100000 * 10, 100000 * 10 + 10000)); - } - ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); - ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); + // 0 because GetApproximateSizes() does not account for memtable space + ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); - dbfull()->TEST_CompactRange(0, - Key(compact_start), - Key(compact_start + 9)); + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + for (int compact_start = 0; compact_start < N; 
compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); + ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), + 100000 * (i+1), 100000 * (i+1) + 10000)); + ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), + 100000 * 10, 100000 * 10 + 10000)); } + ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); + ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 0); + dbfull()->TEST_CompactRange(0, + Key(compact_start), + Key(compact_start + 9)); } + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); } } TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { Options options; - options.large_value_threshold = 65536; options.compression = kNoCompression; Reopen(); @@ -801,146 +763,6 @@ TEST(DBTest, ComparatorCheck) { << s.ToString(); } -static bool LargeValuesOK(DBTest* db, - const std::set& expected) { - std::set actual = db->LargeValueFiles(); - if (actual.size() != expected.size()) { - fprintf(stderr, "Sets differ in size: %d vs %d\n", - (int)actual.size(), (int)expected.size()); - return false; - } - for (std::set::const_iterator it = expected.begin(); - it != expected.end(); - ++it) { - if (actual.count(*it) != 1) { - fprintf(stderr, " key '%s' not found in actual set\n", - LargeValueRefToFilenameString(*it).c_str()); - return false; - } - } - return true; -} - -TEST(DBTest, LargeValues1) { - Options options; - options.large_value_threshold = 10000; - Reopen(&options); - - Random rnd(301); - - std::string big1; - test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible - std::set expected; - - ASSERT_OK(Put("big1", big1)); - expected.insert(LargeValueRef::Make(big1, kNoCompression)); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(Delete("big1")); - ASSERT_TRUE(LargeValuesOK(this, expected)); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - // No handling of deletion markers on memtable compactions, so big1 remains - ASSERT_TRUE(LargeValuesOK(this, expected)); - - dbfull()->TEST_CompactRange(0, "", "z"); - expected.erase(LargeValueRef::Make(big1, kNoCompression)); - ASSERT_TRUE(LargeValuesOK(this, expected)); -} - -static bool SnappyCompressionSupported() { - std::string out; - Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Snappy_Compress(in.data(), in.size(), &out); -} - -TEST(DBTest, LargeValues2) { - Options options; - options.large_value_threshold = 10000; - Reopen(&options); - - Random rnd(301); - - std::string big1, big2; - test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible - test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible - std::set expected; - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(Put("big1", big1)); - expected.insert(LargeValueRef::Make(big1, kNoCompression)); - ASSERT_EQ(big1, Get("big1")); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(Put("big2", big2)); - ASSERT_EQ(big2, Get("big2")); - if (SnappyCompressionSupported()) { - expected.insert(LargeValueRef::Make(big2, kSnappyCompression)); - } else { - expected.insert(LargeValueRef::Make(big2, kNoCompression)); - } - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - dbfull()->TEST_CompactRange(0, "", "z"); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(Put("big2", big2)); - ASSERT_OK(Put("big2_b", big2)); - ASSERT_EQ(big1, 
Get("big1")); - ASSERT_EQ(big2, Get("big2")); - ASSERT_EQ(big2, Get("big2_b")); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(Delete("big1")); - ASSERT_EQ("NOT_FOUND", Get("big1")); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_TRUE(LargeValuesOK(this, expected)); - dbfull()->TEST_CompactRange(0, "", "z"); - expected.erase(LargeValueRef::Make(big1, kNoCompression)); - ASSERT_TRUE(LargeValuesOK(this, expected)); - dbfull()->TEST_CompactRange(1, "", "z"); - - ASSERT_OK(Delete("big2")); - ASSERT_EQ("NOT_FOUND", Get("big2")); - ASSERT_EQ(big2, Get("big2_b")); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_TRUE(LargeValuesOK(this, expected)); - dbfull()->TEST_CompactRange(0, "", "z"); - ASSERT_TRUE(LargeValuesOK(this, expected)); - - // Make sure the large value refs survive a reload and compactions after - // the reload. - Reopen(); - ASSERT_TRUE(LargeValuesOK(this, expected)); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - dbfull()->TEST_CompactRange(0, "", "z"); - ASSERT_TRUE(LargeValuesOK(this, expected)); -} - -TEST(DBTest, LargeValues3) { - // Make sure we don't compress values if - Options options; - options.large_value_threshold = 10000; - options.compression = kNoCompression; - Reopen(&options); - - Random rnd(301); - - std::string big1 = std::string(100000, 'x'); // Very compressible - std::set expected; - - ASSERT_OK(Put("big1", big1)); - ASSERT_EQ(big1, Get("big1")); - expected.insert(LargeValueRef::Make(big1, kNoCompression)); - ASSERT_TRUE(LargeValuesOK(this, expected)); -} - - TEST(DBTest, DBOpen_Options) { std::string dbname = test::TmpDir() + "/db_options_test"; DestroyDB(dbname, Options()); @@ -1025,9 +847,6 @@ class ModelDB: public DB { case kTypeValue: map_[it.key().ToString()] = it.value().ToString(); break; - case kTypeLargeValueRef: - assert(false); // Should not occur - break; case kTypeDeletion: map_.erase(it.key().ToString()); break; diff --git a/db/dbformat.cc b/db/dbformat.cc index 2664eb4..c12c138 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -84,69 +84,4 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const { } } -LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) { - LargeValueRef result; - port::SHA1_Hash(value.data(), value.size(), &result.data[0]); - EncodeFixed64(&result.data[20], value.size()); - result.data[28] = static_cast(ctype); - return result; -} - -std::string LargeValueRefToFilenameString(const LargeValueRef& h) { - assert(sizeof(h.data) == LargeValueRef::ByteSize()); - assert(sizeof(h.data) == 29); // So we can hardcode the array size of buf - static const char tohex[] = "0123456789abcdef"; - char buf[20*2]; - for (int i = 0; i < 20; i++) { - buf[2*i] = tohex[(h.data[i] >> 4) & 0xf]; - buf[2*i+1] = tohex[h.data[i] & 0xf]; - } - std::string result = std::string(buf, sizeof(buf)); - result += "-"; - result += NumberToString(h.ValueSize()); - result += "-"; - result += NumberToString(static_cast(h.compression_type())); - return result; -} - -static uint32_t hexvalue(char c) { - if (c >= '0' && c <= '9') { - return c - '0'; - } else if (c >= 'A' && c <= 'F') { - return 10 + c - 'A'; - } else { - assert(c >= 'a' && c <= 'f'); - return 10 + c - 'a'; - } -} - -bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) { - Slice in = s; - if (in.size() < 40) { - return false; - } - for (int i = 0; i < 20; i++) { - if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) { - return false; - } 
- unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]); - h->data[i] = c; - } - in.remove_prefix(40); - uint64_t value_size, ctype; - - if (ConsumeChar(&in, '-') && - ConsumeDecimalNumber(&in, &value_size) && - ConsumeChar(&in, '-') && - ConsumeDecimalNumber(&in, &ctype) && - in.empty() && - (ctype <= kSnappyCompression)) { - EncodeFixed64(&h->data[20], value_size); - h->data[28] = static_cast(ctype); - return true; - } else { - return false; - } -} - } diff --git a/db/dbformat.h b/db/dbformat.h index 5f117f9..d583665 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -29,7 +29,6 @@ class InternalKey; enum ValueType { kTypeDeletion = 0x0, kTypeValue = 0x1, - kTypeLargeValueRef = 0x2, }; // kValueTypeForSeek defines the ValueType that should be passed when // constructing a ParsedInternalKey object for seeking to a particular @@ -37,7 +36,7 @@ enum ValueType { // and the value type is embedded as the low 8 bits in the sequence // number in internal keys, we need to use the highest-numbered // ValueType, not the lowest). -static const ValueType kValueTypeForSeek = kTypeLargeValueRef; +static const ValueType kValueTypeForSeek = kTypeValue; typedef uint64_t SequenceNumber; @@ -139,54 +138,6 @@ inline int InternalKeyComparator::Compare( return Compare(a.Encode(), b.Encode()); } -// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte -// uncompressed size, and a 1 byte CompressionType code. An -// encoded form of it is embedded in the filenames of large value -// files stored in the database, and the raw binary form is stored as -// the iter->value() result for values of type kTypeLargeValueRef in -// the table and log files that make up the database. -struct LargeValueRef { - char data[29]; - - // Initialize a large value ref for the given data - static LargeValueRef Make(const Slice& data, - CompressionType compression_type); - - // Initialize a large value ref from a serialized, 29-byte reference value - static LargeValueRef FromRef(const Slice& ref) { - LargeValueRef result; - assert(ref.size() == sizeof(result.data)); - memcpy(result.data, ref.data(), sizeof(result.data)); - return result; - } - - // Return the number of bytes in a LargeValueRef (not the - // number of bytes in the value referenced). - static size_t ByteSize() { return sizeof(LargeValueRef().data); } - - // Return the number of bytes in the value referenced by "*this". - uint64_t ValueSize() const { return DecodeFixed64(&data[20]); } - - CompressionType compression_type() const { - return static_cast(data[28]); - } - - bool operator==(const LargeValueRef& b) const { - return memcmp(data, b.data, sizeof(data)) == 0; - } - bool operator<(const LargeValueRef& b) const { - return memcmp(data, b.data, sizeof(data)) < 0; - } -}; - -// Convert the large value ref to a human-readable string suitable -// for embedding in a large value filename. -extern std::string LargeValueRefToFilenameString(const LargeValueRef& h); - -// Parse the large value filename string in "input" and store it in -// "*h". If successful, returns true. Otherwise returns false. 
-extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref); - inline bool ParseInternalKey(const Slice& internal_key, ParsedInternalKey* result) { const size_t n = internal_key.size(); @@ -196,7 +147,7 @@ inline bool ParseInternalKey(const Slice& internal_key, result->sequence = num >> 8; result->type = static_cast(c); result->user_key = Slice(internal_key.data(), n - 8); - return (c <= static_cast(kTypeLargeValueRef)); + return (c <= static_cast(kTypeValue)); } } diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index 702cbb4..57c5578 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -76,9 +76,6 @@ TEST(FormatTest, InternalKeyShortSeparator) { ASSERT_EQ(IKey("foo", 100, kTypeValue), Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeDeletion))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeLargeValueRef))); // When user keys are misordered ASSERT_EQ(IKey("foo", 100, kTypeValue), @@ -108,18 +105,6 @@ TEST(FormatTest, InternalKeyShortestSuccessor) { ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); } -TEST(FormatTest, SHA1) { - // Check that we are computing the same value as sha1. - // Note that the last two numbers are the length of the input and the - // compression type. - ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0", // SHA1, uncompr - LargeValueRefToFilenameString( - LargeValueRef::Make("hello", kNoCompression))); - ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, lwcompr - LargeValueRefToFilenameString( - LargeValueRef::Make("hello", kSnappyCompression))); -} - } int main(int argc, char** argv) { diff --git a/db/filename.cc b/db/filename.cc index d21918c..b3a917c 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -30,14 +30,6 @@ std::string TableFileName(const std::string& name, uint64_t number) { return MakeFileName(name, number, "sst"); } -std::string LargeValueFileName(const std::string& name, - const LargeValueRef& large_ref) { - std::string result = name + "/"; - result += LargeValueRefToFilenameString(large_ref); - result += ".val"; - return result; -} - std::string DescriptorFileName(const std::string& dbname, uint64_t number) { assert(number > 0); char buf[100]; @@ -75,11 +67,9 @@ std::string OldInfoLogFileName(const std::string& dbname) { // dbname/LOG // dbname/LOG.old // dbname/MANIFEST-[0-9]+ -// dbname/[0-9a-f]{20}-[0-9]+-[0-9]+.val // dbname/[0-9]+.(log|sst) bool ParseFileName(const std::string& fname, uint64_t* number, - LargeValueRef* large_ref, FileType* type) { Slice rest(fname); if (rest == "CURRENT") { @@ -91,15 +81,6 @@ bool ParseFileName(const std::string& fname, } else if (rest == "LOG" || rest == "LOG.old") { *number = 0; *type = kInfoLogFile; - } else if (rest.size() >= 4 && - Slice(rest.data() + rest.size() - 4, 4) == ".val") { - LargeValueRef h; - if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4), - &h)) { - return false; - } - *large_ref = h; - *type = kLargeValueFile; } else if (rest.starts_with("MANIFEST-")) { rest.remove_prefix(strlen("MANIFEST-")); uint64_t num; diff --git a/db/filename.h b/db/filename.h index 81ab2fc..6a99744 100644 --- a/db/filename.h +++ b/db/filename.h @@ -16,13 +16,11 @@ namespace leveldb { class Env; -struct LargeValueRef; enum FileType { kLogFile, kDBLockFile, kTableFile, - kLargeValueFile, kDescriptorFile, kCurrentFile, kTempFile, @@ -39,12 +37,6 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number); // "dbname". 
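
Two encodings matter in the dbformat.h hunk above: the 8-byte internal-key trailer, which packs the sequence number and value type together (which is why kValueTypeForSeek must be the highest-numbered type), and the removed 29-byte large value ref (20-byte SHA1, fixed64 size, one compression byte). A minimal self-contained sketch of both layouts; the names are illustrative stand-ins, not the exact helpers in dbformat:

#include <cassert>
#include <cstdint>
#include <cstring>

enum ValueType { kTypeDeletion = 0x0, kTypeValue = 0x1 };

// Internal keys end in 8 bytes holding (sequence << 8) | type.  The
// comparator orders this trailer descending, so seeking with the
// highest-numbered type lands on the newest entry whose sequence is
// <= the snapshot's sequence.
uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
  assert(seq <= ((1ull << 56) - 1));  // sequence numbers fit in 56 bits
  return (seq << 8) | static_cast<uint64_t>(t);
}

// The removed LargeValueRef packed, in 29 bytes:
//   [0,20)  SHA1 of the value (placeholder argument here)
//   [20,28) little-endian (fixed64) uncompressed value size
//   [28]    CompressionType tag
void EncodeLargeValueRef(char data[29], const char sha1[20],
                         uint64_t value_size, unsigned char ctype) {
  memcpy(data, sha1, 20);
  for (int i = 0; i < 8; i++) {
    data[20 + i] = static_cast<char>((value_size >> (8 * i)) & 0xff);
  }
  data[28] = static_cast<char>(ctype);
}
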
extern std::string TableFileName(const std::string& dbname, uint64_t number); -// Return the name of the large value file with the specified large -// value reference in the db named by "dbname". The result will be -// prefixed with "dbname". -extern std::string LargeValueFileName(const std::string& dbname, - const LargeValueRef& large_ref); - // Return the name of the descriptor file for the db named by // "dbname" and the specified incarnation number. The result will be // prefixed with "dbname". @@ -71,14 +63,10 @@ extern std::string InfoLogFileName(const std::string& dbname); extern std::string OldInfoLogFileName(const std::string& dbname); // If filename is a leveldb file, store the type of the file in *type. -// If *type is kLargeValueFile, then the large value reference data -// from the filename is stored in "*large_ref. For all other types of -// files, the number encoded in the filename is stored in *number. If -// the filename was successfully parsed, returns true. Else return -// false. +// The number encoded in the filename is stored in *number. If the +// filename was successfully parsed, returns true. Else return false. extern bool ParseFileName(const std::string& filename, uint64_t* number, - LargeValueRef* large_ref, FileType* type); // Make the CURRENT file point to the descriptor file with the diff --git a/db/filename_test.cc b/db/filename_test.cc index 4d2a91e..2f61e8d 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -17,42 +17,29 @@ TEST(FileNameTest, Parse) { Slice db; FileType type; uint64_t number; - LargeValueRef large_ref; // Successful parses static struct { const char* fname; uint64_t number; - const char* large_ref; FileType type; } cases[] = { - { "100.log", 100, "", kLogFile }, - { "0.log", 0, "", kLogFile }, - { "0.sst", 0, "", kTableFile }, - { "CURRENT", 0, "", kCurrentFile }, - { "LOCK", 0, "", kDBLockFile }, - { "MANIFEST-2", 2, "", kDescriptorFile }, - { "MANIFEST-7", 7, "", kDescriptorFile }, - { "LOG", 0, "", kInfoLogFile }, - { "LOG.old", 0, "", kInfoLogFile }, - { "18446744073709551615.log", 18446744073709551615ull, "", - kLogFile }, - { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0, - "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile }, - { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0, - "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0", - kLargeValueFile }, + { "100.log", 100, kLogFile }, + { "0.log", 0, kLogFile }, + { "0.sst", 0, kTableFile }, + { "CURRENT", 0, kCurrentFile }, + { "LOCK", 0, kDBLockFile }, + { "MANIFEST-2", 2, kDescriptorFile }, + { "MANIFEST-7", 7, kDescriptorFile }, + { "LOG", 0, kInfoLogFile }, + { "LOG.old", 0, kInfoLogFile }, + { "18446744073709551615.log", 18446744073709551615ull, kLogFile }, }; for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { std::string f = cases[i].fname; - ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f; + ASSERT_TRUE(ParseFileName(f, &number, &type)) << f; ASSERT_EQ(cases[i].type, type) << f; - if (type == kLargeValueFile) { - ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref)) - << f; - } else { - ASSERT_EQ(cases[i].number, number) << f; - } + ASSERT_EQ(cases[i].number, number) << f; } // Errors @@ -78,75 +65,54 @@ TEST(FileNameTest, Parse) { "184467440737095516150.log", "100", "100.", - "100.lop", - "100.val", - ".val", - "123456789012345678901234567890123456789-12340.val", - "1234567890123456789012345678901234567-123-0.val", - "12345678901234567890123456789012345678902-100-1-.val", - // 
Overflow on value size - "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val", - // '03.val' is a bad compression type - "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" }; + "100.lop" + }; for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { std::string f = errors[i]; - ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f; + ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; }; } TEST(FileNameTest, Construction) { uint64_t number; FileType type; - LargeValueRef large_ref; std::string fname; fname = CurrentFileName("foo"); ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(0, number); ASSERT_EQ(kCurrentFile, type); fname = LockFileName("foo"); ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(0, number); ASSERT_EQ(kDBLockFile, type); fname = LogFileName("foo", 192); ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(192, number); ASSERT_EQ(kLogFile, type); fname = TableFileName("bar", 200); ASSERT_EQ("bar/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(200, number); ASSERT_EQ(kTableFile, type); fname = DescriptorFileName("bar", 100); ASSERT_EQ("bar/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(100, number); ASSERT_EQ(kDescriptorFile, type); fname = TempFileName("tmp", 999); ASSERT_EQ("tmp/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(999, number); ASSERT_EQ(kTempFile, type); - - for (int i = 0; i <= kSnappyCompression; i++) { - CompressionType ctype = static_cast(i); - std::string value = "abcdef"; - LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype); - fname = LargeValueFileName("tmp", real_large_ref); - ASSERT_EQ("tmp/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type)); - ASSERT_TRUE(real_large_ref == large_ref); - ASSERT_EQ(kLargeValueFile, type); - ASSERT_EQ(large_ref.compression_type(), ctype); - } } } diff --git a/db/log_writer.cc b/db/log_writer.cc index 18ca37a..1696851 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -46,9 +46,9 @@ Status Writer::AddRecord(const Slice& slice) { } // Invariant: we never leave < kHeaderSize bytes in a block. - const int avail = kBlockSize - block_offset_ - kHeaderSize; - assert(avail >= 0); + assert(kBlockSize - block_offset_ - kHeaderSize >= 0); + const size_t avail = kBlockSize - block_offset_ - kHeaderSize; const size_t fragment_length = (left < avail) ? 
left : avail; RecordType type; diff --git a/db/repair.cc b/db/repair.cc index 014e00e..c8e7b9e 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -6,8 +6,7 @@ // (1) Any log files are first converted to tables // (2) We scan every table to compute // (a) smallest/largest for the table -// (b) large value refs from the table -// (c) largest sequence number in the table +// (b) largest sequence number in the table // (3) We generate descriptor contents: // - log number is set to zero // - next-file-number is set to 1 + largest file number we found @@ -22,9 +21,8 @@ // (c) For each table: if it overlaps earlier table, place in level-0, // else place in level-M. // Possible optimization 2: -// Store per-table metadata (smallest, largest, largest-seq#, -// large-value-refs, ...) in the table's meta section to speed up -// ScanTable. +// Store per-table metadata (smallest, largest, largest-seq#, ...) +// in the table's meta section to speed up ScanTable. #include "db/builder.h" #include "db/db_impl.h" @@ -73,7 +71,7 @@ class Repairer { } if (status.ok()) { unsigned long long bytes = 0; - for (int i = 0; i < tables_.size(); i++) { + for (size_t i = 0; i < tables_.size(); i++) { bytes += tables_[i].meta.file_size; } Log(env_, options_.info_log, @@ -119,13 +117,10 @@ class Repairer { } uint64_t number; - LargeValueRef large_ref; FileType type; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &large_ref, &type)) { - if (type == kLargeValueFile) { - // Will be picked up when we process a Table that points to it - } else if (type == kDescriptorFile) { + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + if (type == kDescriptorFile) { manifests_.push_back(filenames[i]); } else { if (number + 1 > next_file_number_) { @@ -145,7 +140,7 @@ class Repairer { } void ConvertLogFilesToTables() { - for (int i = 0; i < logs_.size(); i++) { + for (size_t i = 0; i < logs_.size(); i++) { std::string logname = LogFileName(dbname_, logs_[i]); Status status = ConvertLogToTable(logs_[i]); if (!status.ok()) { @@ -239,7 +234,7 @@ class Repairer { void ExtractMetaData() { std::vector kept; - for (int i = 0; i < table_numbers_.size(); i++) { + for (size_t i = 0; i < table_numbers_.size(); i++) { TableInfo t; t.meta.number = table_numbers_[i]; Status status = ScanTable(&t); @@ -283,17 +278,6 @@ class Repairer { if (parsed.sequence > t->max_sequence) { t->max_sequence = parsed.sequence; } - - if (ExtractValueType(key) == kTypeLargeValueRef) { - if (iter->value().size() != LargeValueRef::ByteSize()) { - Log(env_, options_.info_log, "Table #%llu: bad large value ref", - (unsigned long long) t->meta.number); - } else { - edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()), - t->meta.number, - key); - } - } } if (!iter->status().ok()) { status = iter->status(); @@ -316,7 +300,7 @@ class Repairer { } SequenceNumber max_sequence = 0; - for (int i = 0; i < tables_.size(); i++) { + for (size_t i = 0; i < tables_.size(); i++) { if (max_sequence < tables_[i].max_sequence) { max_sequence = tables_[i].max_sequence; } @@ -327,7 +311,7 @@ class Repairer { edit_.SetNextFile(next_file_number_); edit_.SetLastSequence(max_sequence); - for (int i = 0; i < tables_.size(); i++) { + for (size_t i = 0; i < tables_.size(); i++) { // TODO(opt): separate out into multiple levels const TableInfo& t = tables_[i]; edit_.AddFile(0, t.meta.number, t.meta.file_size, @@ -351,7 +335,7 @@ class Repairer { env_->DeleteFile(tmp); } else { // Discard older manifests 
- for (int i = 0; i < manifests_.size(); i++) { + for (size_t i = 0; i < manifests_.size(); i++) { ArchiveFile(dbname_ + "/" + manifests_[i]); } diff --git a/db/version_edit.cc b/db/version_edit.cc index 689dbe0..3941271 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -19,7 +19,7 @@ enum Tag { kCompactPointer = 5, kDeletedFile = 6, kNewFile = 7, - kLargeValueRef = 8, + // 8 was used for large value refs kPrevLogNumber = 9, }; @@ -36,7 +36,6 @@ void VersionEdit::Clear() { has_last_sequence_ = false; deleted_files_.clear(); new_files_.clear(); - large_refs_added_.clear(); } void VersionEdit::EncodeTo(std::string* dst) const { @@ -61,7 +60,7 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint64(dst, last_sequence_); } - for (int i = 0; i < compact_pointers_.size(); i++) { + for (size_t i = 0; i < compact_pointers_.size(); i++) { PutVarint32(dst, kCompactPointer); PutVarint32(dst, compact_pointers_[i].first); // level PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); @@ -75,7 +74,7 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint64(dst, iter->second); // file number } - for (int i = 0; i < new_files_.size(); i++) { + for (size_t i = 0; i < new_files_.size(); i++) { const FileMetaData& f = new_files_[i].second; PutVarint32(dst, kNewFile); PutVarint32(dst, new_files_[i].first); // level @@ -84,15 +83,6 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); } - - for (int i = 0; i < large_refs_added_.size(); i++) { - const VersionEdit::Large& l = large_refs_added_[i]; - PutVarint32(dst, kLargeValueRef); - PutLengthPrefixedSlice(dst, - Slice(l.large_ref.data, LargeValueRef::ByteSize())); - PutVarint64(dst, l.fnum); - PutLengthPrefixedSlice(dst, l.internal_key.Encode()); - } } static bool GetInternalKey(Slice* input, InternalKey* dst) { @@ -127,7 +117,6 @@ Status VersionEdit::DecodeFrom(const Slice& src) { uint64_t number; FileMetaData f; Slice str; - Large large; InternalKey key; while (msg == NULL && GetVarint32(&input, &tag)) { @@ -203,18 +192,6 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } break; - case kLargeValueRef: - if (GetLengthPrefixedSlice(&input, &str) && - (str.size() == LargeValueRef::ByteSize()) && - GetVarint64(&input, &large.fnum) && - GetInternalKey(&input, &large.internal_key)) { - large.large_ref = LargeValueRef::FromRef(str); - large_refs_added_.push_back(large); - } else { - msg = "large ref"; - } - break; - default: msg = "unknown tag"; break; @@ -255,7 +232,7 @@ std::string VersionEdit::DebugString() const { r.append("\n LastSeq: "); AppendNumberTo(&r, last_sequence_); } - for (int i = 0; i < compact_pointers_.size(); i++) { + for (size_t i = 0; i < compact_pointers_.size(); i++) { r.append("\n CompactPointer: "); AppendNumberTo(&r, compact_pointers_[i].first); r.append(" '"); @@ -270,7 +247,7 @@ std::string VersionEdit::DebugString() const { r.append(" "); AppendNumberTo(&r, iter->second); } - for (int i = 0; i < new_files_.size(); i++) { + for (size_t i = 0; i < new_files_.size(); i++) { const FileMetaData& f = new_files_[i].second; r.append("\n AddFile: "); AppendNumberTo(&r, new_files_[i].first); @@ -284,16 +261,6 @@ std::string VersionEdit::DebugString() const { AppendEscapedStringTo(&r, f.largest.Encode()); r.append("'"); } - for (int i = 0; i < large_refs_added_.size(); i++) { - const VersionEdit::Large& l = large_refs_added_[i]; - r.append("\n LargeRef: "); - AppendNumberTo(&r, l.fnum); - r.append(" 
"); - r.append(LargeValueRefToFilenameString(l.large_ref)); - r.append(" '"); - AppendEscapedStringTo(&r, l.internal_key.Encode()); - r.append("'"); - } r.append("\n}\n"); return r; } diff --git a/db/version_edit.h b/db/version_edit.h index 7e417b5..ab874da 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -75,18 +75,6 @@ class VersionEdit { deleted_files_.insert(std::make_pair(level, file)); } - // Record that a large value with the specified large_ref was - // written to the output file numbered "fnum" - void AddLargeValueRef(const LargeValueRef& large_ref, - uint64_t fnum, - const Slice& internal_key) { - large_refs_added_.resize(large_refs_added_.size() + 1); - Large* large = &(large_refs_added_.back()); - large->large_ref = large_ref; - large->fnum = fnum; - large->internal_key.DecodeFrom(internal_key); - } - void EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); @@ -111,12 +99,6 @@ class VersionEdit { std::vector< std::pair > compact_pointers_; DeletedFileSet deleted_files_; std::vector< std::pair > new_files_; - struct Large { - LargeValueRef large_ref; - uint64_t fnum; - InternalKey internal_key; - }; - std::vector large_refs_added_; }; } diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 6906ec3..67959f7 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -26,13 +26,9 @@ TEST(VersionEditTest, EncodeDecode) { for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, - InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef), + InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); edit.DeleteFile(4, kBig + 700 + i); - edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression), - kBig + 800 + i, "foobar"); - edit.AddLargeValueRef(LargeValueRef::Make("big2", kSnappyCompression), - kBig + 801 + i, "baz"); edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); } diff --git a/db/version_set.cc b/db/version_set.cc index 31f79bb..c439f49 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -58,7 +58,7 @@ std::string IntSetToString(const std::set& s) { Version::~Version() { assert(refs_ == 0); for (int level = 0; level < config::kNumLevels; level++) { - for (int i = 0; i < files_[level].size(); i++) { + for (size_t i = 0; i < files_[level].size(); i++) { FileMetaData* f = files_[level][i]; assert(f->refs >= 0); f->refs--; @@ -134,7 +134,7 @@ class Version::LevelFileNumIterator : public Iterator { private: const InternalKeyComparator icmp_; const std::vector* const flist_; - int index_; + uint32_t index_; // Backing store for value(). Holds the file number and size. 
mutable char value_buf_[16]; @@ -164,7 +164,7 @@ Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, void Version::AddIterators(const ReadOptions& options, std::vector* iters) { // Merge all level zero files together since they may overlap - for (int i = 0; i < files_[0].size(); i++) { + for (size_t i = 0; i < files_[0].size(); i++) { iters->push_back( vset_->table_cache_->NewIterator( options, files_[0][i]->number, files_[0][i]->file_size)); @@ -201,7 +201,7 @@ std::string Version::DebugString() const { AppendNumberTo(&r, level); r.push_back(':'); const std::vector& files = files_[level]; - for (int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { r.push_back(' '); AppendNumberTo(&r, files[i]->number); r.push_back(':'); @@ -232,7 +232,7 @@ class VersionSet::Builder { : vset_(vset) { for (int level = 0; level < config::kNumLevels; level++) { const std::vector& files = base->files_[level]; - for (int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { FileMetaData* f = files[i]; f->refs++; files_[level].insert(std::make_pair(f->number, f)); @@ -258,7 +258,7 @@ class VersionSet::Builder { // Apply all of the edits in *edit to the current state. void Apply(VersionEdit* edit) { // Update compaction pointers - for (int i = 0; i < edit->compact_pointers_.size(); i++) { + for (size_t i = 0; i < edit->compact_pointers_.size(); i++) { const int level = edit->compact_pointers_[i].first; vset_->compact_pointer_[level] = edit->compact_pointers_[i].second.Encode().ToString(); @@ -284,19 +284,13 @@ class VersionSet::Builder { } // Add new files - for (int i = 0; i < edit->new_files_.size(); i++) { + for (size_t i = 0; i < edit->new_files_.size(); i++) { const int level = edit->new_files_[i].first; FileMetaData* f = new FileMetaData(edit->new_files_[i].second); f->refs = 1; assert(files_[level].count(f->number) == 0); files_[level].insert(std::make_pair(f->number, f)); } - - // Add large value refs - for (int i = 0; i < edit->large_refs_added_.size(); i++) { - const VersionEdit::Large& l = edit->large_refs_added_[i]; - vset_->RegisterLargeValueRef(l.large_ref, l.fnum, l.internal_key); - } } // Save the current state in *v. 
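
LevelFileNumIterator's 16-byte value_buf_ above, like the size field in the old LargeValueRef, leans on the fixed-width little-endian helpers from util/coding. A sketch of the assumed encoding, packing a file number and a file size side by side:

#include <cstdint>

// Little-endian fixed-width 64-bit encode/decode, as in util/coding.
void EncodeFixed64(char* buf, uint64_t value) {
  for (int i = 0; i < 8; i++) {
    buf[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}

uint64_t DecodeFixed64(const char* buf) {
  uint64_t result = 0;
  for (int i = 7; i >= 0; i--) {
    result = (result << 8) | static_cast<unsigned char>(buf[i]);
  }
  return result;
}

// The iterator's value: |file number (fixed64)|file size (fixed64)|
void EncodeFileEntry(char value_buf[16], uint64_t number, uint64_t size) {
  EncodeFixed64(value_buf, number);
  EncodeFixed64(value_buf + 8, size);
}
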
@@ -545,7 +539,7 @@ Status VersionSet::Recover() { static int64_t TotalFileSize(const std::vector& files) { int64_t sum = 0; - for (int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { sum += files[i]->file_size; } return sum; @@ -610,25 +604,12 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // Save files for (int level = 0; level < config::kNumLevels; level++) { const std::vector& files = current_->files_[level]; - for (int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { const FileMetaData* f = files[i]; edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest); } } - // Save large value refs - for (LargeValueMap::const_iterator it = large_value_refs_.begin(); - it != large_value_refs_.end(); - ++it) { - const LargeValueRef& ref = it->first; - const LargeReferencesSet& pointers = it->second; - for (LargeReferencesSet::const_iterator j = pointers.begin(); - j != pointers.end(); - ++j) { - edit.AddLargeValueRef(ref, j->first, j->second); - } - } - std::string record; edit.EncodeTo(&record); return log->AddRecord(record); @@ -651,7 +632,7 @@ Status VersionSet::SortLevel(Version* v, uint64_t level) { if (result.ok() && level > 0) { // There should be no overlap - for (int i = 1; i < v->files_[level].size(); i++) { + for (size_t i = 1; i < v->files_[level].size(); i++) { const InternalKey& prev_end = v->files_[level][i-1]->largest; const InternalKey& this_begin = v->files_[level][i]->smallest; if (icmp_.Compare(prev_end, this_begin) >= 0) { @@ -676,7 +657,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; for (int level = 0; level < config::kNumLevels; level++) { const std::vector& files = v->files_[level]; - for (int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { if (icmp_.Compare(files[i]->largest, ikey) <= 0) { // Entire file is before "ikey", so just add the file size result += files[i]->file_size; @@ -701,83 +682,9 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { } } } - - // Add in large value files which are references from internal keys - // stored in the table files - // - // TODO(opt): this is O(# large values in db). 
If this becomes too slow, - // we could store an auxiliary data structure indexed by internal key - for (LargeValueMap::const_iterator it = large_value_refs_.begin(); - it != large_value_refs_.end(); - ++it) { - const LargeValueRef& lref = it->first; - for (LargeReferencesSet::const_iterator it2 = it->second.begin(); - it2 != it->second.end(); - ++it2) { - if (icmp_.Compare(it2->second, ikey.Encode()) <= 0) { - // Internal key for large value is before our key of interest - result += lref.ValueSize(); - } - } - } - - return result; } -bool VersionSet::RegisterLargeValueRef(const LargeValueRef& large_ref, - uint64_t fnum, - const InternalKey& internal_key) { - LargeReferencesSet* refs = &large_value_refs_[large_ref]; - bool is_first = refs->empty(); - refs->insert(make_pair(fnum, internal_key.Encode().ToString())); - return is_first; -} - -void VersionSet::CleanupLargeValueRefs(const std::set& live_tables) { - for (LargeValueMap::iterator it = large_value_refs_.begin(); - it != large_value_refs_.end(); - ) { - LargeReferencesSet* refs = &it->second; - for (LargeReferencesSet::iterator ref_it = refs->begin(); - ref_it != refs->end(); - ) { - if (ref_it->first != log_number_ && // Not in log file - ref_it->first != prev_log_number_ && // Not in prev log - live_tables.count(ref_it->first) == 0) { // Not in a live table - // No longer live: erase - LargeReferencesSet::iterator to_erase = ref_it; - ++ref_it; - refs->erase(to_erase); - } else { - // Still live: leave this reference alone - ++ref_it; - } - } - if (refs->empty()) { - // No longer any live references to this large value: remove from - // large_value_refs - Log(env_, options_->info_log, "large value is dead: '%s'", - LargeValueRefToFilenameString(it->first).c_str()); - LargeValueMap::iterator to_erase = it; - ++it; - large_value_refs_.erase(to_erase); - } else { - ++it; - } - } -} - -bool VersionSet::LargeValueIsLive(const LargeValueRef& large_ref) { - LargeValueMap::iterator it = large_value_refs_.find(large_ref); - if (it == large_value_refs_.end()) { - return false; - } else { - assert(!it->second.empty()); - return true; - } -} - void VersionSet::MaybeDeleteOldVersions() { // Note: it is important to delete versions in order since a newer // version with zero refs may be holding a pointer to a memtable @@ -793,7 +700,7 @@ void VersionSet::AddLiveFiles(std::set* live) { for (Version* v = oldest_; v != NULL; v = v->next_) { for (int level = 0; level < config::kNumLevels; level++) { const std::vector& files = v->files_[level]; - for (int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { live->insert(files[i]->number); } } @@ -810,7 +717,7 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() { int64_t result = 0; std::vector overlaps; for (int level = 0; level < config::kNumLevels - 1; level++) { - for (int i = 0; i < current_->files_[level].size(); i++) { + for (size_t i = 0; i < current_->files_[level].size(); i++) { const FileMetaData* f = current_->files_[level][i]; GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); const int64_t sum = TotalFileSize(overlaps); @@ -832,7 +739,7 @@ void VersionSet::GetOverlappingInputs( Slice user_begin = begin.user_key(); Slice user_end = end.user_key(); const Comparator* user_cmp = icmp_.user_comparator(); - for (int i = 0; i < current_->files_[level].size(); i++) { + for (size_t i = 0; i < current_->files_[level].size(); i++) { FileMetaData* f = current_->files_[level][i]; if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 || 
user_cmp->Compare(f->smallest.user_key(), user_end) > 0) { @@ -852,7 +759,7 @@ void VersionSet::GetRange(const std::vector& inputs, assert(!inputs.empty()); smallest->Clear(); largest->Clear(); - for (int i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { FileMetaData* f = inputs[i]; if (i == 0) { *smallest = f->smallest; @@ -895,7 +802,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { if (!c->inputs_[which].empty()) { if (c->level() + which == 0) { const std::vector& files = c->inputs_[which]; - for (int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { list[num++] = table_cache_->NewIterator( options, files[i]->number, files[i]->file_size); } @@ -927,7 +834,7 @@ Compaction* VersionSet::PickCompaction() { c->input_version_->Ref(); // Pick the first file that comes after compact_pointer_[level] - for (int i = 0; i < current_->files_[level].size(); i++) { + for (size_t i = 0; i < current_->files_[level].size(); i++) { FileMetaData* f = current_->files_[level][i]; if (compact_pointer_[level].empty() || icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { @@ -1062,7 +969,7 @@ bool Compaction::IsTrivialMove() const { void Compaction::AddInputDeletions(VersionEdit* edit) { for (int which = 0; which < 2; which++) { - for (int i = 0; i < inputs_[which].size(); i++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { edit->DeleteFile(level_ + which, inputs_[which][i]->number); } } diff --git a/db/version_set.h b/db/version_set.h index e1c5a4b..e377513 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -171,22 +171,6 @@ class VersionSet { // "key" as of version "v". uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); - // Register a reference to a large value with the specified - // large_ref from the specified file number. Returns "true" if this - // is the first recorded reference to the "large_ref" value in the - // database, and false otherwise. - bool RegisterLargeValueRef(const LargeValueRef& large_ref, - uint64_t filenum, - const InternalKey& internal_key); - - // Cleanup the large value reference state by eliminating any - // references from files that are not includes in either "live_tables" - // or the current log. - void CleanupLargeValueRefs(const std::set& live_tables); - - // Returns true if a large value with the given reference is live. - bool LargeValueIsLive(const LargeValueRef& large_ref); - private: class Builder; @@ -237,14 +221,6 @@ class VersionSet { Version* current_; // Pointer to the last (newest) list entry Version* oldest_; // Pointer to the first (oldest) list entry - // Map from large value reference to the set of - // values containing references to the value. We keep the - // internal key as a std::string rather than as an InternalKey because - // we want to be able to easily use a set. - typedef std::set > LargeReferencesSet; - typedef std::map LargeValueMap; - LargeValueMap large_value_refs_; - // Per-level key at which the next compaction at that level should start. // Either an empty string, or a valid InternalKey. 
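
GetRange above folds the per-file smallest/largest keys into one range covering all compaction inputs. A sketch of the same fold, with std::string ordering standing in for the internal key comparator used by the real code:

#include <cassert>
#include <string>
#include <vector>

struct FileRange { std::string smallest, largest; };

// Compute the key range spanned by a set of files, as in
// VersionSet::GetRange.
void GetRangeSketch(const std::vector<FileRange>& inputs,
                    std::string* smallest, std::string* largest) {
  assert(!inputs.empty());
  *smallest = inputs[0].smallest;
  *largest = inputs[0].largest;
  for (size_t i = 1; i < inputs.size(); i++) {
    if (inputs[i].smallest < *smallest) *smallest = inputs[i].smallest;
    if (inputs[i].largest > *largest) *largest = inputs[i].largest;
  }
}
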
std::string compact_pointer_[config::kNumLevels]; @@ -313,7 +289,7 @@ class Compaction { // State used to check for number of overlapping grandparent files // (parent == level_ + 1, grandparent == level_ + 2) std::vector<FileMetaData*> grandparents_; - int grandparent_index_; // Index in grandparent_starts_ + size_t grandparent_index_; // Index in grandparent_starts_ bool seen_key_; // Some output key has been seen int64_t overlapped_bytes_; // Bytes of overlap between current output // and grandparent files @@ -324,7 +300,7 @@ class Compaction { // is that we are positioned at one of the file ranges for each // higher level than the ones involved in this compaction (i.e. for // all L >= level_ + 2). - int level_ptrs_[config::kNumLevels]; + size_t level_ptrs_[config::kNumLevels]; }; } diff --git a/db/write_batch.cc b/db/write_batch.cc index e84e548..d561528 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -8,7 +8,6 @@ // data: record[count] // record := // kTypeValue varstring varstring | -// kTypeLargeValueRef varstring varstring | // kTypeDeletion varstring // varstring := // len: varint32 @@ -58,16 +57,6 @@ void WriteBatch::Put(const Slice& key, const Slice& value) { PutLengthPrefixedSlice(&rep_, value); } -void WriteBatchInternal::PutLargeValueRef(WriteBatch* b, - const Slice& key, - const LargeValueRef& large_ref) { - WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); - b->rep_.push_back(static_cast<char>(kTypeLargeValueRef)); - PutLengthPrefixedSlice(&b->rep_, key); - PutLengthPrefixedSlice(&b->rep_, - Slice(large_ref.data, sizeof(large_ref.data))); -} - void WriteBatch::Delete(const Slice& key) { WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); rep_.push_back(static_cast<char>(kTypeDeletion)); @@ -87,10 +76,6 @@ Status WriteBatchInternal::InsertInto(const WriteBatch* b, case kTypeValue: memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); break; - case kTypeLargeValueRef: - memtable->Add(it.sequence_number(), kTypeLargeValueRef, - it.key(), it.value()); - break; } found++; } @@ -134,7 +119,6 @@ void WriteBatchInternal::Iterator::GetNextEntry() { input_.remove_prefix(1); switch (tag) { case kTypeValue: - case kTypeLargeValueRef: if (GetLengthPrefixedSlice(&input_, &key_) && GetLengthPrefixedSlice(&input_, &value_)) { op_ = static_cast<ValueType>(tag); diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index ea28e2d..ab0a823 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -13,10 +13,6 @@ namespace leveldb { // WriteBatch that we don't want in the public WriteBatch interface. class WriteBatchInternal { public: - static void PutLargeValueRef(WriteBatch* batch, - const Slice& key, - const LargeValueRef& large_ref); - // Return the number of entries in the batch. 
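
With the large-value arm gone, the batch grammar at the top of write_batch.cc is just tagged, length-prefixed records appended to one flat string (the full rep_ also begins with a fixed64 sequence number and a fixed32 count). A sketch of appending one Put record, assuming keys and values short enough that the varint32 length fits in a single byte:

#include <cassert>
#include <string>

const char kSketchTypeValue = 0x1;

// Append "len: varint32, data: uint8[len]" (single-byte varint assumed).
void PutLengthPrefixedSliceSketch(std::string* rep, const std::string& s) {
  assert(s.size() < 128);  // one-byte varint32 is enough for this sketch
  rep->push_back(static_cast<char>(s.size()));
  rep->append(s);
}

// Append one "kTypeValue varstring varstring" record to the batch rep.
void AppendPutRecord(std::string* rep,
                     const std::string& key, const std::string& value) {
  rep->push_back(kSketchTypeValue);
  PutLengthPrefixedSliceSketch(rep, key);
  PutLengthPrefixedSliceSketch(rep, value);
}
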
static int Count(const WriteBatch* batch); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index deb8411..2bf1134 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -29,13 +29,6 @@ static std::string PrintContents(WriteBatch* b) { state.append(iter->value().ToString()); state.append(")"); break; - case kTypeLargeValueRef: - state.append("PutRef("); - state.append(ikey.user_key.ToString()); - state.append(", "); - state.append(iter->value().ToString()); - state.append(")"); - break; case kTypeDeletion: state.append("Delete("); state.append(ikey.user_key.ToString()); @@ -74,22 +67,6 @@ TEST(WriteBatchTest, Multiple) { PrintContents(&batch)); } -TEST(WriteBatchTest, PutIndirect) { - WriteBatch batch; - batch.Put(Slice("baz"), Slice("boo")); - LargeValueRef h; - for (int i = 0; i < LargeValueRef::ByteSize(); i++) { - h.data[i] = (i < 20) ? 'a' : 'b'; - } - WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), h); - WriteBatchInternal::SetSequence(&batch, 100); - ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); - ASSERT_EQ(2, WriteBatchInternal::Count(&batch)); - ASSERT_EQ("Put(baz, boo)@100" - "PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101", - PrintContents(&batch)); -} - TEST(WriteBatchTest, Corruption) { WriteBatch batch; batch.Put(Slice("foo"), Slice("bar")); diff --git a/doc/impl.html b/doc/impl.html index b190d2c..dd09fea 100644 --- a/doc/impl.html +++ b/doc/impl.html @@ -57,15 +57,6 @@ These merges have the effect of gradually migrating new updates from the young level to the largest level using only bulk reads and writes (i.e., minimizing expensive seeks). -

<h2>Large value files</h2>
-
-Each large value (greater than 64KB by default) is placed in a large
-value file (*.val) of its own.  An entry is maintained in the log
-and/or sorted tables that maps from the corresponding key to the
-name of this large value file.  The name of the large value file
-is derived from a SHA1 hash of the value and its length so that
-identical values share the same file.
-
 <h2>Manifest</h2>

A MANIFEST file lists the set of sorted tables that make up each @@ -220,9 +211,7 @@ So maybe even the sharding is not necessary on modern filesystems? compaction and at the end of recovery. It finds the names of all files in the database. It deletes all log files that are not the current log file. It deletes all table files that are not referenced -from some level and are not the output of an active compaction. It -deletes all large value files that are not referenced from any live -table or log file. +from some level and are not the output of an active compaction. diff --git a/doc/index.html b/doc/index.html index 2a83fc3..c2312b7 100644 --- a/doc/index.html +++ b/doc/index.html @@ -412,17 +412,6 @@ We might want to prefix filename keys with one letter (say '/') and over just the metadata do not force us to fetch and cache bulky file contents.

-<h1>Large Values</h1>
-
-<p>
-leveldb has special treatment of large values (by default, a value
-of length greater than or equal to 64K is considered large, though a
-field in Options can be used to adjust this threshold).  Each such
-large value is placed in a separate operating system file, and the
-normal database blocks just contain pointers to such files.
-</p>
-<p>
-Furthermore, if the same large value occurs multiple times in a single
-database, it will be stored just once.
-</p>
-
 <h1>Checksums</h1>

leveldb associates checksums with all data it stores in the file system. diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 87d388e..a94651f 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -86,16 +86,6 @@ struct Options { // Default: 1000 int max_open_files; - // Handle values larger than "large_value_threshold" bytes - // specially, by writing them into their own files (to avoid - // compaction overhead) and doing content-based elimination of - // duplicate values to save space. - // - // We recommend against changing this value. - // - // Default: 64K - size_t large_value_threshold; - // Control over blocks (user data is stored in a set of blocks, and // a block is the unit of reading from disk). @@ -110,7 +100,7 @@ struct Options { // compression is enabled. This parameter can be changed dynamically. // // Default: 4K - int block_size; + size_t block_size; // Number of keys between restart points for delta encoding of keys. // This parameter can be changed dynamically. Most clients should diff --git a/leveldb.gyp b/leveldb.gyp index d10ac33..20d1b1d 100644 --- a/leveldb.gyp +++ b/leveldb.gyp @@ -96,8 +96,6 @@ 'port/port_example.h', 'port/port_posix.cc', 'port/port_posix.h', - 'port/sha1_portable.cc', - 'port/sha1_portable.h', 'table/block.cc', 'table/block.h', 'table/block_builder.cc', @@ -267,16 +265,6 @@ 'db/log_test.cc', ], }, - { - 'target_name': 'leveldb_sha1_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'port/sha1_test.cc', - ], - }, { 'target_name': 'leveldb_skiplist_test', 'type': 'executable', diff --git a/port/port_android.h b/port/port_android.h index 8680951..13df9c9 100644 --- a/port/port_android.h +++ b/port/port_android.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -134,13 +133,6 @@ inline bool Snappy_Uncompress( return false; } -inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { - SHA1_CTX sha1_ctx; - SHA1Init(&sha1_ctx); - SHA1Update(&sha1_ctx, (const u_char*)data, len); - SHA1Final((u_char*)hash_array, &sha1_ctx); -} - inline uint64_t ThreadIdentifier() { pthread_t tid = pthread_self(); uint64_t r = 0; diff --git a/port/port_chromium.h b/port/port_chromium.h index e349f8f..1851e6e 100644 --- a/port/port_chromium.h +++ b/port/port_chromium.h @@ -13,7 +13,6 @@ #include "base/atomicops.h" #include "base/basictypes.h" #include "base/logging.h" -#include "base/sha1.h" #include "base/synchronization/condition_variable.h" #include "base/synchronization/lock.h" @@ -83,12 +82,6 @@ class AtomicPointer { } }; -inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { - return ::base::SHA1HashBytes(reinterpret_cast(data), - len, - reinterpret_cast(hash_array)); -} - bool Snappy_Compress(const char* input, size_t input_length, std::string* output); bool Snappy_Uncompress(const char* input_data, size_t input_length, diff --git a/port/port_example.h b/port/port_example.h index cf72617..8a624f3 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -89,11 +89,6 @@ class AtomicPointer { void NoBarrier_Store(void* v); }; -// ------------------ Checksumming ------------------- - -// Store a 160-bit hash of "data[0..len-1]" in "hash_array[0]..hash_array[19]" -extern void SHA1_Hash(const char* data, size_t len, char* hash_array); - // ------------------ Compression ------------------- // Store the snappy compression of "input[0,input_length-1]" in *output. 
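
The port layer keeps Snappy behind bool-returning hooks; stub ports simply return false, so callers probe support at runtime, as db_test.cc's SnappyCompressionSupported() does. A usage sketch (it assumes compilation inside the leveldb tree so that port/port.h resolves):

#include <cstring>
#include <string>
#include "port/port.h"  // assumption: building within the leveldb tree

// Runtime probe: a stubbed port returns false, a real one compresses
// the sample string and returns true.
static bool SnappyAvailable() {
  const char* sample = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
  std::string out;
  return leveldb::port::Snappy_Compress(sample, strlen(sample), &out);
}
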
diff --git a/port/port_posix.h b/port/port_posix.h index 7adbc01..c158db1 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -13,7 +13,6 @@ #include #include #include -#include "port/sha1_portable.h" namespace leveldb { namespace port { @@ -73,10 +72,6 @@ class AtomicPointer { } }; -inline void SHA1_Hash(const char* data, size_t len, char* hash_array) { - SHA1_Hash_Portable(data, len, hash_array); -} - // TODO(gabor): Implement actual compress inline bool Snappy_Compress(const char* input, size_t input_length, std::string* output) { diff --git a/table/block.cc b/table/block.cc index 0525d2d..92b2877 100644 --- a/table/block.cc +++ b/table/block.cc @@ -62,7 +62,9 @@ static inline const char* DecodeEntry(const char* p, const char* limit, if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL; } - if (limit - p < (*non_shared + *value_length)) return NULL; + if (static_cast(limit - p) < (*non_shared + *value_length)) { + return NULL; + } return p; } diff --git a/table/block_builder.cc b/table/block_builder.cc index ae18b36..dc958c8 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -62,7 +62,7 @@ size_t BlockBuilder::CurrentSizeEstimate() const { Slice BlockBuilder::Finish() { // Append restart array - for (int i = 0; i < restarts_.size(); i++) { + for (size_t i = 0; i < restarts_.size(); i++) { PutFixed32(&buffer_, restarts_[i]); } PutFixed32(&buffer_, restarts_.size()); diff --git a/table/format.cc b/table/format.cc index 8c6b0f3..63971db 100644 --- a/table/format.cc +++ b/table/format.cc @@ -36,7 +36,7 @@ void Footer::EncodeTo(std::string* dst) const { metaindex_handle_.EncodeTo(dst); index_handle_.EncodeTo(dst); dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding - PutFixed32(dst, static_cast(kTableMagicNumber)); + PutFixed32(dst, static_cast(kTableMagicNumber & 0xffffffffu)); PutFixed32(dst, static_cast(kTableMagicNumber >> 32)); assert(dst->size() == original_size + kEncodedLength); } @@ -71,7 +71,7 @@ Status ReadBlock(RandomAccessFile* file, // Read the block contents as well as the type/crc footer. // See table_builder.cc for the code that built this structure. 
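
The Footer::EncodeTo change above spells out what the old implicit narrowing did: the 64-bit table magic number is stored as two fixed32 halves, low word first. A sketch of the split, with the magic value left as a caller-supplied parameter:

#include <cstdint>
#include <string>

// Little-endian fixed32 append, as PutFixed32 in util/coding.
void PutFixed32Sketch(std::string* dst, uint32_t v) {
  for (int i = 0; i < 4; i++) {
    dst->push_back(static_cast<char>((v >> (8 * i)) & 0xff));
  }
}

// Write a 64-bit magic as two fixed32 halves, low half first, as
// Footer::EncodeTo now does explicitly.
void AppendMagic(std::string* dst, uint64_t magic) {
  PutFixed32Sketch(dst, static_cast<uint32_t>(magic & 0xffffffffu));
  PutFixed32Sketch(dst, static_cast<uint32_t>(magic >> 32));
}
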
- size_t n = handle.size(); + size_t n = static_cast(handle.size()); char* buf = new char[n + kBlockTrailerSize]; Slice contents; Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); diff --git a/util/arena.cc b/util/arena.cc index 4bf6e36..40ab99d 100644 --- a/util/arena.cc +++ b/util/arena.cc @@ -16,7 +16,7 @@ Arena::Arena() { } Arena::~Arena() { - for (int i = 0; i < blocks_.size(); i++) { + for (size_t i = 0; i < blocks_.size(); i++) { delete[] blocks_[i]; } } diff --git a/util/coding.cc b/util/coding.cc index 680e2ad..14f21f7 100644 --- a/util/coding.cc +++ b/util/coding.cc @@ -85,7 +85,7 @@ char* EncodeVarint64(char* dst, uint64_t v) { *(ptr++) = (v & (B-1)) | B; v >>= 7; } - *(ptr++) = v; + *(ptr++) = static_cast(v); return reinterpret_cast(ptr); } diff --git a/util/comparator.cc b/util/comparator.cc index e2b27e3..cc2b263 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -51,7 +51,7 @@ class BytewiseComparatorImpl : public Comparator { virtual void FindShortSuccessor(std::string* key) const { // Find first character that can be incremented size_t n = key->size(); - for (int i = 0; i < n; i++) { + for (size_t i = 0; i < n; i++) { const uint8_t byte = (*key)[i]; if (byte != static_cast(0xff)) { (*key)[i] = byte + 1; diff --git a/util/logging.cc b/util/logging.cc index 5c9bd4a..760d335 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -20,7 +20,7 @@ void AppendNumberTo(std::string* str, uint64_t num) { } void AppendEscapedStringTo(std::string* str, const Slice& value) { - for (int i = 0; i < value.size(); i++) { + for (size_t i = 0; i < value.size(); i++) { char c = value[i]; if (c >= ' ' && c <= '~') { str->push_back(c); diff --git a/util/options.cc b/util/options.cc index 29272fe..0ea5c98 100644 --- a/util/options.cc +++ b/util/options.cc @@ -18,7 +18,6 @@ Options::Options() info_log(NULL), write_buffer_size(4<<20), max_open_files(1000), - large_value_threshold(65536), block_cache(NULL), block_size(4096), block_restart_interval(16), diff --git a/util/random.h b/util/random.h index 2d458e8..d886b4e 100644 --- a/util/random.h +++ b/util/random.h @@ -29,7 +29,7 @@ class Random { uint64_t product = seed_ * A; // Compute (product % M) using the fact that ((x << 31) % M) == x. - seed_ = (product >> 31) + (product & M); + seed_ = static_cast((product >> 31) + (product & M)); // The first reduction may overflow by 1 bit, so we may need to // repeat. mod == M is not possible; using > allows the faster // sign-bit-based test. 
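
The util/random.h cast lands inside a Lehmer-style generator that avoids a division: with M = 2^31 - 1 we have 2^31 ≡ 1 (mod M), so product % M is just the sum of the product's high and low 31-bit halves, corrected by at most one subtraction. A standalone sketch of the reduction:

#include <cstdint>

// One step of the generator in util/random.h: seed = seed * A % M,
// computed without a modulo because ((x << 31) % M) == x.
uint32_t NextSeed(uint32_t seed) {
  static const uint32_t M = 2147483647L;  // 2^31 - 1
  static const uint64_t A = 16807;        // bits 14, 8, 7, 5, 2, 1, 0
  uint64_t product = seed * A;            // fits in 46 bits
  seed = static_cast<uint32_t>((product >> 31) + (product & M));
  // The first reduction may overflow by one bit; mod == M is not
  // possible, so a single conditional subtraction finishes the job.
  if (seed > M) {
    seed -= M;
  }
  return seed;
}
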
-- cgit v1.2.3 From e50d37879a15b8bbda1c962038a6f0569f9a980a Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Wed, 20 Apr 2011 22:50:04 +0000 Subject: fix build on at least linux git-svn-id: http://leveldb.googlecode.com/svn/trunk@25 62dab493-f737-651d-591e-8d6aee1b9529 --- table/block.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/block.cc b/table/block.cc index 92b2877..c20bb38 100644 --- a/table/block.cc +++ b/table/block.cc @@ -62,7 +62,7 @@ static inline const char* DecodeEntry(const char* p, const char* limit, if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL; } - if (static_cast(limit - p) < (*non_shared + *value_length)) { + if (static_cast(limit - p) < (*non_shared + *value_length)) { return NULL; } return p; -- cgit v1.2.3 From e16e59a48dea1925299763c2707981be02d92d6d Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Thu, 21 Apr 2011 01:54:51 +0000 Subject: pull in hans' mac build fix git-svn-id: http://leveldb.googlecode.com/svn/trunk@26 62dab493-f737-651d-591e-8d6aee1b9529 --- util/env_chromium.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/util/env_chromium.cc b/util/env_chromium.cc index 7edc7a9..fd3a4c7 100644 --- a/util/env_chromium.cc +++ b/util/env_chromium.cc @@ -31,7 +31,8 @@ #if defined(OS_MACOSX) || defined(OS_WIN) // The following are glibc-specific -extern "C" { +namespace { + size_t fread_unlocked(void *ptr, size_t size, size_t n, FILE *file) { return fread(ptr, size, n, file); } @@ -51,6 +52,7 @@ int fdatasync(int fildes) { return fsync(fildes); #endif } + } #endif -- cgit v1.2.3 From 2b55d44e07d10d09da7b0c5ad67f252ac263764f Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Tue, 3 May 2011 03:10:59 +0000 Subject: make windows include /Iport\win in dependent projects git-svn-id: http://leveldb.googlecode.com/svn/trunk@27 62dab493-f737-651d-591e-8d6aee1b9529 --- leveldb.gyp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/leveldb.gyp b/leveldb.gyp index 20d1b1d..8376592 100644 --- a/leveldb.gyp +++ b/leveldb.gyp @@ -47,6 +47,13 @@ 'include_dirs': [ 'include/', ], + 'conditions': [ + ['OS == "win"', { + 'include_dirs': [ + 'port/win', + ], + }], + ], }, 'sources': [ # Include and then exclude so that all files show up in IDEs, even if -- cgit v1.2.3 From a5b4129c0a8c01158cde2244a5811f15b9d45ec0 Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Sat, 21 May 2011 02:17:43 +0000 Subject: sync with upstream @ 21409451 Check the NEWS file for details of what changed. 
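
The env_chromium.cc fix in the commit above swaps extern "C" for an anonymous namespace, giving the Mac/Windows fallback shims internal linkage so they cannot clash with the C library's own declarations. A sketch of the pattern, mirroring the two shims in the diff:

#include <cstdio>
#include <unistd.h>

namespace {  // internal linkage: never collides with libc symbols

// Fallback for platforms without the glibc _unlocked variants.
size_t fread_unlocked(void* ptr, size_t size, size_t n, FILE* file) {
  return fread(ptr, size, n, file);
}

// Fallback for platforms (e.g. macOS) that lack fdatasync: fall back
// to the stronger fsync.
int fdatasync(int fildes) {
  return fsync(fildes);
}

}  // namespace
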
git-svn-id: http://leveldb.googlecode.com/svn/trunk@28 62dab493-f737-651d-591e-8d6aee1b9529 --- NEWS | 17 +++ db/db_bench.cc | 66 +++++++-- db/db_impl.cc | 64 ++++++--- db/db_test.cc | 109 +++++++++++--- db/dbformat.h | 10 ++ db/log_reader.cc | 116 ++++++++++++--- db/log_reader.h | 37 ++++- db/log_test.cc | 143 ++++++++++++++++++- db/log_writer.cc | 3 +- db/memtable.cc | 14 +- db/memtable.h | 18 ++- db/repair.cc | 12 +- db/snapshot.h | 22 +-- db/version_set.cc | 309 ++++++++++++++++++++++------------------ db/version_set.h | 38 +++-- db/write_batch.cc | 136 ++++++++---------- db/write_batch_internal.h | 24 ---- db/write_batch_test.cc | 8 +- doc/impl.html | 26 ++-- doc/index.html | 16 ++- include/leveldb/comparator.h | 4 +- include/leveldb/db.h | 20 ++- include/leveldb/env.h | 12 ++ include/leveldb/iterator.h | 5 + include/leveldb/slice.h | 5 + include/leveldb/status.h | 36 +++-- include/leveldb/table.h | 3 +- include/leveldb/table_builder.h | 5 + include/leveldb/write_batch.h | 15 ++ table/block_builder.cc | 2 +- table/table_test.cc | 14 +- util/env_chromium.cc | 7 + util/env_posix.cc | 7 + util/status.cc | 36 +++-- 34 files changed, 953 insertions(+), 406 deletions(-) create mode 100644 NEWS diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..3fd9924 --- /dev/null +++ b/NEWS @@ -0,0 +1,17 @@ +Release 1.2 2011-05-16 +---------------------- + +Fixes for larger databases (tested up to one billion 100-byte entries, +i.e., ~100GB). + +(1) Place hard limit on number of level-0 files. This fixes errors +of the form "too many open files". + +(2) Fixed memtable management. Before the fix, a heavy write burst +could cause unbounded memory usage. + +A fix for a logging bug where the reader would incorrectly complain +about corruption. + +Allow public access to WriteBatch contents so that users can easily +wrap a DB. diff --git a/db/db_bench.cc b/db/db_bench.cc index d1cbdc0..b5fd679 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -24,9 +24,10 @@ // overwrite -- overwrite N values in random key order in async mode // fillsync -- write N/100 values in random key order in sync mode // fill100K -- write N/1000 100K values in random order in async mode -// readseq -- read N values sequentially -// readreverse -- read N values in reverse order -// readrandom -- read N values in random order +// readseq -- read N times sequentially +// readreverse -- read N times in reverse order +// readrandom -- read N times in random order +// readhot -- read N times in random order from 1% section of DB // crc32c -- repeated crc32c of 4K of data // Meta operations: // compact -- Compact the entire DB @@ -54,6 +55,9 @@ static const char* FLAGS_benchmarks = // Number of key/values to place in database static int FLAGS_num = 1000000; +// Number of read operations to do. If negative, do FLAGS_num reads. +static int FLAGS_reads = -1; + // Size of each value static int FLAGS_value_size = 100; @@ -72,6 +76,14 @@ static int FLAGS_write_buffer_size = 0; // Negative means use default settings. static int FLAGS_cache_size = -1; +// Maximum number of files to keep open at the same time (use default if == 0) +static int FLAGS_open_files = 0; + +// If true, do not destroy the existing database. If you set this +// flag and also specify a benchmark that wants a fresh database, that +// benchmark will fail. +static bool FLAGS_use_existing_db = false; + namespace leveldb { // Helper for quickly generating random data. 
@@ -126,6 +138,7 @@ class Benchmark { Cache* cache_; DB* db_; int num_; + int reads_; int heap_counter_; double start_; double last_op_finish_; @@ -298,6 +311,7 @@ class Benchmark { : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), db_(NULL), num_(FLAGS_num), + reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads), heap_counter_(0), bytes_(0), rand_(301) { @@ -308,7 +322,9 @@ class Benchmark { Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]); } } - DestroyDB("/tmp/dbbench", Options()); + if (!FLAGS_use_existing_db) { + DestroyDB("/tmp/dbbench", Options()); + } } ~Benchmark() { @@ -355,11 +371,13 @@ class Benchmark { ReadReverse(); } else if (name == Slice("readrandom")) { ReadRandom(); + } else if (name == Slice("readhot")) { + ReadHot(); } else if (name == Slice("readrandomsmall")) { - int n = num_; - num_ /= 1000; + int n = reads_; + reads_ /= 1000; ReadRandom(); - num_ = n; + reads_ = n; } else if (name == Slice("compact")) { Compact(); } else if (name == Slice("crc32c")) { @@ -449,7 +467,7 @@ class Benchmark { void Open() { assert(db_ == NULL); Options options; - options.create_if_missing = true; + options.create_if_missing = !FLAGS_use_existing_db; options.block_cache = cache_; options.write_buffer_size = FLAGS_write_buffer_size; Status s = DB::Open(options, "/tmp/dbbench", &db_); @@ -462,6 +480,10 @@ class Benchmark { void Write(const WriteOptions& options, Order order, DBState state, int num_entries, int value_size, int entries_per_batch) { if (state == FRESH) { + if (FLAGS_use_existing_db) { + message_ = "skipping (--use_existing_db is true)"; + return; + } delete db_; db_ = NULL; DestroyDB("/tmp/dbbench", Options()); @@ -499,7 +521,7 @@ class Benchmark { void ReadSequential() { Iterator* iter = db_->NewIterator(ReadOptions()); int i = 0; - for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { + for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { bytes_ += iter->key().size() + iter->value().size(); FinishedSingleOp(); ++i; } @@ -510,7 +532,7 @@ class Benchmark { void ReadReverse() { Iterator* iter = db_->NewIterator(ReadOptions()); int i = 0; - for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { + for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { bytes_ += iter->key().size() + iter->value().size(); FinishedSingleOp(); ++i; } @@ -521,7 +543,7 @@ class Benchmark { void ReadRandom() { ReadOptions options; std::string value; - for (int i = 0; i < num_; i++) { + for (int i = 0; i < reads_; i++) { char key[100]; const int k = rand_.Next() % FLAGS_num; snprintf(key, sizeof(key), "%016d", k); @@ -530,6 +552,19 @@ class Benchmark { } } + void ReadHot() { + ReadOptions options; + std::string value; + const int range = (FLAGS_num + 99) / 100; + for (int i = 0; i < reads_; i++) { + char key[100]; + const int k = rand_.Next() % range; + snprintf(key, sizeof(key), "%016d", k); + db_->Get(options, key, &value); + FinishedSingleOp(); + } + } + void Compact() { DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); dbi->TEST_CompactMemTable(); @@ -582,6 +617,8 @@ class Benchmark { int main(int argc, char** argv) { FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; + FLAGS_open_files = leveldb::Options().max_open_files; + for (int i = 1; i < argc; i++) { double d; int n; @@ -593,14 +630,21 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_histogram = n; + } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 && + (n == 0
|| n == 1)) { + FLAGS_use_existing_db = n; } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { FLAGS_num = n; + } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) { + FLAGS_reads = n; } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { FLAGS_value_size = n; } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { FLAGS_write_buffer_size = n; } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { FLAGS_cache_size = n; + } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) { + FLAGS_open_files = n; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/db/db_impl.cc b/db/db_impl.cc index 3b9e04e..baf9299 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -126,6 +126,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) log_(NULL), bg_compaction_scheduled_(false), compacting_(false) { + mem_->Ref(); has_imm_.Release_Store(NULL); // Reserve ten files or so for other uses and give the rest to TableCache. @@ -152,8 +153,8 @@ DBImpl::~DBImpl() { } delete versions_; - delete mem_; - delete imm_; + if (mem_ != NULL) mem_->Unref(); + if (imm_ != NULL) imm_->Unref(); delete log_; delete logfile_; delete table_cache_; @@ -344,7 +345,8 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, // paranoid_checks==false so that corruptions cause entire commits // to be skipped instead of propagating bad information (like overly // large sequence numbers). - log::Reader reader(file, &reporter, true/*checksum*/); + log::Reader reader(file, &reporter, true/*checksum*/, + 0/*initial_offset*/); Log(env_, options_.info_log, "Recovering log #%llu", (unsigned long long) log_number); @@ -364,6 +366,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, if (mem == NULL) { mem = new MemTable(internal_comparator_); + mem->Ref(); } status = WriteBatchInternal::InsertInto(&batch, mem); MaybeIgnoreError(&status); @@ -384,7 +387,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, // file-systems cause the DB::Open() to fail. break; } - delete mem; + mem->Unref(); mem = NULL; } } @@ -395,7 +398,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, // file-systems cause the DB::Open() to fail. 
} - delete mem; + if (mem != NULL) mem->Unref(); delete file; return status; } @@ -443,11 +446,12 @@ Status DBImpl::CompactMemTable() { // Replace immutable memtable with the generated Table if (s.ok()) { edit.SetPrevLogNumber(0); - s = versions_->LogAndApply(&edit, imm_); + s = versions_->LogAndApply(&edit); } if (s.ok()) { // Commit to the new state + imm_->Unref(); imm_ = NULL; has_imm_.Release_Store(NULL); DeleteObsoleteFiles(); @@ -556,7 +560,7 @@ void DBImpl::BackgroundCompaction() { c->edit()->DeleteFile(c->level(), f->number); c->edit()->AddFile(c->level() + 1, f->number, f->file_size, f->smallest, f->largest); - status = versions_->LogAndApply(c->edit(), NULL); + status = versions_->LogAndApply(c->edit()); Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n", static_cast<unsigned long long>(f->number), c->level() + 1, @@ -697,7 +701,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { } compact->outputs.clear(); - Status s = versions_->LogAndApply(compact->compaction->edit(), NULL); + Status s = versions_->LogAndApply(compact->compaction->edit()); if (s.ok()) { compact->compaction->ReleaseInputs(); DeleteObsoleteFiles(); @@ -754,9 +758,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } Slice key = input->key(); - InternalKey tmp_internal_key; - tmp_internal_key.DecodeFrom(key); - if (compact->compaction->ShouldStopBefore(tmp_internal_key) && + if (compact->compaction->ShouldStopBefore(key) && compact->builder != NULL) { status = FinishCompactionOutputFile(compact, input); if (!status.ok()) { @@ -867,6 +869,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } compacting_ = false; compacting_cv_.SignalAll(); + VersionSet::LevelSummaryStorage tmp; + Log(env_, options_.info_log, + "compacted to: %s", versions_->LevelSummary(&tmp)); return status; } @@ -925,10 +930,11 @@ Status DBImpl::Get(const ReadOptions& options, Iterator* DBImpl::NewIterator(const ReadOptions& options) { SequenceNumber latest_snapshot; Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); - SequenceNumber sequence = - (options.snapshot ? options.snapshot->number_ : latest_snapshot); - return NewDBIterator(&dbname_, env_, - user_comparator(), internal_iter, sequence); + return NewDBIterator( + &dbname_, env_, user_comparator(), internal_iter, + (options.snapshot != NULL + ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_ + : latest_snapshot)); } void DBImpl::Unref(void* arg1, void* arg2) { @@ -945,7 +951,7 @@ const Snapshot* DBImpl::GetSnapshot() { void DBImpl::ReleaseSnapshot(const Snapshot* s) { MutexLock l(&mutex_); - snapshots_.Delete(s); + snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s)); } // Convenience methods @@ -985,12 +991,26 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { Status DBImpl::MakeRoomForWrite(bool force) { mutex_.AssertHeld(); + bool allow_delay = !force; Status s; while (true) { if (!bg_error_.ok()) { // Yield previous error s = bg_error_; break; } else if ( + allow_delay && + versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) { + // We are getting close to hitting a hard limit on the number of + // L0 files. Rather than delaying a single write by several + // seconds when we hit the hard limit, start delaying each + // individual write by 1ms to reduce latency variance. Also, + // this delay hands over some CPU to the compaction thread in + // case it is sharing the same core as the writer.
+ mutex_.Unlock(); + env_->SleepForMicroseconds(1000); + allow_delay = false; // Do not delay a single write more than once + mutex_.Lock(); } else if (!force && (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { // There is room in current memtable @@ -999,6 +1019,9 @@ Status DBImpl::MakeRoomForWrite(bool force) { // We have filled up the current memtable, but the previous // one is still being compacted, so we wait. compacting_cv_.Wait(); + } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) { + // There are too many level-0 files. + compacting_cv_.Wait(); } else { // Attempt to switch to a new memtable and trigger compaction of old assert(versions_->PrevLogNumber() == 0); @@ -1011,7 +1034,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { VersionEdit edit; edit.SetPrevLogNumber(versions_->LogNumber()); edit.SetLogNumber(new_log_number); - s = versions_->LogAndApply(&edit, NULL); + s = versions_->LogAndApply(&edit); if (!s.ok()) { delete lfile; env_->DeleteFile(LogFileName(dbname_, new_log_number)); @@ -1024,6 +1047,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { imm_ = mem_; has_imm_.Release_Store(imm_); mem_ = new MemTable(internal_comparator_); + mem_->Ref(); force = false; // Do not force another compaction if have room MaybeScheduleCompaction(); } @@ -1141,10 +1165,11 @@ Status DB::Open(const Options& options, const std::string& dbname, edit.SetLogNumber(new_log_number); impl->logfile_ = lfile; impl->log_ = new log::Writer(lfile); - s = impl->versions_->LogAndApply(&edit, NULL); + s = impl->versions_->LogAndApply(&edit); } if (s.ok()) { impl->DeleteObsoleteFiles(); + impl->MaybeScheduleCompaction(); } } impl->mutex_.Unlock(); @@ -1156,6 +1181,9 @@ Status DB::Open(const Options& options, const std::string& dbname, return s; } +Snapshot::~Snapshot() { +} + Status DestroyDB(const std::string& dbname, const Options& options) { Env* env = options.env; std::vector<std::string> filenames; diff --git a/db/db_test.cc b/db/db_test.cc index f828e3d..06565b2 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3,7 +3,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/db.h" - #include "db/db_impl.h" #include "db/filename.h" #include "db/version_set.h" @@ -802,8 +801,17 @@ TEST(DBTest, DBOpen_Options) { db = NULL; } +namespace { +typedef std::map KVMap; +} + class ModelDB: public DB { public: + class ModelSnapshot : public Snapshot { + public: + KVMap map_; + }; + explicit ModelDB(const Options& options): options_(options) { } ~ModelDB() { } virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { @@ -824,35 +832,34 @@ class ModelDB: public DB { return new ModelIter(saved, true); } else { const KVMap* snapshot_state = - reinterpret_cast(options.snapshot->number_); + &(reinterpret_cast(options.snapshot)->map_); return new ModelIter(snapshot_state, false); } } virtual const Snapshot* GetSnapshot() { - KVMap* saved = new KVMap; - *saved = map_; - return snapshots_.New( - reinterpret_cast(saved)); + ModelSnapshot* snapshot = new ModelSnapshot; + snapshot->map_ = map_; + return snapshot; } virtual void ReleaseSnapshot(const Snapshot* snapshot) { - const KVMap* saved = reinterpret_cast(snapshot->number_); - delete saved; - snapshots_.Delete(snapshot); + delete reinterpret_cast(snapshot); } virtual Status Write(const WriteOptions& options, WriteBatch* batch) { assert(options.post_write_snapshot == NULL); // Not supported - for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeValue: - map_[it.key().ToString()] = it.value().ToString(); - break; - case kTypeDeletion: - map_.erase(it.key().ToString()); - break; + class Handler : public WriteBatch::Handler { + public: + KVMap* map_; + virtual void Put(const Slice& key, const Slice& value) { + (*map_)[key.ToString()] = value.ToString(); } - } - return Status::OK(); + virtual void Delete(const Slice& key) { + map_->erase(key.ToString()); + } + }; + Handler handler; + handler.map_ = &map_; + return batch->Iterate(&handler); } virtual bool GetProperty(const Slice& property, std::string* value) { @@ -864,7 +871,6 @@ class ModelDB: public DB { } } private: - typedef std::map KVMap; class ModelIter: public Iterator { public: ModelIter(const KVMap* map, bool owned) @@ -897,7 +903,6 @@ class ModelDB: public DB { }; const Options options_; KVMap map_; - SnapshotList snapshots_; }; static std::string RandomKey(Random* rnd) { @@ -1023,8 +1028,70 @@ TEST(DBTest, Randomized) { if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); } +std::string MakeKey(unsigned int num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%016u", num); + return std::string(buf); +} + +void BM_LogAndApply(int iters, int num_base_files) { + std::string dbname = test::TmpDir() + "/leveldb_test_benchmark"; + DestroyDB(dbname, Options()); + + DB* db = NULL; + Options opts; + opts.create_if_missing = true; + Status s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != NULL); + + delete db; + db = NULL; + + Env* env = Env::Default(); + + InternalKeyComparator cmp(BytewiseComparator()); + Options options; + VersionSet vset(dbname, &options, NULL, &cmp); + ASSERT_OK(vset.Recover()); + VersionEdit vbase; + uint64_t fnum = 1; + for (int i = 0; i < num_base_files; i++) { + InternalKey start(MakeKey(2*fnum), 1, kTypeValue); + InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); + vbase.AddFile(2, fnum++, 1 /* file size */, start, limit); + } + ASSERT_OK(vset.LogAndApply(&vbase)); + + uint64_t start_micros = env->NowMicros(); + + for (int i = 0; i < iters; i++) { + VersionEdit vedit; + vedit.DeleteFile(2, fnum); + InternalKey start(MakeKey(2*fnum), 1, 
kTypeValue); + InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); + vedit.AddFile(2, fnum++, 1 /* file size */, start, limit); + vset.LogAndApply(&vedit); + } + uint64_t stop_micros = env->NowMicros(); + unsigned int us = stop_micros - start_micros; + char buf[16]; + snprintf(buf, sizeof(buf), "%d", num_base_files); + fprintf(stderr, + "BM_LogAndApply/%-6s %8d iters : %9u us (%7.0f us / iter)\n", + buf, iters, us, ((float)us) / iters); +} + } int main(int argc, char** argv) { + if (argc > 1 && std::string(argv[1]) == "--benchmark") { + leveldb::BM_LogAndApply(1000, 1); + leveldb::BM_LogAndApply(1000, 100); + leveldb::BM_LogAndApply(1000, 10000); + leveldb::BM_LogAndApply(100, 100000); + return 0; + } + return leveldb::test::RunAllTests(); } diff --git a/db/dbformat.h b/db/dbformat.h index d583665..89c4afb 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -19,6 +19,16 @@ namespace leveldb { // parameters set via options. namespace config { static const int kNumLevels = 7; + +// Level-0 compaction is started when we hit this many files. +static const int kL0_CompactionTrigger = 4; + +// Soft limit on number of level-0 files. We slow down writes at this point. +static const int kL0_SlowdownWritesTrigger = 8; + +// Maximum number of level-0 files. We stop writes at this point. +static const int kL0_StopWritesTrigger = 12; + } class InternalKey; diff --git a/db/log_reader.cc b/db/log_reader.cc index 75e1d28..8721071 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -4,7 +4,6 @@ #include "db/log_reader.h" -#include <stdio.h> #include "leveldb/env.h" #include "util/coding.h" #include "util/crc32c.h" @@ -15,46 +14,104 @@ namespace log { Reader::Reporter::~Reporter() { } -Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum) +Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum, + uint64_t initial_offset) : file_(file), reporter_(reporter), checksum_(checksum), backing_store_(new char[kBlockSize]), buffer_(), - eof_(false) { + eof_(false), + last_record_offset_(0), + end_of_buffer_offset_(0), + initial_offset_(initial_offset) { } Reader::~Reader() { delete[] backing_store_; } +bool Reader::SkipToInitialBlock() { + size_t offset_in_block = initial_offset_ % kBlockSize; + uint64_t block_start_location = initial_offset_ - offset_in_block; + + // Don't search a block if we'd be in the trailer + if (offset_in_block > kBlockSize - 6) { + offset_in_block = 0; + block_start_location += kBlockSize; + } + + end_of_buffer_offset_ = block_start_location; + + // Skip to start of first block that can contain the initial record + if (block_start_location > 0) { + Status skip_status = file_->Skip(block_start_location); + if (!skip_status.ok()) { + ReportDrop(block_start_location, skip_status); + return false; + } + } + + return true; +} + bool Reader::ReadRecord(Slice* record, std::string* scratch) { + if (last_record_offset_ < initial_offset_) { + if (!SkipToInitialBlock()) { + return false; + } + } + scratch->clear(); record->clear(); bool in_fragmented_record = false; + // Record offset of the logical record that we're reading + // 0 is a dummy value to make compilers happy + uint64_t prospective_record_offset = 0; Slice fragment; while (true) { + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); switch (ReadPhysicalRecord(&fragment)) { case kFullType: if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the
tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. + if (scratch->empty()) { + in_fragmented_record = false; + } else { + ReportCorruption(scratch->size(), "partial record without end(1)"); + } } + prospective_record_offset = physical_record_offset; scratch->clear(); *record = fragment; + last_record_offset_ = prospective_record_offset; return true; case kFirstType: if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. + if (scratch->empty()) { + in_fragmented_record = false; + } else { + ReportCorruption(scratch->size(), "partial record without end(2)"); + } } + prospective_record_offset = physical_record_offset; scratch->assign(fragment.data(), fragment.size()); in_fragmented_record = true; break; case kMiddleType: if (!in_fragmented_record) { - ReportDrop(fragment.size(), "missing start of fragmented record"); + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); } else { scratch->append(fragment.data(), fragment.size()); } @@ -62,31 +119,33 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) { case kLastType: if (!in_fragmented_record) { - ReportDrop(fragment.size(), "missing start of fragmented record"); + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); } else { scratch->append(fragment.data(), fragment.size()); *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; return true; } break; case kEof: if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); + ReportCorruption(scratch->size(), "partial record without end(3)"); scratch->clear(); } return false; case kBadRecord: if (in_fragmented_record) { - ReportDrop(scratch->size(), "error in middle of record"); + ReportCorruption(scratch->size(), "error in middle of record"); in_fragmented_record = false; scratch->clear(); } break; default: - ReportDrop( + ReportCorruption( (fragment.size() + (in_fragmented_record ? 
scratch->size() : 0)), "unknown record type"); in_fragmented_record = false; @@ -97,9 +156,18 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) { return false; } -void Reader::ReportDrop(size_t bytes, const char* reason) { - if (reporter_ != NULL) { - reporter_->Corruption(bytes, Status::Corruption(reason)); +uint64_t Reader::LastRecordOffset() { + return last_record_offset_; +} + +void Reader::ReportCorruption(size_t bytes, const char* reason) { + ReportDrop(bytes, Status::Corruption(reason)); +} + +void Reader::ReportDrop(size_t bytes, const Status& reason) { + if (reporter_ != NULL && + end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) { + reporter_->Corruption(bytes, reason); } } @@ -110,11 +178,10 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { // Last read was a full read, so this is a trailer to skip buffer_.clear(); Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + end_of_buffer_offset_ += buffer_.size(); if (!status.ok()) { - if (reporter_ != NULL) { - reporter_->Corruption(kBlockSize, status); - } buffer_.clear(); + ReportDrop(kBlockSize, status); eof_ = true; return kEof; } else if (buffer_.size() < kBlockSize) { @@ -125,8 +192,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { // End of file return kEof; } else { - ReportDrop(buffer_.size(), "truncated record at end of file"); + size_t drop_size = buffer_.size(); buffer_.clear(); + ReportCorruption(drop_size, "truncated record at end of file"); return kEof; } } @@ -138,8 +206,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { const unsigned int type = header[6]; const uint32_t length = a | (b << 8); if (kHeaderSize + length > buffer_.size()) { - ReportDrop(buffer_.size(), "bad record length"); + size_t drop_size = buffer_.size(); buffer_.clear(); + ReportCorruption(drop_size, "bad record length"); return kBadRecord; } @@ -160,13 +229,22 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { // been corrupted and if we trust it, we could find some // fragment of a real log record that just happens to look // like a valid log record. - ReportDrop(buffer_.size(), "checksum mismatch"); + size_t drop_size = buffer_.size(); buffer_.clear(); + ReportCorruption(drop_size, "checksum mismatch"); return kBadRecord; } } buffer_.remove_prefix(kHeaderSize + length); + + // Skip physical record that started before initial_offset_ + if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length < + initial_offset_) { + result->clear(); + return kBadRecord; + } + *result = Slice(header + kHeaderSize, length); return type; } diff --git a/db/log_reader.h b/db/log_reader.h index baf1475..61cc414 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -5,6 +5,8 @@ #ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ #define STORAGE_LEVELDB_DB_LOG_READER_H_ +#include <stdint.h> + #include "db/log_format.h" #include "leveldb/slice.h" #include "leveldb/status.h" @@ -35,7 +37,11 @@ class Reader { // live while this Reader is in use. // // If "checksum" is true, verify checksums if available. - Reader(SequentialFile* file, Reporter* reporter, bool checksum); + // + // The Reader will start reading at the first record located at physical + // position >= initial_offset within the file. + Reader(SequentialFile* file, Reporter* reporter, bool checksum, + uint64_t initial_offset); ~Reader(); @@ -46,6 +52,11 @@ class Reader { // reader or the next mutation to *scratch.
bool ReadRecord(Slice* record, std::string* scratch); + // Returns the physical offset of the last record returned by ReadRecord. + // + // Undefined before the first call to ReadRecord. + uint64_t LastRecordOffset(); + private: SequentialFile* const file_; Reporter* const reporter_; @@ -54,15 +65,37 @@ class Reader { Slice buffer_; bool eof_; // Last Read() indicated EOF by returning < kBlockSize + // Offset of the last record returned by ReadRecord. + uint64_t last_record_offset_; + // Offset of the first location past the end of buffer_. + uint64_t end_of_buffer_offset_; + + // Offset at which to start looking for the first record to return + uint64_t const initial_offset_; + // Extend record types with the following special values enum { kEof = kMaxRecordType + 1, + // Returned whenever we find an invalid physical record. + // Currently there are three situations in which this happens: + // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) + // * The record is a 0-length record (No drop is reported) + // * The record is below constructor's initial_offset (No drop is reported) kBadRecord = kMaxRecordType + 2 }; + // Skips all blocks that are completely before "initial_offset_". + // + // Returns true on success. Handles reporting. + bool SkipToInitialBlock(); + // Return type, or one of the preceding special values unsigned int ReadPhysicalRecord(Slice* result); - void ReportDrop(size_t bytes, const char* reason); + + // Reports dropped bytes to the reporter. + // buffer_ must be updated to remove the dropped bytes prior to invocation. + void ReportCorruption(size_t bytes, const char* reason); + void ReportDrop(size_t bytes, const Status& reason); // No copying allowed Reader(const Reader&); diff --git a/db/log_test.cc b/db/log_test.cc index 025a5ff..040bdff 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -60,7 +60,6 @@ class LogTest { virtual Status Read(size_t n, Slice* result, char* scratch) { ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; - ASSERT_EQ(kBlockSize, n); if (force_error_) { force_error_ = false; @@ -76,6 +75,17 @@ class LogTest { contents_.remove_prefix(n); return Status::OK(); } + + virtual Status Skip(size_t n) { + if (n > contents_.size()) { + contents_.clear(); + return Status::NotFound("in-memory file skipped past end"); + } + + contents_.remove_prefix(n); + + return Status::OK(); + } }; class ReportCollector : public Reader::Reporter { @@ -97,10 +107,15 @@ class LogTest { Writer writer_; Reader reader_; + // Record metadata for testing initial offset functionality + static size_t initial_offset_record_sizes_[]; + static uint64_t initial_offset_last_record_offsets_[]; + public: LogTest() : reading_(false), writer_(&dest_), - reader_(&source_, &report_, true/*checksum*/) { + reader_(&source_, &report_, true/*checksum*/, + 0/*initial_offset*/) { } void Write(const std::string& msg) { @@ -153,6 +168,10 @@ class LogTest { return report_.dropped_bytes_; } + std::string ReportMessage() const { + return report_.message_; + } + // Returns OK iff recorded error message contains "msg" std::string MatchError(const std::string& msg) const { if (report_.message_.find(msg) == std::string::npos) { @@ -161,8 +180,61 @@ class LogTest { return "OK"; } } + + void WriteInitialOffsetLog() { + for (int i = 0; i < 4; i++) { + std::string record(initial_offset_record_sizes_[i], + static_cast<char>('a' + i)); + Write(record); + } + } + + void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { + WriteInitialOffsetLog(); + reading_ = true; +
source_.contents_ = Slice(dest_.contents_); + Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/, + WrittenBytes() + offset_past_end); + Slice record; + std::string scratch; + ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); + delete offset_reader; + } + + void CheckInitialOffsetRecord(uint64_t initial_offset, + int expected_record_offset) { + WriteInitialOffsetLog(); + reading_ = true; + source_.contents_ = Slice(dest_.contents_); + Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/, + initial_offset); + Slice record; + std::string scratch; + ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); + ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], + record.size()); + ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], + offset_reader->LastRecordOffset()); + ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); + delete offset_reader; + } + }; +size_t LogTest::initial_offset_record_sizes_[] = + {10000, // Two sizable records in first block + 10000, + 2 * log::kBlockSize - 1000, // Span three blocks + 1}; + +uint64_t LogTest::initial_offset_last_record_offsets_[] = + {0, + kHeaderSize + 10000, + 2 * (kHeaderSize + 10000), + 2 * (kHeaderSize + 10000) + + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; + + TEST(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } @@ -213,6 +285,19 @@ TEST(LogTest, MarginalTrailer) { ASSERT_EQ("EOF", Read()); } +TEST(LogTest, MarginalTrailer2) { + // Make a trailer that is exactly the same length as an empty record. + const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes()); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + TEST(LogTest, ShortTrailer) { const int n = kBlockSize - 2*kHeaderSize + 4; Write(BigString("foo", n)); @@ -353,6 +438,60 @@ TEST(LogTest, ErrorJoinsRecords) { ASSERT_GE(dropped, 2*kBlockSize); } +TEST(LogTest, ReadStart) { + CheckInitialOffsetRecord(0, 0); +} + +TEST(LogTest, ReadSecondOneOff) { + CheckInitialOffsetRecord(1, 1); +} + +TEST(LogTest, ReadSecondTenThousand) { + CheckInitialOffsetRecord(10000, 1); +} + +TEST(LogTest, ReadSecondStart) { + CheckInitialOffsetRecord(10007, 1); +} + +TEST(LogTest, ReadThirdOneOff) { + CheckInitialOffsetRecord(10008, 2); +} + +TEST(LogTest, ReadThirdStart) { + CheckInitialOffsetRecord(20014, 2); +} + +TEST(LogTest, ReadFourthOneOff) { + CheckInitialOffsetRecord(20015, 3); +} + +TEST(LogTest, ReadFourthFirstBlockTrailer) { + CheckInitialOffsetRecord(log::kBlockSize - 4, 3); +} + +TEST(LogTest, ReadFourthMiddleBlock) { + CheckInitialOffsetRecord(log::kBlockSize + 1, 3); +} + +TEST(LogTest, ReadFourthLastBlock) { + CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3); +} + +TEST(LogTest, ReadFourthStart) { + CheckInitialOffsetRecord( + 2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize, + 3); +} + +TEST(LogTest, ReadEnd) { + CheckOffsetPastEndReturnsNoRecords(0); +} + +TEST(LogTest, ReadPastEnd) { + CheckOffsetPastEndReturnsNoRecords(5); +} + } } diff --git a/db/log_writer.cc b/db/log_writer.cc index 1696851..0887f6c 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -32,6 +32,7 @@ Status Writer::AddRecord(const Slice& slice) { // is empty, we still want to iterate once to emit a single // zero-length record Status s; + bool begin = true; do { const int leftover = kBlockSize - 
block_offset_; assert(leftover >= 0); @@ -52,7 +53,6 @@ Status Writer::AddRecord(const Slice& slice) { const size_t fragment_length = (left < avail) ? left : avail; RecordType type; - const bool begin = (ptr == slice.data()); const bool end = (left == fragment_length); if (begin && end) { type = kFullType; @@ -67,6 +67,7 @@ Status Writer::AddRecord(const Slice& slice) { s = EmitPhysicalRecord(type, ptr, fragment_length); ptr += fragment_length; left -= fragment_length; + begin = false; } while (s.ok() && left > 0); return s; } diff --git a/db/memtable.cc b/db/memtable.cc index a3b618a..9c25f6d 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -20,10 +20,12 @@ static Slice GetLengthPrefixedSlice(const char* data) { MemTable::MemTable(const InternalKeyComparator& cmp) : comparator_(cmp), + refs_(0), table_(comparator_, &arena_) { } MemTable::~MemTable() { + assert(refs_ == 0); } size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); } @@ -48,10 +50,15 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: - explicit MemTableIterator(MemTable::Table* table) { + explicit MemTableIterator(MemTable* mem, MemTable::Table* table) { + mem_ = mem; iter_ = new MemTable::Table::Iterator(table); + mem->Ref(); + } + virtual ~MemTableIterator() { + delete iter_; + mem_->Unref(); } - virtual ~MemTableIterator() { delete iter_; } virtual bool Valid() const { return iter_->Valid(); } virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } @@ -68,6 +75,7 @@ class MemTableIterator: public Iterator { virtual Status status() const { return Status::OK(); } private: + MemTable* mem_; MemTable::Table::Iterator* iter_; std::string tmp_; // For passing to EncodeKey @@ -77,7 +85,7 @@ class MemTableIterator: public Iterator { }; Iterator* MemTable::NewIterator() { - return new MemTableIterator(&table_); + return new MemTableIterator(this, &table_); } void MemTable::Add(SequenceNumber s, ValueType type, diff --git a/db/memtable.h b/db/memtable.h index 45b3342..2e9bd61 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -19,8 +19,21 @@ class MemTableIterator; class MemTable { public: + // MemTables are reference counted. The initial reference count + // is zero and the caller must call Ref() at least once. explicit MemTable(const InternalKeyComparator& comparator); - ~MemTable(); + + // Increase reference count. + void Ref() { ++refs_; } + + // Drop reference count. Delete if no more references exist. + void Unref() { + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + delete this; + } + } // Returns an estimate of the number of bytes of data in use by this // data structure. @@ -45,6 +58,8 @@ class MemTable { const Slice& value); private: + ~MemTable(); // Private since only Unref() should be used to delete it + struct KeyComparator { const InternalKeyComparator comparator; explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } @@ -56,6 +71,7 @@ class MemTable { typedef SkipList<const char*, KeyComparator> Table; KeyComparator comparator_; + int refs_; Arena arena_; Table table_; diff --git a/db/repair.cc b/db/repair.cc index c8e7b9e..4b57169 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -183,13 +183,15 @@ class Repairer { // corruptions cause entire commits to be skipped instead of // propagating bad information (like overly large sequence // numbers).
- log::Reader reader(lfile, &reporter, false/*do not checksum*/); + log::Reader reader(lfile, &reporter, false/*do not checksum*/, + 0/*initial_offset*/); // Read all the records and add to a memtable std::string scratch; Slice record; WriteBatch batch; - MemTable mem(icmp_); + MemTable* mem = new MemTable(icmp_); + mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { if (record.size() < 12) { @@ -198,7 +200,7 @@ class Repairer { continue; } WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, &mem); + status = WriteBatchInternal::InsertInto(&batch, mem); if (status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { @@ -215,10 +217,12 @@ class Repairer { VersionEdit skipped; FileMetaData meta; meta.number = next_file_number_++; - Iterator* iter = mem.NewIterator(); + Iterator* iter = mem->NewIterator(); status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, &skipped); delete iter; + mem->Unref(); + mem = NULL; if (status.ok()) { if (meta.file_size > 0) { table_numbers_.push_back(meta.number); diff --git a/db/snapshot.h b/db/snapshot.h index 9a90756..a08dbd3 100644 --- a/db/snapshot.h +++ b/db/snapshot.h @@ -12,17 +12,17 @@ namespace leveldb { class SnapshotList; // Snapshots are kept in a doubly-linked list in the DB. -// Each Snapshot corresponds to a particular sequence number. -class Snapshot { +// Each SnapshotImpl corresponds to a particular sequence number. +class SnapshotImpl : public Snapshot { public: SequenceNumber number_; // const after creation private: friend class SnapshotList; - // Snapshot is kept in a doubly-linked circular list - Snapshot* prev_; - Snapshot* next_; + // SnapshotImpl is kept in a doubly-linked circular list + SnapshotImpl* prev_; + SnapshotImpl* next_; SnapshotList* list_; // just for sanity checks }; @@ -35,11 +35,11 @@ class SnapshotList { } bool empty() const { return list_.next_ == &list_; } - Snapshot* oldest() const { assert(!empty()); return list_.next_; } - Snapshot* newest() const { assert(!empty()); return list_.prev_; } + SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; } + SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } - const Snapshot* New(SequenceNumber seq) { - Snapshot* s = new Snapshot; + const SnapshotImpl* New(SequenceNumber seq) { + SnapshotImpl* s = new SnapshotImpl; s->number_ = seq; s->list_ = this; s->next_ = &list_; @@ -49,7 +49,7 @@ class SnapshotList { return s; } - void Delete(const Snapshot* s) { + void Delete(const SnapshotImpl* s) { assert(s->list_ == this); s->prev_->next_ = s->next_; s->next_->prev_ = s->prev_; @@ -58,7 +58,7 @@ class SnapshotList { private: // Dummy head of doubly-linked list of snapshots - Snapshot list_; + SnapshotImpl list_; }; } diff --git a/db/version_set.cc b/db/version_set.cc index c439f49..f64ac8d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -57,17 +57,22 @@ std::string IntSetToString(const std::set<uint64_t>& s) { Version::~Version() { assert(refs_ == 0); + + // Remove from linked list + prev_->next_ = next_; + next_->prev_ = prev_; + + // Drop references to files for (int level = 0; level < config::kNumLevels; level++) { for (size_t i = 0; i < files_[level].size(); i++) { FileMetaData* f = files_[level][i]; - assert(f->refs >= 0); + assert(f->refs > 0); f->refs--; if (f->refs <= 0) { delete f; } } } - delete cleanup_mem_; } // An internal iterator.
For a given version/level pair, yields @@ -77,9 +82,9 @@ Version::~Version() { // encoded using EncodeFixed64. class Version::LevelFileNumIterator : public Iterator { public: - LevelFileNumIterator(const Version* version, + LevelFileNumIterator(const InternalKeyComparator& icmp, const std::vector<FileMetaData*>* flist) - : icmp_(version->vset_->icmp_.user_comparator()), + : icmp_(icmp), flist_(flist), index_(flist->size()) { // Marks as invalid } @@ -157,7 +162,7 @@ static Iterator* GetFileIterator(void* arg, Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, int level) const { return NewTwoLevelIterator( - new LevelFileNumIterator(this, &files_[level]), + new LevelFileNumIterator(vset_->icmp_, &files_[level]), &GetFileIterator, vset_->table_cache_, options); } @@ -185,11 +190,11 @@ void Version::Ref() { } void Version::Unref() { + assert(this != &vset_->dummy_versions_); assert(refs_ >= 1); --refs_; if (refs_ == 0) { - vset_->MaybeDeleteOldVersions(); - // TODO: try to delete obsolete files + delete this; } } @@ -222,37 +227,58 @@ std::string Version::DebugString() const { // Versions that contain full copies of the intermediate state. class VersionSet::Builder { private: - typedef std::map<uint64_t, FileMetaData*> FileMap; + // Helper to sort by v->files_[file_number].smallest + struct BySmallestKey { + const InternalKeyComparator* internal_comparator; + + bool operator()(FileMetaData* f1, FileMetaData* f2) const { + int r = internal_comparator->Compare(f1->smallest, f2->smallest); + if (r != 0) { + return (r < 0); + } else { + // Break ties by file number + return (f1->number < f2->number); + } + } + }; + + typedef std::set<FileMetaData*, BySmallestKey> FileSet; + struct LevelState { + std::set<uint64_t> deleted_files; + FileSet* added_files; + }; + VersionSet* vset_; - FileMap files_[config::kNumLevels]; + Version* base_; + LevelState levels_[config::kNumLevels]; public: // Initialize a builder with the files from *base and other info from *vset Builder(VersionSet* vset, Version* base) - : vset_(vset) { + : vset_(vset), + base_(base) { + base_->Ref(); + BySmallestKey cmp; + cmp.internal_comparator = &vset_->icmp_; for (int level = 0; level < config::kNumLevels; level++) { - const std::vector<FileMetaData*>& files = base->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - FileMetaData* f = files[i]; - f->refs++; - files_[level].insert(std::make_pair(f->number, f)); - } + levels_[level].added_files = new FileSet(cmp); } } ~Builder() { for (int level = 0; level < config::kNumLevels; level++) { - const FileMap& fmap = files_[level]; - for (FileMap::const_iterator iter = fmap.begin(); - iter != fmap.end(); - ++iter) { - FileMetaData* f = iter->second; + std::vector<FileMetaData*> to_unref(levels_[level].added_files->begin(), + levels_[level].added_files->end()); + delete levels_[level].added_files; + for (int i = 0; i < to_unref.size(); i++) { + FileMetaData* f = to_unref[i]; f->refs--; if (f->refs <= 0) { delete f; } } } + base_->Unref(); } // Apply all of the edits in *edit to the current state.
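The Builder rewrite above replaces the old per-level copy of the full file map with a delta against a base version: a per-level FileSet of added files ordered by smallest key, plus a set of deleted file numbers. SaveTo (in the hunks that follow) then merges the sorted base list with the sorted additions in a single upper_bound-driven pass. A self-contained sketch of the shape of that merge, using plain ints in place of FileMetaData (illustrative only, not the patch's types):

    #include <algorithm>
    #include <set>
    #include <vector>

    // Merge a sorted base list with a sorted set of additions, dropping
    // deleted entries; mirrors the structure of Builder::SaveTo.
    std::vector<int> MergeLevel(const std::vector<int>& base,
                                const std::set<int>& added,
                                const std::set<int>& deleted) {
      std::vector<int> result;
      std::vector<int>::const_iterator base_iter = base.begin();
      for (std::set<int>::const_iterator a = added.begin();
           a != added.end(); ++a) {
        // Emit all base entries that sort before the next addition.
        std::vector<int>::const_iterator bpos =
            std::upper_bound(base_iter, base.end(), *a);
        for (; base_iter != bpos; ++base_iter) {
          if (deleted.count(*base_iter) == 0) result.push_back(*base_iter);
        }
        if (deleted.count(*a) == 0) result.push_back(*a);
      }
      // Emit the remaining base entries.
      for (; base_iter != base.end(); ++base_iter) {
        if (deleted.count(*base_iter) == 0) result.push_back(*base_iter);
      }
      return result;
    }

The output stays sorted without a full re-sort, which is the point of the change: applying an edit costs time proportional to the edit, not to the number of files in the version.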
@@ -271,16 +297,7 @@ class VersionSet::Builder { ++iter) { const int level = iter->first; const uint64_t number = iter->second; - FileMap::iterator fiter = files_[level].find(number); - assert(fiter != files_[level].end()); // Sanity check for debug mode - if (fiter != files_[level].end()) { - FileMetaData* f = fiter->second; - f->refs--; - if (f->refs <= 0) { - delete f; - } - files_[level].erase(fiter); - } + levels_[level].deleted_files.insert(number); } // Add new files @@ -288,22 +305,66 @@ class VersionSet::Builder { const int level = edit->new_files_[i].first; FileMetaData* f = new FileMetaData(edit->new_files_[i].second); f->refs = 1; - assert(files_[level].count(f->number) == 0); - files_[level].insert(std::make_pair(f->number, f)); + levels_[level].deleted_files.erase(f->number); + levels_[level].added_files->insert(f); } } // Save the current state in *v. void SaveTo(Version* v) { + BySmallestKey cmp; + cmp.internal_comparator = &vset_->icmp_; for (int level = 0; level < config::kNumLevels; level++) { - const FileMap& fmap = files_[level]; - for (FileMap::const_iterator iter = fmap.begin(); - iter != fmap.end(); - ++iter) { - FileMetaData* f = iter->second; - f->refs++; - v->files_[level].push_back(f); + // Merge the set of added files with the set of pre-existing files. + // Drop any deleted files. Store the result in *v. + const std::vector<FileMetaData*>& base_files = base_->files_[level]; + std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin(); + std::vector<FileMetaData*>::const_iterator base_end = base_files.end(); + const FileSet* added = levels_[level].added_files; + v->files_[level].reserve(base_files.size() + added->size()); + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); + ++added_iter) { + // Add all smaller files listed in base_ + for (std::vector<FileMetaData*>::const_iterator bpos + = std::upper_bound(base_iter, base_end, *added_iter, cmp); + base_iter != bpos; + ++base_iter) { + MaybeAddFile(v, level, *base_iter); + } + + MaybeAddFile(v, level, *added_iter); + } + + // Add remaining base files + for (; base_iter != base_end; ++base_iter) { + MaybeAddFile(v, level, *base_iter); } + +#ifndef NDEBUG + // Make sure there is no overlap in levels > 0 + if (level > 0) { + for (int i = 1; i < v->files_[level].size(); i++) { + const InternalKey& prev_end = v->files_[level][i-1]->largest; + const InternalKey& this_begin = v->files_[level][i]->smallest; + if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) { + fprintf(stderr, "overlapping ranges in same level %s vs.
%s\n", + EscapeString(prev_end.Encode()).c_str(), + EscapeString(this_begin.Encode()).c_str()); + abort(); + } + } + } +#endif + } + } + + void MaybeAddFile(Version* v, int level, FileMetaData* f) { + if (levels_[level].deleted_files.count(f->number) > 0) { + // File is deleted: do nothing + } else { + f->refs++; + v->files_[level].push_back(f); } } }; @@ -324,22 +385,36 @@ VersionSet::VersionSet(const std::string& dbname, prev_log_number_(0), descriptor_file_(NULL), descriptor_log_(NULL), - current_(new Version(this)), - oldest_(current_) { + dummy_versions_(this), + current_(NULL) { + AppendVersion(new Version(this)); } VersionSet::~VersionSet() { - for (Version* v = oldest_; v != NULL; ) { - Version* next = v->next_; - assert(v->refs_ == 0); - delete v; - v = next; - } + current_->Unref(); + assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty delete descriptor_log_; delete descriptor_file_; } -Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { +void VersionSet::AppendVersion(Version* v) { + // Make "v" current + assert(v->refs_ == 0); + assert(v != current_); + if (current_ != NULL) { + current_->Unref(); + } + current_ = v; + v->Ref(); + + // Append to linked list + v->prev_ = dummy_versions_.prev_; + v->next_ = &dummy_versions_; + v->prev_->next_ = v; + v->next_->prev_ = v; +} + +Status VersionSet::LogAndApply(VersionEdit* edit) { if (edit->has_log_number_) { assert(edit->log_number_ >= log_number_); assert(edit->log_number_ < next_file_number_); @@ -360,22 +435,20 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { builder.Apply(edit); builder.SaveTo(v); } - - std::string new_manifest_file; - Status s = Finalize(v); + Finalize(v); // Initialize new descriptor log file if necessary by creating // a temporary file that contains a snapshot of the current version. 
- if (s.ok()) { - if (descriptor_log_ == NULL) { - assert(descriptor_file_ == NULL); - new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); - edit->SetNextFile(next_file_number_); - s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); - if (s.ok()) { - descriptor_log_ = new log::Writer(descriptor_file_); - s = WriteSnapshot(descriptor_log_); - } + std::string new_manifest_file; + Status s; + if (descriptor_log_ == NULL) { + assert(descriptor_file_ == NULL); + new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); + edit->SetNextFile(next_file_number_); + s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); + if (s.ok()) { + descriptor_log_ = new log::Writer(descriptor_file_); + s = WriteSnapshot(descriptor_log_); } } @@ -397,12 +470,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { // Install the new version if (s.ok()) { - assert(current_->next_ == NULL); - assert(current_->cleanup_mem_ == NULL); - current_->cleanup_mem_ = cleanup_mem; - v->next_ = NULL; - current_->next_ = v; - current_ = v; + AppendVersion(v); log_number_ = edit->log_number_; prev_log_number_ = edit->prev_log_number_; } else { @@ -458,7 +526,7 @@ Status VersionSet::Recover() { { LogReporter reporter; reporter.status = &s; - log::Reader reader(file, &reporter, true/*checksum*/); + log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -518,20 +586,14 @@ Status VersionSet::Recover() { if (s.ok()) { Version* v = new Version(this); builder.SaveTo(v); - s = Finalize(v); - if (!s.ok()) { - delete v; - } else { - // Install recovered version - v->next_ = NULL; - current_->next_ = v; - current_ = v; - manifest_file_number_ = next_file; - next_file_number_ = next_file + 1; - last_sequence_ = last_sequence; - log_number_ = log_number; - prev_log_number_ = prev_log_number; - } + // Install recovered version + Finalize(v); + AppendVersion(v); + manifest_file_number_ = next_file; + next_file_number_ = next_file + 1; + last_sequence_ = last_sequence; + log_number_ = log_number; + prev_log_number_ = prev_log_number; } return s; @@ -545,15 +607,12 @@ static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) { return sum; } -Status VersionSet::Finalize(Version* v) { +void VersionSet::Finalize(Version* v) { // Precomputed best level for next compaction int best_level = -1; double best_score = -1; - Status s; - for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) { - s = SortLevel(v, level); - + for (int level = 0; level < config::kNumLevels-1; level++) { double score; if (level == 0) { // We treat level-0 specially by bounding the number of files // file size is small (perhaps because of a small write-buffer // setting, or very high compression ratios, or lots of // overwrites/deletions). - score = v->files_[level].size() / 4.0; + score = v->files_[level].size() / + static_cast<double>(config::kL0_CompactionTrigger); } else { // Compute the ratio of current size to size limit.
const uint64_t level_bytes = TotalFileSize(v->files_[level]); @@ -582,7 +642,6 @@ Status VersionSet::Finalize(Version* v) { v->compaction_level_ = best_level; v->compaction_score_ = best_score; - return s; } Status VersionSet::WriteSnapshot(log::Writer* log) { @@ -615,44 +674,27 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { return log->AddRecord(record); } -// Helper to sort by tables_[file_number].smallest -struct VersionSet::BySmallestKey { - const InternalKeyComparator* internal_comparator; - - bool operator()(FileMetaData* f1, FileMetaData* f2) const { - return internal_comparator->Compare(f1->smallest, f2->smallest) < 0; - } -}; - -Status VersionSet::SortLevel(Version* v, uint64_t level) { - Status result; - BySmallestKey cmp; - cmp.internal_comparator = &icmp_; - std::sort(v->files_[level].begin(), v->files_[level].end(), cmp); - - if (result.ok() && level > 0) { - // There should be no overlap - for (size_t i = 1; i < v->files_[level].size(); i++) { - const InternalKey& prev_end = v->files_[level][i-1]->largest; - const InternalKey& this_begin = v->files_[level][i]->smallest; - if (icmp_.Compare(prev_end, this_begin) >= 0) { - result = Status::Corruption( - "overlapping ranges in same level", - (EscapeString(prev_end.Encode()) + " vs. " + - EscapeString(this_begin.Encode()))); - break; - } - } - } - return result; -} - int VersionSet::NumLevelFiles(int level) const { assert(level >= 0); assert(level < config::kNumLevels); return current_->files_[level].size(); } +const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { + // Update code if kNumLevels changes + assert(config::kNumLevels == 7); + snprintf(scratch->buffer, sizeof(scratch->buffer), + "files[ %d %d %d %d %d %d %d ]", + int(current_->files_[0].size()), + int(current_->files_[1].size()), + int(current_->files_[2].size()), + int(current_->files_[3].size()), + int(current_->files_[4].size()), + int(current_->files_[5].size()), + int(current_->files_[6].size())); + return scratch->buffer; +} + uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; for (int level = 0; level < config::kNumLevels; level++) { @@ -685,19 +727,10 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { return result; } -void VersionSet::MaybeDeleteOldVersions() { - // Note: it is important to delete versions in order since a newer - // version with zero refs may be holding a pointer to a memtable - // that is used by somebody who has a ref on an older version. 
- while (oldest_ != current_ && oldest_->refs_ == 0) { - Version* next = oldest_->next_; - delete oldest_; - oldest_ = next; - } -} - void VersionSet::AddLiveFiles(std::set<uint64_t>* live) { - for (Version* v = oldest_; v != NULL; v = v->next_) { + for (Version* v = dummy_versions_.next_; + v != &dummy_versions_; + v = v->next_) { for (int level = 0; level < config::kNumLevels; level++) { const std::vector<FileMetaData*>& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { @@ -809,8 +842,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } else { // Create concatenating iterator for the files from this level list[num++] = NewTwoLevelIterator( - new Version::LevelFileNumIterator( - c->input_version_, &c->inputs_[which]), + new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]), &GetFileIterator, table_cache_, options); } } @@ -996,11 +1028,12 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) { return true; } -bool Compaction::ShouldStopBefore(const InternalKey& key) { +bool Compaction::ShouldStopBefore(const Slice& internal_key) { // Scan to find earliest grandparent file that contains key. const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; while (grandparent_index_ < grandparents_.size() && - icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { if (seen_key_) { overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; } diff --git a/db/version_set.h b/db/version_set.h index e377513..2bac5e2 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -59,8 +59,8 @@ class Version { VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list + Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version - MemTable* cleanup_mem_; // NULL, or table to delete when version dropped // List of files per level std::vector<FileMetaData*> files_[config::kNumLevels]; @@ -72,8 +72,7 @@ class Version { int compaction_level_; explicit Version(VersionSet* vset) - : vset_(vset), next_(NULL), refs_(0), - cleanup_mem_(NULL), + : vset_(vset), next_(this), prev_(this), refs_(0), compaction_score_(-1), compaction_level_(-1) { } @@ -95,10 +94,8 @@ class VersionSet { // Apply *edit to the current version to form a new descriptor that // is both saved to persistent state and installed as the new - // current version. Iff Apply() returns OK, arrange to delete - // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed - // by older versions. - Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem); + // current version. + Status LogAndApply(VersionEdit* edit); // Recover the last saved descriptor from persistent storage. Status Recover(); @@ -171,19 +168,20 @@ class VersionSet { // "key" as of version "v". uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[100]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + private: class Builder; friend class Compaction; friend class Version; - Status Finalize(Version* v); - - // Delete any old versions that are no longer needed.
- void MaybeDeleteOldVersions(); - - struct BySmallestKey; - Status SortLevel(Version* v, uint64_t level); + void Finalize(Version* v); void GetOverlappingInputs( int level, @@ -202,6 +200,8 @@ class VersionSet { void SetupOtherInputs(Compaction* c); + void AppendVersion(Version* v); + Env* const env_; const std::string dbname_; const Options* const options_; @@ -216,10 +216,8 @@ class VersionSet { // Opened lazily WritableFile* descriptor_file_; log::Writer* descriptor_log_; - - // Versions are kept in a singly linked list that is never empty - Version* current_; // Pointer to the last (newest) list entry - Version* oldest_; // Pointer to the first (oldest) list entry + Version dummy_versions_; // Head of circular doubly-linked list of versions. + Version* current_; // == dummy_versions_.prev_ // Per-level key at which the next compaction at that level should start. // Either an empty string, or a valid InternalKey. @@ -265,8 +263,8 @@ class Compaction { bool IsBaseLevelForKey(const Slice& user_key); // Returns true iff we should stop building the current output - // before processing "key". - bool ShouldStopBefore(const InternalKey& key); + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); // Release the input version for the compaction, once the compaction // is successful. diff --git a/db/write_batch.cc b/db/write_batch.cc index d561528..4e1e899 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -29,11 +29,53 @@ WriteBatch::WriteBatch() { WriteBatch::~WriteBatch() { } +WriteBatch::Handler::~Handler() { } + void WriteBatch::Clear() { rep_.clear(); rep_.resize(12); } +Status WriteBatch::Iterate(Handler* handler) const { + Slice input(rep_); + if (input.size() < 12) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + input.remove_prefix(12); + Slice key, value; + int found = 0; + while (!input.empty()) { + found++; + char tag = input[0]; + input.remove_prefix(1); + switch (tag) { + case kTypeValue: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + handler->Put(key, value); + } else { + return Status::Corruption("bad WriteBatch Put"); + } + break; + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input, &key)) { + handler->Delete(key); + } else { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (found != WriteBatchInternal::Count(this)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return Status::OK(); + } +} + int WriteBatchInternal::Count(const WriteBatch* b) { return DecodeFixed32(b->rep_.data() + 8); } @@ -63,28 +105,29 @@ void WriteBatch::Delete(const Slice& key) { PutLengthPrefixedSlice(&rep_, key); } -Status WriteBatchInternal::InsertInto(const WriteBatch* b, - MemTable* memtable) { - const int count = WriteBatchInternal::Count(b); - int found = 0; - Iterator it(*b); - for (; !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeDeletion: - memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice()); - break; - case kTypeValue: - memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); - break; - } - found++; +namespace { +class MemTableInserter : public WriteBatch::Handler { + public: + SequenceNumber sequence_; + MemTable* mem_; + + virtual void Put(const Slice& key, const Slice& value) { + mem_->Add(sequence_, kTypeValue, key, value); + sequence_++; } - if (!it.status().ok()) { - return it.status(); - } else if (found 
!= count) { - return Status::Corruption("wrong count in WriteBatch"); + virtual void Delete(const Slice& key) { + mem_->Add(sequence_, kTypeDeletion, key, Slice()); + sequence_++; } - return Status::OK(); +}; +} + +Status WriteBatchInternal::InsertInto(const WriteBatch* b, + MemTable* memtable) { + MemTableInserter inserter; + inserter.sequence_ = WriteBatchInternal::Sequence(b); + inserter.mem_ = memtable; + return b->Iterate(&inserter); } void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { @@ -92,57 +135,4 @@ void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { b->rep_.assign(contents.data(), contents.size()); } -WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch) - : input_(WriteBatchInternal::Contents(&batch)), - done_(false) { - if (input_.size() < 12) { - done_ = true; - } else { - seq_ = WriteBatchInternal::Sequence(&batch), - input_.remove_prefix(12); - GetNextEntry(); - } -} - -void WriteBatchInternal::Iterator::Next() { - assert(!done_); - seq_++; - GetNextEntry(); -} - -void WriteBatchInternal::Iterator::GetNextEntry() { - if (input_.empty()) { - done_ = true; - return; - } - char tag = input_[0]; - input_.remove_prefix(1); - switch (tag) { - case kTypeValue: - if (GetLengthPrefixedSlice(&input_, &key_) && - GetLengthPrefixedSlice(&input_, &value_)) { - op_ = static_cast(tag); - } else { - status_ = Status::Corruption("bad WriteBatch Put"); - done_ = true; - input_.clear(); - } - break; - case kTypeDeletion: - if (GetLengthPrefixedSlice(&input_, &key_)) { - op_ = kTypeDeletion; - } else { - status_ = Status::Corruption("bad WriteBatch Delete"); - done_ = true; - input_.clear(); - } - break; - default: - status_ = Status::Corruption("unknown WriteBatch tag"); - done_ = true; - input_.clear(); - break; - } -} - } diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index ab0a823..d975444 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -37,30 +37,6 @@ class WriteBatchInternal { static void SetContents(WriteBatch* batch, const Slice& contents); static Status InsertInto(const WriteBatch* batch, MemTable* memtable); - - // Iterate over the contents of a write batch. 
- class Iterator { - public: - explicit Iterator(const WriteBatch& batch); - bool Done() const { return done_; } - void Next(); - ValueType op() const { return op_; } - const Slice& key() const { return key_; } - const Slice& value() const { return value_; } - SequenceNumber sequence_number() const { return seq_; } - Status status() const { return status_; } - - private: - void GetNextEntry(); - - Slice input_; - bool done_; - ValueType op_; - Slice key_; - Slice value_; - SequenceNumber seq_; - Status status_; - }; }; } diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 2bf1134..73d68fd 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -14,10 +14,11 @@ namespace leveldb { static std::string PrintContents(WriteBatch* b) { InternalKeyComparator cmp(BytewiseComparator()); - MemTable mem(cmp); + MemTable* mem = new MemTable(cmp); + mem->Ref(); std::string state; - Status s = WriteBatchInternal::InsertInto(b, &mem); - Iterator* iter = mem.NewIterator(); + Status s = WriteBatchInternal::InsertInto(b, mem); + Iterator* iter = mem->NewIterator(); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); @@ -42,6 +43,7 @@ static std::string PrintContents(WriteBatch* b) { if (!s.ok()) { state.append("ParseError()"); } + mem->Unref(); return state; } diff --git a/doc/impl.html b/doc/impl.html index dd09fea..e870795 100644 --- a/doc/impl.html +++ b/doc/impl.html @@ -17,14 +17,14 @@ However the organization of the files that make up the representation is somewhat different and is explained below.
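[Editorial note: the db/write_batch.cc and include/leveldb/write_batch.h hunks above replace the internal WriteBatchInternal::Iterator with the public visitor-style WriteBatch::Handler, which MemTableInserter uses to replay a batch into a memtable. A minimal client-side handler, sketched against the declarations in this patch; CountingHandler is an illustrative name, not code from the tree:]

    #include "leveldb/slice.h"
    #include "leveldb/write_batch.h"

    class CountingHandler : public leveldb::WriteBatch::Handler {
     public:
      int puts, deletes;
      CountingHandler() : puts(0), deletes(0) { }
      virtual void Put(const leveldb::Slice& key, const leveldb::Slice& value) {
        puts++;     // called once per kTypeValue record, in batch order
      }
      virtual void Delete(const leveldb::Slice& key) {
        deletes++;  // called once per kTypeDeletion record
      }
    };

    // Usage: Iterate() replays the encoded records through the handler and
    // surfaces the same Corruption errors shown above on malformed input:
    //   CountingHandler h;
    //   leveldb::Status s = batch.Iterate(&h);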

-Each database is represented by a set of file stored in a directory.
+Each database is represented by a set of files stored in a directory.
There are several different types of files as documented below:

Log files

A log file (*.log) stores a sequence of recent updates.  Each update
is appended to the current log file.  When the log file reaches a
-pre-determined size (approximately 1MB by default), it is converted
+pre-determined size (approximately 4MB by default), it is converted
to a sorted table (see below) and a new log file is created for future
updates.
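[Editorial note: the 4MB figure corresponds to Options::write_buffer_size in include/leveldb/options.h; the snippet below is an illustrative sketch of raising it, trading memtable memory and recovery time for fewer log-to-table conversions:]

    #include "leveldb/db.h"

    leveldb::Options options;
    options.create_if_missing = true;
    options.write_buffer_size = 8 * 1048576;  // 8MB instead of the ~4MB default
    leveldb::DB* db;
    leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
    if (!s.ok()) { /* handle error */ }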

@@ -83,19 +83,15 @@ Other files used for miscellaneous purposes may also be present

Level 0

When the log file grows above a certain size (1MB by default):
-  • Write the contents of the current memtable to an sstable
-  • Replace the current memtable by a brand new empty memtable
-  • Switch to a new log file
+  • Create a brand new memtable and log file and direct future updates here
+  • In the background:
+      • Write the contents of the previous memtable to an sstable
+      • Discard the memtable
       • Delete the old log file and the old memtable
+      • Add the new sstable to the young (level-0) level.

-Experimental measurements show that generating an sstable from a 1MB
-log file takes ~12ms, which seems like an acceptable latency hiccup to
-add infrequently to a log write.

-The new sstable is added to a special level-0 level.  level-0 contains
-a set of files (up to 4 by default).  However unlike other levels,
-these files do not cover disjoint ranges, but may overlap each other.
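[Editorial note: the level-0 file count that drives these decisions is observable through DB::GetProperty, using the same "leveldb.num-files-at-level<N>" property strings the corruption tests later in this log query; a sketch, assuming an already-open handle db:]

    #include <cstdio>
    #include <string>

    std::string num;
    if (db->GetProperty("leveldb.num-files-at-level0", &num)) {
      std::fprintf(stderr, "level-0 files: %s\n", num.c_str());
    }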

Compactions

@@ -162,8 +158,8 @@ read.

Solution 1: To reduce this problem, we might want to increase the log
switching threshold when the number of level-0 files is large.  Though
-the downside is that the larger this threshold, the larger the delay
-that we will add to write latency when a write triggers a log switch.
+the downside is that the larger this threshold, the more memory we will
+need to hold the corresponding memtable.

Solution 2: We might want to decrease write rate artificially when the diff --git a/doc/index.html b/doc/index.html index c2312b7..58442e8 100644 --- a/doc/index.html +++ b/doc/index.html @@ -141,10 +141,18 @@ the batch.

Concurrency

-A database may only be opened by one process at a time. The leveldb -implementation acquires a lock from the operating system to prevent -misuse. Within a single process, the same leveldb::DB object may -be safely used by multiple concurrent threads. +A database may only be opened by one process at a time. +The leveldb implementation acquires a lock from the +operating system to prevent misuse. Within a single process, the +same leveldb::DB object may be safely shared by multiple +concurrent threads. I.e., different threads may write into or fetch +iterators or call Get on the same database without any +external synchronization (the leveldb implementation will +automatically do the required synchronization). However other objects +(like Iterator and WriteBatch) may require external synchronization. +If two threads share such an object, they must protect access to it +using their own locking protocol. More details are available in +the public header files.
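[Editorial note: a compilable sketch of this contract, using the Env::StartThread and SleepForMicroseconds calls that the MultiThreaded test later in this log exercises; the crude sleep stands in for a real join:]

    #include <string>
    #include "leveldb/db.h"
    #include "leveldb/env.h"

    static leveldb::DB* db;   // shared handle: safe without external locking

    static void Writer(void* /*arg*/) {
      db->Put(leveldb::WriteOptions(), "key", "value");   // no lock needed
    }

    static void Reader(void* /*arg*/) {
      std::string value;
      db->Get(leveldb::ReadOptions(), "key", &value);     // no lock needed
    }

    int main() {
      leveldb::Options options;
      options.create_if_missing = true;
      leveldb::DB::Open(options, "/tmp/testdb", &db);
      leveldb::Env* env = leveldb::Env::Default();
      env->StartThread(Writer, NULL);
      env->StartThread(Reader, NULL);
      env->SleepForMicroseconds(100 * 1000);  // crude join, for the sketch only
      // An Iterator or WriteBatch shared between these threads would, by
      // contrast, need a mutex of its own around every call.
      delete db;
      return 0;
    }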

Iteration

diff --git a/include/leveldb/comparator.h b/include/leveldb/comparator.h index 4e00e4d..c215fac 100644 --- a/include/leveldb/comparator.h +++ b/include/leveldb/comparator.h @@ -12,7 +12,9 @@ namespace leveldb { class Slice; // A Comparator object provides a total order across slices that are -// used as keys in an sstable or a database. +// used as keys in an sstable or a database. A Comparator implementation +// must be thread-safe since leveldb may invoke its methods concurrently +// from multiple threads. class Comparator { public: virtual ~Comparator(); diff --git a/include/leveldb/db.h b/include/leveldb/db.h index f18ded3..79bd283 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -13,26 +13,32 @@ namespace leveldb { static const int kMajorVersion = 1; -static const int kMinorVersion = 1; +static const int kMinorVersion = 2; struct Options; struct ReadOptions; struct WriteOptions; - -class Snapshot; class WriteBatch; -// Some internal types. Clients should ignore. -class WriteBatchInternal; +// Abstract handle to particular state of a DB. +// A Snapshot is an immutable object and can therefore be safely +// accessed from multiple threads without any external synchronization. +class Snapshot { + protected: + virtual ~Snapshot(); +}; +// A range of keys struct Range { - Slice start; - Slice limit; + Slice start; // Included in the range + Slice limit; // Not included in the range Range(const Slice& s, const Slice& l) : start(s), limit(l) { } }; // A DB is a persistent ordered map from keys to values. +// A DB is safe for concurrent access from multiple threads without +// any external synchronization. class DB { public: // Open the database with the specified "name". diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 4b6e712..39f6a1a 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -6,6 +6,9 @@ // operating system functionality like the filesystem etc. Callers // may wish to provide a custom Env object when opening a database to // get fine gain control; e.g., to rate limit file system operations. +// +// All Env implementations are safe for concurrent access from +// multiple threads without any external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ #define STORAGE_LEVELDB_INCLUDE_ENV_H_ @@ -160,6 +163,15 @@ class SequentialFile { // // REQUIRES: External synchronization virtual Status Read(size_t n, Slice* result, char* scratch) = 0; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + virtual Status Skip(uint64_t n) = 0; }; // A file abstraction for randomly reading the contents of a file. diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h index 1866fb5..6821d85 100644 --- a/include/leveldb/iterator.h +++ b/include/leveldb/iterator.h @@ -6,6 +6,11 @@ // The following class defines the interface. Multiple implementations // are provided by this library. In particular, iterators are provided // to access the contents of a Table or a DB. +// +// Multiple threads can invoke const methods on an Iterator without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Iterator must use +// external synchronization. 
#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ #define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h index 62cb894..3c000b8 100644 --- a/include/leveldb/slice.h +++ b/include/leveldb/slice.h @@ -6,6 +6,11 @@ // storage and a size. The user of a Slice must ensure that the slice // is not used after the corresponding external storage has been // deallocated. +// +// Multiple threads can invoke const methods on a Slice without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Slice must use +// external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ #define STORAGE_LEVELDB_INCLUDE_SLICE_H_ diff --git a/include/leveldb/status.h b/include/leveldb/status.h index 47e3edf..6796fdd 100644 --- a/include/leveldb/status.h +++ b/include/leveldb/status.h @@ -4,12 +4,16 @@ // // A Status encapsulates the result of an operation. It may indicate success, // or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ #define STORAGE_LEVELDB_INCLUDE_STATUS_H_ #include -#include #include "leveldb/slice.h" namespace leveldb { @@ -18,7 +22,7 @@ class Status { public: // Create a success status. Status() : state_(NULL) { } - ~Status() { delete state_; } + ~Status() { delete[] state_; } // Copy the specified status. Status(const Status& s); @@ -29,7 +33,7 @@ class Status { // Return error status of an appropriate type. static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kNotFound, msg, Slice()); + return Status(kNotFound, msg, msg2); } static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kCorruption, msg, msg2); @@ -55,6 +59,13 @@ class Status { std::string ToString() const; private: + // OK status has a NULL state_. Otherwise, state_ is a new[] array + // of the following form: + // state_[0..3] == length of message + // state_[4] == code + // state_[5..] == message + const char* state_; + enum Code { kOk = 0, kNotFound = 1, @@ -63,21 +74,24 @@ class Status { kInvalidArgument = 4, kIOError = 5, }; - Code code() const { return (state_ == NULL) ? kOk : state_->first; } - Status(Code code, const Slice& msg, const Slice& msg2); + Code code() const { + return (state_ == NULL) ? kOk : static_cast(state_[4]); + } - typedef std::pair State; - State* state_; + Status(Code code, const Slice& msg, const Slice& msg2); + static const char* CopyState(const char* s); }; inline Status::Status(const Status& s) { - state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); } inline void Status::operator=(const Status& s) { - if (this != &s) { - delete state_; - state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (state_ != s.state_) { + delete[] state_; + state_ = (s.state_ == NULL) ? 
NULL : CopyState(s.state_); } } diff --git a/include/leveldb/table.h b/include/leveldb/table.h index bd99176..35e5d22 100644 --- a/include/leveldb/table.h +++ b/include/leveldb/table.h @@ -17,7 +17,8 @@ class RandomAccessFile; struct ReadOptions; // A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. +// immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. class Table { public: // Attempt to open the table that is stored in bytes [0..file_size) diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h index 49d2d51..23851de 100644 --- a/include/leveldb/table_builder.h +++ b/include/leveldb/table_builder.h @@ -4,6 +4,11 @@ // // TableBuilder provides the interface used to build a Table // (an immutable and sorted map from keys to values). +// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ #define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ diff --git a/include/leveldb/write_batch.h b/include/leveldb/write_batch.h index 3411952..b4446c2 100644 --- a/include/leveldb/write_batch.h +++ b/include/leveldb/write_batch.h @@ -12,11 +12,17 @@ // batch.Delete("key"); // batch.Put("key", "v2"); // batch.Put("key", "v3"); +// +// Multiple threads can invoke const methods on a WriteBatch without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same WriteBatch must use +// external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ #define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ #include +#include "leveldb/status.h" namespace leveldb { @@ -36,6 +42,15 @@ class WriteBatch { // Clear all updates buffered in this batch. void Clear(); + // Support for iterating over the contents of a batch. 
+ class Handler { + public: + virtual ~Handler(); + virtual void Put(const Slice& key, const Slice& value) = 0; + virtual void Delete(const Slice& key) = 0; + }; + Status Iterate(Handler* handler) const; + private: friend class WriteBatchInternal; diff --git a/table/block_builder.cc b/table/block_builder.cc index dc958c8..d2ffa21 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -80,7 +80,7 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { if (counter_ < options_->block_restart_interval) { // See how much sharing to do with previous string const size_t min_length = std::min(last_key_piece.size(), key.size()); - while ((shared < min_length) && (last_key_[shared] == key[shared])) { + while ((shared < min_length) && (last_key_piece[shared] == key[shared])) { shared++; } } else { diff --git a/table/table_test.cc b/table/table_test.cc index 4b3e85e..cf2bae0 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -319,13 +319,15 @@ class MemTableConstructor: public Constructor { : Constructor(cmp), internal_comparator_(cmp) { memtable_ = new MemTable(internal_comparator_); + memtable_->Ref(); } ~MemTableConstructor() { - delete memtable_; + memtable_->Unref(); } virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete memtable_; + memtable_->Unref(); memtable_ = new MemTable(internal_comparator_); + memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -736,16 +738,17 @@ class MemTableTest { }; TEST(MemTableTest, Simple) { InternalKeyComparator cmp(BytewiseComparator()); - MemTable memtable(cmp); + MemTable* memtable = new MemTable(cmp); + memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); batch.Put(std::string("k1"), std::string("v1")); batch.Put(std::string("k2"), std::string("v2")); batch.Put(std::string("k3"), std::string("v3")); batch.Put(std::string("largekey"), std::string("vlarge")); - ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok()); + ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable).ok()); - Iterator* iter = memtable.NewIterator(); + Iterator* iter = memtable->NewIterator(); iter->SeekToFirst(); while (iter->Valid()) { fprintf(stderr, "key: '%s' -> '%s'\n", @@ -755,6 +758,7 @@ TEST(MemTableTest, Simple) { } delete iter; + memtable->Unref(); } static bool Between(uint64_t val, uint64_t low, uint64_t high) { diff --git a/util/env_chromium.cc b/util/env_chromium.cc index fd3a4c7..1af525a 100644 --- a/util/env_chromium.cc +++ b/util/env_chromium.cc @@ -141,6 +141,13 @@ class ChromiumSequentialFile: public SequentialFile { } return s; } + + virtual Status Skip(uint64_t n) { + if (fseek(file_, n, SEEK_CUR)) { + return Status::IOError(filename_, strerror(errno)); + } + return Status::OK(); + } }; class ChromiumRandomAccessFile: public RandomAccessFile { diff --git a/util/env_posix.cc b/util/env_posix.cc index 5cddb0c..fec1599 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -52,6 +52,13 @@ class PosixSequentialFile: public SequentialFile { } return s; } + + virtual Status Skip(uint64_t n) { + if (fseek(file_, n, SEEK_CUR)) { + return Status::IOError(filename_, strerror(errno)); + } + return Status::OK(); + } }; class PosixRandomAccessFile: public RandomAccessFile { diff --git a/util/status.cc b/util/status.cc index d9b7195..02051a9 100644 --- a/util/status.cc +++ b/util/status.cc @@ -8,13 +8,29 @@ namespace leveldb { +const char* Status::CopyState(const char* state) { + uint32_t size; + memcpy(&size, state, 
sizeof(size)); + char* result = new char[size + 5]; + memcpy(result, state, size + 5); + return result; +} + Status::Status(Code code, const Slice& msg, const Slice& msg2) { assert(code != kOk); - state_ = new State(make_pair(code, std::string(msg.data(), msg.size()))); - if (!msg2.empty()) { - state_->second.append(": "); - state_->second.append(msg2.data(), msg2.size()); + const uint32_t len1 = msg.size(); + const uint32_t len2 = msg2.size(); + const uint32_t size = len1 + (len2 ? (2 + len2) : 0); + char* result = new char[size + 5]; + memcpy(result, &size, sizeof(size)); + result[4] = static_cast(code); + memcpy(result + 5, msg.data(), len1); + if (len2) { + result[5 + len1] = ':'; + result[6 + len1] = ' '; + memcpy(result + 7 + len1, msg2.data(), len2); } + state_ = result; } std::string Status::ToString() const { @@ -23,12 +39,12 @@ std::string Status::ToString() const { } else { char tmp[30]; const char* type; - switch (state_->first) { + switch (code()) { case kOk: type = "OK"; break; case kNotFound: - type = "NotFound"; + type = "NotFound: "; break; case kCorruption: type = "Corruption: "; @@ -44,14 +60,14 @@ std::string Status::ToString() const { break; default: snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", - static_cast(state_->first)); + static_cast(code())); type = tmp; break; } std::string result(type); - if (!state_->second.empty()) { - result.append(state_->second); - } + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(state_ + 5, length); return result; } } -- cgit v1.2.3 From c6ac22e779e5135e494ddeb1d8e2b6008e9b619e Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Sat, 28 May 2011 00:53:58 +0000 Subject: Update from upstream @21551990 * Patch LevelDB to build for OSX and iOS * Fix race condition in memtable iterator deletion. * Other small fixes. git-svn-id: http://leveldb.googlecode.com/svn/trunk@29 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 45 +++++++++++++++-- db/db_bench.cc | 20 ++++++++ db/db_impl.cc | 36 +++++++++++--- db/db_impl.h | 4 -- db/db_test.cc | 95 +++++++++++++++++++++++++++++++++++ db/log_test.cc | 2 +- db/memtable.cc | 33 +++++-------- port/port.h | 2 + port/port_osx.cc | 50 +++++++++++++++++++ port/port_osx.h | 125 +++++++++++++++++++++++++++++++++++++++++++++++ table/iterator.cc | 1 - table/iterator_wrapper.h | 11 ++--- util/cache.cc | 2 + 13 files changed, 381 insertions(+), 45 deletions(-) create mode 100644 port/port_osx.cc create mode 100644 port/port_osx.h diff --git a/Makefile b/Makefile index 43ac23d..5eadd72 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,21 @@ CC = g++ #OPT = -O2 -DNDEBUG OPT = -g2 -CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. -I./include -std=c++0x $(OPT) +UNAME := $(shell uname) + +ifeq ($(UNAME), Darwin) +# To build for iOS, set PLATFORM=IOS. +ifndef PLATFORM +PLATFORM=OSX +endif # PLATFORM +PLATFORM_CFLAGS = -DLEVELDB_PLATFORM_OSX +PORT_MODULE = port_osx.o +else # UNAME +PLATFORM_CFLAGS = -DLEVELDB_PLATFORM_POSIX -std=c++0x +PORT_MODULE = port_posix.o +endif # UNAME + +CFLAGS = -c -I. -I./include $(PLATFORM_CFLAGS) $(OPT) LDFLAGS=-lpthread @@ -26,7 +40,7 @@ LIBOBJECTS = \ ./db/version_edit.o \ ./db/version_set.o \ ./db/write_batch.o \ - ./port/port_posix.o \ + ./port/$(PORT_MODULE) \ ./table/block.o \ ./table/block_builder.o \ ./table/format.o \ @@ -69,13 +83,25 @@ TESTS = \ PROGRAMS = db_bench $(TESTS) -all: $(PROGRAMS) +LIBRARY = libleveldb.a + +ifeq ($(PLATFORM), IOS) +# Only XCode can build executable applications for iOS. 
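# (Editorial note, not part of the patch: with this Makefile the build mode
# is picked on the command line, e.g.
#   make                 builds $(PROGRAMS) and libleveldb.a on Linux/OSX
#   make PLATFORM=IOS    builds only libleveldb.a, as universal objects,
# since, as the comment above notes, only XCode can link iOS executables.)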
+all: $(LIBRARY) +else +all: $(PROGRAMS) $(LIBRARY) +endif check: $(TESTS) for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done clean: - rm -f $(PROGRAMS) */*.o + -rm -f $(PROGRAMS) $(LIBRARY) */*.o ios-x86/*/*.o ios-arm/*/*.o + -rmdir -p ios-x86/* ios-arm/* + +$(LIBRARY): $(LIBOBJECTS) + rm -f $@ + $(AR) -rs $@ $(LIBOBJECTS) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ @@ -122,8 +148,19 @@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ +ifeq ($(PLATFORM), IOS) +# For iOS, create universal object files to be used on both the simulator and +# a device. +.cc.o: + mkdir -p ios-x86/$(dir $@) + $(CC) $(CFLAGS) -isysroot /Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator4.3.sdk -arch i686 $< -o ios-x86/$@ + mkdir -p ios-arm/$(dir $@) + $(CC) $(CFLAGS) -isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.3.sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ + lipo ios-x86/$@ ios-arm/$@ -create -output $@ +else .cc.o: $(CC) $(CFLAGS) $< -o $@ +endif # TODO(gabor): dependencies for .o files # TODO(gabor): Build library diff --git a/db/db_bench.cc b/db/db_bench.cc index b5fd679..b24179d 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -29,6 +29,7 @@ // readrandom -- read N times in random order // readhot -- read N times in random order from 1% section of DB // crc32c -- repeated crc32c of 4K of data +// acquireload -- load N*1000 times // Meta operations: // compact -- Compact the entire DB // stats -- Print DB stats @@ -50,6 +51,7 @@ static const char* FLAGS_benchmarks = "crc32c," "snappycomp," "snappyuncomp," + "acquireload," ; // Number of key/values to place in database @@ -382,6 +384,8 @@ class Benchmark { Compact(); } else if (name == Slice("crc32c")) { Crc32c(4096, "(4K per op)"); + } else if (name == Slice("acquireload")) { + AcquireLoad(); } else if (name == Slice("snappycomp")) { SnappyCompress(); } else if (name == Slice("snappyuncomp")) { @@ -420,6 +424,22 @@ class Benchmark { message_ = label; } + void AcquireLoad() { + int dummy; + port::AtomicPointer ap(&dummy); + int count = 0; + void *ptr = NULL; + message_ = "(each op is 1000 loads)"; + while (count < 100000) { + for (int i = 0; i < 1000; i++) { + ptr = ap.Acquire_Load(); + } + count++; + FinishedSingleOp(); + } + if (ptr == NULL) exit(1); // Disable unused variable warning. 
+ } + void SnappyCompress() { Slice input = gen_.Generate(Options().block_size); int64_t bytes = 0; diff --git a/db/db_impl.cc b/db/db_impl.cc index baf9299..9b139ce 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -875,22 +875,49 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { return status; } +namespace { +struct IterState { + port::Mutex* mu; + Version* version; + MemTable* mem; + MemTable* imm; +}; + +static void CleanupIteratorState(void* arg1, void* arg2) { + IterState* state = reinterpret_cast(arg1); + state->mu->Lock(); + state->mem->Unref(); + if (state->imm != NULL) state->imm->Unref(); + state->version->Unref(); + state->mu->Unlock(); + delete state; +} +} + Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SequenceNumber* latest_snapshot) { + IterState* cleanup = new IterState; mutex_.Lock(); *latest_snapshot = versions_->LastSequence(); // Collect together all needed child iterators std::vector list; list.push_back(mem_->NewIterator()); + mem_->Ref(); if (imm_ != NULL) { list.push_back(imm_->NewIterator()); + imm_->Ref(); } versions_->current()->AddIterators(options, &list); Iterator* internal_iter = NewMergingIterator(&internal_comparator_, &list[0], list.size()); versions_->current()->Ref(); - internal_iter->RegisterCleanup(&DBImpl::Unref, this, versions_->current()); + + cleanup->mu = &mutex_; + cleanup->mem = mem_; + cleanup->imm = imm_; + cleanup->version = versions_->current(); + internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL); mutex_.Unlock(); return internal_iter; @@ -937,13 +964,6 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options) { : latest_snapshot)); } -void DBImpl::Unref(void* arg1, void* arg2) { - DBImpl* impl = reinterpret_cast(arg1); - Version* v = reinterpret_cast(arg2); - MutexLock l(&impl->mutex_); - v->Unref(); -} - const Snapshot* DBImpl::GetSnapshot() { MutexLock l(&mutex_); return snapshots_.New(versions_->LastSequence()); diff --git a/db/db_impl.h b/db/db_impl.h index 7699d8c..c23ae00 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -77,10 +77,6 @@ class DBImpl : public DB { // Delete any unneeded files and stale in-memory entries. void DeleteObsoleteFiles(); - // Called when an iterator over a particular version of the - // descriptor goes away. - static void Unref(void* arg1, void* arg2); - // Compact the in-memory write buffer to disk. Switches to a new // log-file/memtable and writes a new descriptor iff successful. Status CompactMemTable(); diff --git a/db/db_test.cc b/db/db_test.cc index 06565b2..42e70cf 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -801,6 +801,101 @@ TEST(DBTest, DBOpen_Options) { db = NULL; } +// Multi-threaded test: +namespace { + +static const int kNumThreads = 4; +static const int kTestSeconds = 10; +static const int kNumKeys = 1000; + +struct MTState { + DBTest* test; + port::AtomicPointer stop; + port::AtomicPointer counter[kNumThreads]; + port::AtomicPointer thread_done[kNumThreads]; +}; + +struct MTThread { + MTState* state; + int id; +}; + +static void MTThreadBody(void* arg) { + MTThread* t = reinterpret_cast(arg); + DB* db = t->state->test->db_; + uintptr_t counter = 0; + fprintf(stderr, "... 
starting thread %d\n", t->id); + Random rnd(1000 + t->id); + std::string value; + char valbuf[1500]; + while (t->state->stop.Acquire_Load() == NULL) { + t->state->counter[t->id].Release_Store(reinterpret_cast(counter)); + + int key = rnd.Uniform(kNumKeys); + char keybuf[20]; + snprintf(keybuf, sizeof(keybuf), "%016d", key); + + if (rnd.OneIn(2)) { + // Write values of the form . + // We add some padding for force compactions. + snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d", + key, t->id, static_cast(counter)); + ASSERT_OK(db->Put(WriteOptions(), Slice(keybuf), Slice(valbuf))); + } else { + // Read a value and verify that it matches the pattern written above. + Status s = db->Get(ReadOptions(), Slice(keybuf), &value); + if (s.IsNotFound()) { + // Key has not yet been written + } else { + // Check that the writer thread counter is >= the counter in the value + ASSERT_OK(s); + int k, w, c; + ASSERT_EQ(3, sscanf(value.c_str(), "%d.%d.%d", &k, &w, &c)) << value; + ASSERT_EQ(k, key); + ASSERT_GE(w, 0); + ASSERT_LT(w, kNumThreads); + ASSERT_LE(c, reinterpret_cast( + t->state->counter[w].Acquire_Load())); + } + } + counter++; + } + t->state->thread_done[t->id].Release_Store(t); + fprintf(stderr, "... stopping thread %d after %d ops\n", t->id, int(counter)); +} + +} + +TEST(DBTest, MultiThreaded) { + // Initialize state + MTState mt; + mt.test = this; + mt.stop.Release_Store(0); + for (int id = 0; id < kNumThreads; id++) { + mt.counter[id].Release_Store(0); + mt.thread_done[id].Release_Store(0); + } + + // Start threads + MTThread thread[kNumThreads]; + for (int id = 0; id < kNumThreads; id++) { + thread[id].state = &mt; + thread[id].id = id; + env_->StartThread(MTThreadBody, &thread[id]); + } + + // Let them run for a while + env_->SleepForMicroseconds(kTestSeconds * 1000000); + + // Stop the threads and wait for them to finish + mt.stop.Release_Store(&mt); + for (int id = 0; id < kNumThreads; id++) { + while (mt.thread_done[id].Acquire_Load() == NULL) { + env_->SleepForMicroseconds(100000); + } + } +} + namespace { typedef std::map KVMap; } diff --git a/db/log_test.cc b/db/log_test.cc index 040bdff..06e0893 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -76,7 +76,7 @@ class LogTest { return Status::OK(); } - virtual Status Skip(size_t n) { + virtual Status Skip(uint64_t n) { if (n > contents_.size()) { contents_.clear(); return Status::NotFound("in-memory file skipepd past end"); diff --git a/db/memtable.cc b/db/memtable.cc index 9c25f6d..687900a 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -50,33 +50,24 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: - explicit MemTableIterator(MemTable* mem, MemTable::Table* table) { - mem_ = mem; - iter_ = new MemTable::Table::Iterator(table); - mem->Ref(); - } - virtual ~MemTableIterator() { - delete iter_; - mem_->Unref(); - } - - virtual bool Valid() const { return iter_->Valid(); } - virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void SeekToLast() { iter_->SeekToLast(); } - virtual void Next() { iter_->Next(); } - virtual void Prev() { iter_->Prev(); } - virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); } + explicit MemTableIterator(MemTable::Table* table) : iter_(table) { } + + virtual bool Valid() const { return iter_.Valid(); } + virtual void Seek(const Slice& k) { iter_.Seek(EncodeKey(&tmp_, k)); } + virtual void SeekToFirst() { iter_.SeekToFirst(); 
} + virtual void SeekToLast() { iter_.SeekToLast(); } + virtual void Next() { iter_.Next(); } + virtual void Prev() { iter_.Prev(); } + virtual Slice key() const { return GetLengthPrefixedSlice(iter_.key()); } virtual Slice value() const { - Slice key_slice = GetLengthPrefixedSlice(iter_->key()); + Slice key_slice = GetLengthPrefixedSlice(iter_.key()); return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); } virtual Status status() const { return Status::OK(); } private: - MemTable* mem_; - MemTable::Table::Iterator* iter_; + MemTable::Table::Iterator iter_; std::string tmp_; // For passing to EncodeKey // No copying allowed @@ -85,7 +76,7 @@ class MemTableIterator: public Iterator { }; Iterator* MemTable::NewIterator() { - return new MemTableIterator(this, &table_); + return new MemTableIterator(&table_); } void MemTable::Add(SequenceNumber s, ValueType type, diff --git a/port/port.h b/port/port.h index 816826b..e35db23 100644 --- a/port/port.h +++ b/port/port.h @@ -16,6 +16,8 @@ # include "port/port_chromium.h" #elif defined(LEVELDB_PLATFORM_ANDROID) # include "port/port_android.h" +#elif defined(LEVELDB_PLATFORM_OSX) +# include "port/port_osx.h" #endif #endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/port/port_osx.cc b/port/port_osx.cc new file mode 100644 index 0000000..4ab9e31 --- /dev/null +++ b/port/port_osx.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "port_osx.h" + +#include +#include +#include +#include "util/logging.h" + +namespace leveldb { +namespace port { + +static void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } + +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } + +void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } + +void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); +} + +CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } + +void CondVar::Wait() { + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +} + +void CondVar::Signal() { + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +} +} diff --git a/port/port_osx.h b/port/port_osx.h new file mode 100644 index 0000000..5524c6c --- /dev/null +++ b/port/port_osx.h @@ -0,0 +1,125 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#ifndef STORAGE_LEVELDB_PORT_PORT_OSX_H_ +#define STORAGE_LEVELDB_PORT_PORT_OSX_H_ + +#include +#include +#include +#include + +#include + +namespace leveldb { + +// The following 4 methods implemented here for the benefit of env_posix.cc. 
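// (Editorial note, not part of the patch: OS X's libc has no *_unlocked
// stdio variants and no fdatasync(), so the wrappers below forward to the
// plain locking calls and to fsync(); behavior is identical, only the
// glibc micro-optimization is lost.)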
+inline size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { + return fread(a, b, c, d); +} + +inline size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { + return fwrite(a, b, c, d); +} + +inline int fflush_unlocked(FILE *f) { + return fflush(f); +} + +inline int fdatasync(int fd) { + return fsync(fd); +} + +namespace port { + +static const bool kLittleEndian = (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN); + +// ------------------ Threading ------------------- + +// A Mutex represents an exclusive lock. +class Mutex { + public: + Mutex(); + ~Mutex(); + + void Lock(); + void Unlock(); + void AssertHeld() { } + + private: + friend class CondVar; + pthread_mutex_t mu_; + + // No copying + Mutex(const Mutex&); + void operator=(const Mutex&); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + + void Wait(); + void Signal(); + void SignalAll(); + + private: + pthread_cond_t cv_; + Mutex* mu_; +}; + +inline void MemoryBarrier() { +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on + // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. + __asm__ __volatile__("" : : : "memory"); +#else + OSMemoryBarrier(); +#endif +} + +class AtomicPointer { + private: + void* ptr_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* p) : ptr_(p) {} + inline void* Acquire_Load() const { + void* ptr = ptr_; + MemoryBarrier(); + return ptr; + } + inline void Release_Store(void* v) { + MemoryBarrier(); + ptr_ = v; + } + inline void* NoBarrier_Load() const { + return ptr_; + } + inline void NoBarrier_Store(void* v) { + ptr_ = v; + } +}; + +inline bool Snappy_Compress(const char* input, size_t input_length, + std::string* output) { + return false; +} + +inline bool Snappy_Uncompress(const char* input_data, size_t input_length, + std::string* output) { + return false; +} + +inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { + return false; +} + +} +} + +#endif // STORAGE_LEVELDB_PORT_PORT_OSX_H_ diff --git a/table/iterator.cc b/table/iterator.cc index 4ddd55f..33bc8a2 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -3,7 +3,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "leveldb/iterator.h" -#include "util/logging.h" namespace leveldb { diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index 158d3a7..d8ca2b3 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -12,10 +12,6 @@ namespace leveldb { // This can help avoid virtual function calls and also gives better // cache locality. 
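// (Editorial note, not part of the patch: the point of the class below is
// that valid_ and key_ are cached inline, so hot loops such as the heap
// comparisons in the merging iterator can call Valid() and key() without
// a virtual dispatch per call; Update() refreshes the cache after every
// operation that may move the underlying iterator.)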
class IteratorWrapper { - private: - Iterator* iter_; - bool valid_; - Slice key_; public: IteratorWrapper(): iter_(NULL), valid_(false) { } explicit IteratorWrapper(Iterator* iter): iter_(NULL) { @@ -56,9 +52,12 @@ class IteratorWrapper { key_ = iter_->key(); } } -}; -} + Iterator* iter_; + bool valid_; + Slice key_; +}; +} // namespace leveldb #endif // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ diff --git a/util/cache.cc b/util/cache.cc index d8a4426..968e6a0 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -4,6 +4,8 @@ #if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) #include +#elif defined(LEVELDB_PLATFORM_OSX) +#include #elif defined(LEVELDB_PLATFORM_CHROMIUM) #include "base/hash_tables.h" #else -- cgit v1.2.3 From ce5ba3cdead24077a2f07212d93f867a45dd79da Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Thu, 2 Jun 2011 00:00:37 +0000 Subject: sync with upstream @21627589 Minor changes: * Reformat the bodies of the iterator interface routines in IteratorWrapper to make them a bit easier to read * Switched the default in the leveldb makefile to be optimized mode, rather than debug mode * Fix build problem in chromium port git-svn-id: http://leveldb.googlecode.com/svn/trunk@30 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 12 +++++++++--- leveldb.gyp | 3 +++ table/iterator_wrapper.h | 18 +++++++++--------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 5eadd72..921b71c 100644 --- a/Makefile +++ b/Makefile @@ -4,9 +4,15 @@ CC = g++ -# Uncomment one of the following to switch between debug and opt mode -#OPT = -O2 -DNDEBUG -OPT = -g2 +#----------------------------------------------- +# Uncomment exactly one of the lines labelled (A), (B), and (C) below +# to switch between compilation modes. + +OPT = -O2 -DNDEBUG # (A) Production use (optimized mode) +# OPT = -g2 # (B) Debug mode, w/ full line-level debugging symbols +# OPT = -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols +#----------------------------------------------- + UNAME := $(shell uname) diff --git a/leveldb.gyp b/leveldb.gyp index 8376592..ea634a2 100644 --- a/leveldb.gyp +++ b/leveldb.gyp @@ -35,6 +35,9 @@ # The base libary is a lightweight abstraction layer for things like # threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/ '../../base/base.gyp:base', + # base::LazyInstance is a template that pulls in dynamic_annotations so + # we need to explictly link in the code for dynamic_annotations. 
+ '../../base/third_party/dynamic_annotations/dynamic_annotations.gyp:dynamic_annotations', ], 'conditions': [ ['use_snappy', { diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index d8ca2b3..9e16b3d 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -34,16 +34,16 @@ class IteratorWrapper { // Iterator interface methods - bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } - Slice value() const { assert(Valid()); return iter_->value(); } + bool Valid() const { return valid_; } + Slice key() const { assert(Valid()); return key_; } + Slice value() const { assert(Valid()); return iter_->value(); } // Methods below require iter() != NULL - Status status() const { assert(iter_); return iter_->status(); } - void Next() { assert(iter_); iter_->Next(); Update(); } - void Prev() { assert(iter_); iter_->Prev(); Update(); } - void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } - void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } - void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + Status status() const { assert(iter_); return iter_->status(); } + void Next() { assert(iter_); iter_->Next(); Update(); } + void Prev() { assert(iter_); iter_->Prev(); Update(); } + void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } + void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } private: void Update() { -- cgit v1.2.3 From 6635e49a8999ab5e411d5227146a3db17fac2944 Mon Sep 17 00:00:00 2001 From: "hans@chromium.org" Date: Tue, 7 Jun 2011 14:40:26 +0000 Subject: sync with upstream @21706995 Fixed race condition reported by Dave Smit (dizzyd@dizzyd,com) on the leveldb mailing list. We were not signalling waiters after a trivial move from level-0. The result was that in some cases (hard to reproduce), a write would get stuck forever waiting for the number of level-0 files to drop below its hard limit. The new code is simpler: there is just one condition variable instead of two, and the condition variable is signalled after every piece of background work finishes. Also, all compaction work (including for manual compactions) is done in the background thread, and therefore we can remove the "compacting_" variable. 
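[Editorial note: the fix this message describes reduces to the standard single-condvar discipline: each waiter re-checks its own predicate in a loop, and the background thread signals everyone after each unit of work. A condensed sketch using this tree's port primitives; the work_done flag is an illustrative stand-in for the real predicates such as imm_ == NULL or the level-0 file count:]

    #include "port/port.h"

    leveldb::port::Mutex mu;
    leveldb::port::CondVar bg_cv(&mu);  // signalled after background work
    bool work_done = false;             // stand-in predicate (illustration)

    void BackgroundThread() {
      mu.Lock();
      // ... perform one piece of background work (e.g. a compaction) ...
      work_done = true;
      bg_cv.SignalAll();  // wake all waiters; each re-checks its predicate
      mu.Unlock();
    }

    void Waiter() {
      mu.Lock();
      while (!work_done) {  // loop: a wakeup only means "state changed"
        bg_cv.Wait();       // releases mu while blocked, reacquires after
      }
      mu.Unlock();
    }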
git-svn-id: http://leveldb.googlecode.com/svn/trunk@31 62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_impl.cc | 91 +++++++++++++++++++++++++++++------------------------------ db/db_impl.h | 12 +++++--- 2 files changed, 53 insertions(+), 50 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 9b139ce..abcc761 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -119,13 +119,12 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) db_lock_(NULL), shutting_down_(NULL), bg_cv_(&mutex_), - compacting_cv_(&mutex_), mem_(new MemTable(internal_comparator_)), imm_(NULL), logfile_(NULL), log_(NULL), bg_compaction_scheduled_(false), - compacting_(false) { + manual_compaction_(NULL) { mem_->Ref(); has_imm_.Release_Store(NULL); @@ -141,10 +140,8 @@ DBImpl::~DBImpl() { // Wait for background work to finish mutex_.Lock(); shutting_down_.Release_Store(this); // Any non-NULL value is ok - if (bg_compaction_scheduled_) { - while (bg_compaction_scheduled_) { - bg_cv_.Wait(); - } + while (bg_compaction_scheduled_) { + bg_cv_.Wait(); } mutex_.Unlock(); @@ -437,7 +434,6 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { Status DBImpl::CompactMemTable() { mutex_.AssertHeld(); assert(imm_ != NULL); - assert(compacting_); // Save the contents of the memtable as a new Table VersionEdit edit; @@ -457,7 +453,6 @@ Status DBImpl::CompactMemTable() { DeleteObsoleteFiles(); } - compacting_cv_.SignalAll(); // Wake up waiter even if there was an error return s; } @@ -466,22 +461,18 @@ void DBImpl::TEST_CompactRange( const std::string& begin, const std::string& end) { MutexLock l(&mutex_); - while (compacting_) { - compacting_cv_.Wait(); + while (manual_compaction_ != NULL) { + bg_cv_.Wait(); } - Compaction* c = versions_->CompactRange( - level, - InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek), - InternalKey(end, 0, static_cast(0))); - - if (c != NULL) { - CompactionState* compact = new CompactionState(c); - DoCompactionWork(compact); // Ignore error in test compaction - CleanupCompaction(compact); - } - - // Start any background compaction that may have been delayed by this thread + ManualCompaction manual; + manual.level = level; + manual.begin = begin; + manual.end = end; + manual_compaction_ = &manual; MaybeScheduleCompaction(); + while (manual_compaction_ == &manual) { + bg_cv_.Wait(); + } } Status DBImpl::TEST_CompactMemTable() { @@ -490,7 +481,7 @@ Status DBImpl::TEST_CompactMemTable() { if (s.ok()) { // Wait until the compaction completes while (imm_ != NULL && bg_error_.ok()) { - compacting_cv_.Wait(); + bg_cv_.Wait(); } if (imm_ != NULL) { s = bg_error_; @@ -503,11 +494,11 @@ void DBImpl::MaybeScheduleCompaction() { mutex_.AssertHeld(); if (bg_compaction_scheduled_) { // Already scheduled - } else if (compacting_) { - // Some other thread is running a compaction. Do not conflict with it. 
} else if (shutting_down_.Acquire_Load()) { // DB is being deleted; no more background compactions - } else if (imm_ == NULL && !versions_->NeedsCompaction()) { + } else if (imm_ == NULL && + manual_compaction_ == NULL && + !versions_->NeedsCompaction()) { // No work to be done } else { bg_compaction_scheduled_ = true; @@ -522,38 +513,41 @@ void DBImpl::BGWork(void* db) { void DBImpl::BackgroundCall() { MutexLock l(&mutex_); assert(bg_compaction_scheduled_); - if (!shutting_down_.Acquire_Load() && - !compacting_) { + if (!shutting_down_.Acquire_Load()) { BackgroundCompaction(); } bg_compaction_scheduled_ = false; - bg_cv_.SignalAll(); // Previous compaction may have produced too many files in a level, // so reschedule another compaction if needed. MaybeScheduleCompaction(); + bg_cv_.SignalAll(); } void DBImpl::BackgroundCompaction() { mutex_.AssertHeld(); - assert(!compacting_); if (imm_ != NULL) { - compacting_ = true; CompactMemTable(); - compacting_ = false; - compacting_cv_.SignalAll(); return; } - Compaction* c = versions_->PickCompaction(); - if (c == NULL) { - // Nothing to do - return; + Compaction* c; + bool is_manual = (manual_compaction_ != NULL); + if (is_manual) { + const ManualCompaction* m = manual_compaction_; + c = versions_->CompactRange( + m->level, + InternalKey(m->begin, kMaxSequenceNumber, kValueTypeForSeek), + InternalKey(m->end, 0, static_cast(0))); + } else { + c = versions_->PickCompaction(); } Status status; - if (c->IsTrivialMove()) { + if (c == NULL) { + // Nothing to do + } else if (!is_manual && c->IsTrivialMove()) { // Move file to next level assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); @@ -561,11 +555,13 @@ void DBImpl::BackgroundCompaction() { c->edit()->AddFile(c->level() + 1, f->number, f->file_size, f->smallest, f->largest); status = versions_->LogAndApply(c->edit()); - Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n", + VersionSet::LevelSummaryStorage tmp; + Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", static_cast(f->number), c->level() + 1, static_cast(f->file_size), - status.ToString().c_str()); + status.ToString().c_str(), + versions_->LevelSummary(&tmp)); } else { CompactionState* compact = new CompactionState(c); status = DoCompactionWork(compact); @@ -584,6 +580,11 @@ void DBImpl::BackgroundCompaction() { bg_error_ = status; } } + + if (is_manual) { + // Mark it as done + manual_compaction_ = NULL; + } } void DBImpl::CleanupCompaction(CompactionState* compact) { @@ -734,7 +735,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } // Release mutex while we're actually doing the compaction work - compacting_ = true; mutex_.Unlock(); Iterator* input = versions_->MakeInputIterator(compact->compaction); @@ -751,7 +751,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { mutex_.Lock(); if (imm_ != NULL) { CompactMemTable(); - compacting_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary + bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary } mutex_.Unlock(); imm_micros += (env_->NowMicros() - imm_start); @@ -867,8 +867,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { if (status.ok()) { status = InstallCompactionResults(compact); } - compacting_ = false; - compacting_cv_.SignalAll(); VersionSet::LevelSummaryStorage tmp; Log(env_, options_.info_log, "compacted to: %s", versions_->LevelSummary(&tmp)); @@ -1038,10 +1036,11 @@ Status DBImpl::MakeRoomForWrite(bool force) { } else if (imm_ != NULL) { // We have filled up the 
current memtable, but the previous // one is still being compacted, so we wait. - compacting_cv_.Wait(); + bg_cv_.Wait(); } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) { // There are too many level-0 files. - compacting_cv_.Wait(); + Log(env_, options_.info_log, "waiting...\n"); + bg_cv_.Wait(); } else { // Attempt to switch to a new memtable and trigger compaction of old assert(versions_->PrevLogNumber() == 0); diff --git a/db/db_impl.h b/db/db_impl.h index c23ae00..84ce154 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -119,8 +119,7 @@ class DBImpl : public DB { // State below is protected by mutex_ port::Mutex mutex_; port::AtomicPointer shutting_down_; - port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_ - port::CondVar compacting_cv_; // Signalled when !compacting_ + port::CondVar bg_cv_; // Signalled when background work finishes MemTable* mem_; MemTable* imm_; // Memtable being compacted port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_ @@ -135,8 +134,13 @@ class DBImpl : public DB { // Has a background compaction been scheduled or is running? bool bg_compaction_scheduled_; - // Is there a compaction running? - bool compacting_; + // Information for a manual compaction + struct ManualCompaction { + int level; + std::string begin; + std::string end; + }; + ManualCompaction* manual_compaction_; VersionSet* versions_; -- cgit v1.2.3 From 8cd4ab8303620197cf24282ae8639060efbb326e Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Wed, 22 Jun 2011 02:36:45 +0000 Subject: A number of smaller fixes and performance improvements: - Implemented Get() directly instead of building on top of a full merging iterator stack. This speeds up the "readrandom" benchmark by up to 15-30%. - Fixed an opensource compilation problem. Added --db= flag to control where the database is placed. - Automatically compact a file when we have done enough overlapping seeks to that file. - Fixed a performance bug where we would read from at least one file in a level even if none of the files overlapped the key being read. - Makefile fix for Mac OSX installations that have XCode 4 without XCode 3. - Unified the two occurrences of binary search in a file-list into one routine. - Found and fixed a bug where we would unnecessarily search the last file when looking for a key larger than all data in the level. - A fix to avoid the need for trivial move compactions and therefore gets rid of two out of five syncs in "fillseq". - Removed the MANIFEST file write when switching to a new memtable/log-file for a 10-20% improvement on fill speed on ext4. - Adding a SNAPPY setting in the Makefile for folks who have Snappy installed. Snappy compresses values and speeds up writes. 
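[Editorial note: a sketch of the "enough overlapping seeks" bookkeeping described above. The allowed_seeks field and the one-seek-per-16KB budget are assumptions modeled on the leveldb sources of this period, not text from this message:]

    #include <stdint.h>

    struct FileMetaData {          // simplified; see db/version_edit.h
      uint64_t number;
      uint64_t file_size;
      int allowed_seeks;           // seeks remaining until auto-compaction
    };

    void InitAllowedSeeks(FileMetaData* f) {
      // Budget roughly one free seek per 16KB of file, but at least 100,
      // so one wasted seek costs about as much as compacting 16KB.
      f->allowed_seeks = static_cast<int>(f->file_size / 16384);
      if (f->allowed_seeks < 100) f->allowed_seeks = 100;
    }

    bool SeekTouchedFile(FileMetaData* f) {
      // Returns true once the file has absorbed enough overlapping seeks
      // that compacting it becomes cheaper than leaving it in place.
      return --f->allowed_seeks <= 0;
    }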
git-svn-id: http://leveldb.googlecode.com/svn/trunk@32 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 28 ++++- TODO | 7 +- db/builder.cc | 6 +- db/builder.h | 10 +- db/corruption_test.cc | 31 +++--- db/db_bench.cc | 19 ++-- db/db_impl.cc | 135 +++++++++++++++++------- db/db_impl.h | 3 +- db/db_test.cc | 219 +++++++++++++++++++++++++++++++++++---- db/dbformat.cc | 19 ++++ db/dbformat.h | 40 +++++++ db/memtable.cc | 37 +++++++ db/memtable.h | 6 ++ db/repair.cc | 6 +- db/version_edit.h | 3 +- db/version_set.cc | 281 ++++++++++++++++++++++++++++++++++++++++++-------- db/version_set.h | 49 ++++++++- port/port_posix.h | 22 +++- table/table_test.cc | 14 ++- 19 files changed, 783 insertions(+), 152 deletions(-) diff --git a/Makefile b/Makefile index 921b71c..84f77ab 100644 --- a/Makefile +++ b/Makefile @@ -28,9 +28,22 @@ PLATFORM_CFLAGS = -DLEVELDB_PLATFORM_POSIX -std=c++0x PORT_MODULE = port_posix.o endif # UNAME -CFLAGS = -c -I. -I./include $(PLATFORM_CFLAGS) $(OPT) +# Set 'SNAPPY' to 1 if you have the Snappy compression library +# installed and want to enable its use in LevelDB +# (see http://code.google.com/p/snappy/) +SNAPPY=0 + +ifeq ($(SNAPPY), 0) +SNAPPY_CFLAGS= +SNAPPY_LDFLAGS= +else +SNAPPY_CFLAGS=-DSNAPPY +SNAPPY_LDFLAGS=-lsnappy +endif -LDFLAGS=-lpthread +CFLAGS = -c -I. -I./include $(PLATFORM_CFLAGS) $(OPT) $(SNAPPY_CFLAGS) + +LDFLAGS=-lpthread $(SNAPPY_LDFLAGS) LIBOBJECTS = \ ./db/builder.o \ @@ -85,6 +98,7 @@ TESTS = \ skiplist_test \ table_test \ version_edit_test \ + version_set_test \ write_batch_test PROGRAMS = db_bench $(TESTS) @@ -151,17 +165,23 @@ skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ +version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ ifeq ($(PLATFORM), IOS) # For iOS, create universal object files to be used on both the simulator and # a device. +SIMULATORROOT=/Developer/Platforms/iPhoneSimulator.platform/Developer +DEVICEROOT=/Developer/Platforms/iPhoneOS.platform/Developer +IOSVERSION=$(shell defaults read /Developer/Platforms/iPhoneOS.platform/version CFBundleShortVersionString) .cc.o: mkdir -p ios-x86/$(dir $@) - $(CC) $(CFLAGS) -isysroot /Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator4.3.sdk -arch i686 $< -o ios-x86/$@ + $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) - $(CC) $(CFLAGS) -isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.3.sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ + $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ else .cc.o: diff --git a/TODO b/TODO index ce81439..9130b6a 100644 --- a/TODO +++ b/TODO @@ -8,7 +8,6 @@ db object stores, etc. can be done in the background anyway, so probably not that important. -api changes: -- Make it wrappable - -Faster Get implementation +After a range is completely deleted, what gets rid of the +corresponding files if we do no future changes to that range. 
Make +the conditions for triggering compactions fire in more situations? diff --git a/db/builder.cc b/db/builder.cc index 9f132d7..34a7b87 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -19,8 +19,7 @@ Status BuildTable(const std::string& dbname, const Options& options, TableCache* table_cache, Iterator* iter, - FileMetaData* meta, - VersionEdit* edit) { + FileMetaData* meta) { Status s; meta->file_size = 0; iter->SeekToFirst(); @@ -79,8 +78,7 @@ Status BuildTable(const std::string& dbname, } if (s.ok() && meta->file_size > 0) { - edit->AddFile(0, meta->number, meta->file_size, - meta->smallest, meta->largest); + // Keep it } else { env->DeleteFile(fname); } diff --git a/db/builder.h b/db/builder.h index 5dd17b6..b2aeabf 100644 --- a/db/builder.h +++ b/db/builder.h @@ -19,17 +19,15 @@ class VersionEdit; // Build a Table file from the contents of *iter. The generated file // will be named according to meta->number. On success, the rest of -// *meta will be filled with metadata about the generated table, and -// the file information will be added to *edit. If no data is present -// in *iter, meta->file_size will be set to zero, and no Table file -// will be produced. +// *meta will be filled with metadata about the generated table. +// If no data is present in *iter, meta->file_size will be set to +// zero, and no Table file will be produced. extern Status BuildTable(const std::string& dbname, Env* env, const Options& options, TableCache* table_cache, Iterator* iter, - FileMetaData* meta, - VersionEdit* edit); + FileMetaData* meta); } diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 12d176e..8015101 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -27,13 +27,12 @@ static const int kValueSize = 1000; class CorruptionTest { public: test::ErrorEnv env_; - Random rnd_; std::string dbname_; Cache* tiny_cache_; Options options_; DB* db_; - CorruptionTest() : rnd_(test::RandomSeed()) { + CorruptionTest() { tiny_cache_ = NewLRUCache(100); options_.env = &env_; dbname_ = test::TmpDir() + "/db_test"; @@ -122,15 +121,17 @@ class CorruptionTest { ASSERT_OK(env_.GetChildren(dbname_, &filenames)); uint64_t number; FileType type; - std::vector candidates; + std::string fname; + int picked_number = -1; for (int i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type) && - type == filetype) { - candidates.push_back(dbname_ + "/" + filenames[i]); + type == filetype && + int(number) > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; } } - ASSERT_TRUE(!candidates.empty()) << filetype; - std::string fname = candidates[rnd_.Uniform(candidates.size())]; + ASSERT_TRUE(!fname.empty()) << filetype; struct stat sbuf; if (stat(fname.c_str(), &sbuf) != 0) { @@ -239,8 +240,6 @@ TEST(CorruptionTest, TableFileIndexData) { Build(10000); // Enough to build multiple Tables DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - dbi->TEST_CompactRange(1, "", "~"); Corrupt(kTableFile, -2000, 500); Reopen(); @@ -296,7 +295,8 @@ TEST(CorruptionTest, CompactionInputError) { Build(10); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); - ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); + const int last = config::kNumLevels - 1; + ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last))); Corrupt(kTableFile, 100, 1); Check(9, 9); @@ -304,8 +304,6 @@ TEST(CorruptionTest, CompactionInputError) { // Force compactions by writing lots 
of values Build(10000); Check(10000, 10000); - dbi->TEST_CompactRange(0, "", "~"); - ASSERT_EQ(0, Property("leveldb.num-files-at-level0")); } TEST(CorruptionTest, CompactionInputErrorParanoid) { @@ -313,9 +311,16 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) { options.paranoid_checks = true; options.write_buffer_size = 1048576; Reopen(&options); + DBImpl* dbi = reinterpret_cast(db_); + + // Fill levels >= 1 so memtable compaction outputs to level 1 + for (int level = 1; level < config::kNumLevels; level++) { + dbi->Put(WriteOptions(), "", "begin"); + dbi->Put(WriteOptions(), "~", "end"); + dbi->TEST_CompactMemTable(); + } Build(10); - DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); diff --git a/db/db_bench.cc b/db/db_bench.cc index b24179d..53b8c53 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -86,6 +86,9 @@ static int FLAGS_open_files = 0; // benchmark will fail. static bool FLAGS_use_existing_db = false; +// Use the db with the following name. +static const char* FLAGS_db = "/tmp/dbbench"; + namespace leveldb { // Helper for quickly generating random data. @@ -318,14 +321,14 @@ class Benchmark { bytes_(0), rand_(301) { std::vector files; - Env::Default()->GetChildren("/tmp/dbbench", &files); + Env::Default()->GetChildren(FLAGS_db, &files); for (int i = 0; i < files.size(); i++) { if (Slice(files[i]).starts_with("heap-")) { - Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]); + Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]); } } if (!FLAGS_use_existing_db) { - DestroyDB("/tmp/dbbench", Options()); + DestroyDB(FLAGS_db, Options()); } } @@ -364,7 +367,7 @@ class Benchmark { Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1); } else if (name == Slice("fillsync")) { write_options.sync = true; - Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1); + Write(write_options, RANDOM, FRESH, num_ / 1000, FLAGS_value_size, 1); } else if (name == Slice("fill100K")) { Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1); } else if (name == Slice("readseq")) { @@ -490,7 +493,7 @@ class Benchmark { options.create_if_missing = !FLAGS_use_existing_db; options.block_cache = cache_; options.write_buffer_size = FLAGS_write_buffer_size; - Status s = DB::Open(options, "/tmp/dbbench", &db_); + Status s = DB::Open(options, FLAGS_db, &db_); if (!s.ok()) { fprintf(stderr, "open error: %s\n", s.ToString().c_str()); exit(1); @@ -506,7 +509,7 @@ class Benchmark { } delete db_; db_ = NULL; - DestroyDB("/tmp/dbbench", Options()); + DestroyDB(FLAGS_db, Options()); Open(); Start(); // Do not count time taken to destroy/open } @@ -617,7 +620,7 @@ class Benchmark { void HeapProfile() { char fname[100]; - snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_); + snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_); WritableFile* file; Status s = Env::Default()->NewWritableFile(fname, &file); if (!s.ok()) { @@ -665,6 +668,8 @@ int main(int argc, char** argv) { FLAGS_cache_size = n; } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) { FLAGS_open_files = n; + } else if (strncmp(argv[i], "--db=", 5) == 0) { + FLAGS_db = argv[i] + 5; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/db/db_impl.cc b/db/db_impl.cc index abcc761..7556d5a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -122,6 +122,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) mem_(new 
MemTable(internal_comparator_)), imm_(NULL), logfile_(NULL), + logfile_number_(0), log_(NULL), bg_compaction_scheduled_(false), manual_compaction_(NULL) { @@ -219,7 +220,7 @@ void DBImpl::DeleteObsoleteFiles() { bool keep = true; switch (type) { case kLogFile: - keep = ((number == versions_->LogNumber()) || + keep = ((number >= versions_->LogNumber()) || (number == versions_->PrevLogNumber())); break; case kDescriptorFile: @@ -287,14 +288,39 @@ Status DBImpl::Recover(VersionEdit* edit) { s = versions_->Recover(); if (s.ok()) { - // Recover from the log files named in the descriptor SequenceNumber max_sequence(0); - if (versions_->PrevLogNumber() != 0) { // log#==0 means no prev log - s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence); + + // Recover from all newer log files than the ones named in the + // descriptor (new log files may have been added by the previous + // incarnation without registering them in the descriptor). + // + // Note that PrevLogNumber() is no longer used, but we pay + // attention to it in case we are recovering a database + // produced by an older version of leveldb. + const uint64_t min_log = versions_->LogNumber(); + const uint64_t prev_log = versions_->PrevLogNumber(); + std::vector filenames; + s = env_->GetChildren(dbname_, &filenames); + if (!s.ok()) { + return s; + } + uint64_t number; + FileType type; + std::vector logs; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) + && type == kLogFile + && ((number >= min_log) || (number == prev_log))) { + logs.push_back(number); + } } - if (s.ok() && versions_->LogNumber() != 0) { // log#==0 for initial state - s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence); + + // Recover in the order in which the logs were generated + std::sort(logs.begin(), logs.end()); + for (size_t i = 0; i < logs.size(); i++) { + s = RecoverLogFile(logs[i], edit, &max_sequence); } + if (s.ok()) { if (versions_->LastSequence() < max_sequence) { versions_->SetLastSequence(max_sequence); @@ -378,7 +404,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, } if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { - status = WriteLevel0Table(mem, edit); + status = WriteLevel0Table(mem, edit, NULL); if (!status.ok()) { // Reflect errors immediately so that conditions like full // file-systems cause the DB::Open() to fail. @@ -390,7 +416,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, } if (status.ok() && mem != NULL) { - status = WriteLevel0Table(mem, edit); + status = WriteLevel0Table(mem, edit, NULL); // Reflect errors immediately so that conditions like full // file-systems cause the DB::Open() to fail. 
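
The recovery hunk above stops trusting the descriptor alone: it lists the database directory, keeps every log numbered >= LogNumber() (plus a legacy PrevLogNumber() file), and replays them sorted so edits apply in the order they were written. A small self-contained sketch of just that selection step, with made-up log numbers:

    #include <algorithm>
    #include <cstdio>
    #include <stdint.h>
    #include <vector>

    int main() {
      // Suppose the directory contains these log numbers and the
      // descriptor says min_log = 5, prev_log = 0.
      uint64_t found[] = { 3, 7, 5, 9 };
      const uint64_t min_log = 5, prev_log = 0;
      std::vector<uint64_t> logs;
      for (size_t i = 0; i < 4; i++) {
        if (found[i] >= min_log || found[i] == prev_log) logs.push_back(found[i]);
      }
      std::sort(logs.begin(), logs.end());   // replay oldest first
      for (size_t i = 0; i < logs.size(); i++) {
        printf("replay %06llu.log\n", (unsigned long long)logs[i]);  // 5, 7, 9
      }
      return 0;
    }
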
} @@ -400,7 +426,8 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, return status; } -Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { +Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, + Version* base) { mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; @@ -413,7 +440,7 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { Status s; { mutex_.Unlock(); - s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit); + s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); mutex_.Lock(); } @@ -424,10 +451,26 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) { delete iter; pending_outputs_.erase(meta.number); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. + int level = 0; + if (s.ok() && meta.file_size > 0) { + if (base != NULL && !base->OverlapInLevel(0, meta.smallest, meta.largest)) { + // Push to largest level we can without causing overlaps + while (level + 1 < config::kNumLevels && + !base->OverlapInLevel(level + 1, meta.smallest, meta.largest)) { + level++; + } + } + edit->AddFile(level, meta.number, meta.file_size, + meta.smallest, meta.largest); + } + CompactionStats stats; stats.micros = env_->NowMicros() - start_micros; stats.bytes_written = meta.file_size; - stats_[0].Add(stats); + stats_[level].Add(stats); return s; } @@ -437,11 +480,19 @@ Status DBImpl::CompactMemTable() { // Save the contents of the memtable as a new Table VersionEdit edit; - Status s = WriteLevel0Table(imm_, &edit); + Version* base = versions_->current(); + base->Ref(); + Status s = WriteLevel0Table(imm_, &edit, base); + base->Unref(); + + if (s.ok() && shutting_down_.Acquire_Load()) { + s = Status::IOError("Deleting DB during memtable compaction"); + } // Replace immutable memtable with the generated Table if (s.ok()) { edit.SetPrevLogNumber(0); + edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed s = versions_->LogAndApply(&edit); } @@ -460,6 +511,9 @@ void DBImpl::TEST_CompactRange( int level, const std::string& begin, const std::string& end) { + assert(level >= 0); + assert(level + 1 < config::kNumLevels); + MutexLock l(&mutex_); while (manual_compaction_ != NULL) { bg_cv_.Wait(); @@ -934,22 +988,38 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { Status DBImpl::Get(const ReadOptions& options, const Slice& key, std::string* value) { - // TODO(opt): faster implementation - Iterator* iter = NewIterator(options); - iter->Seek(key); - bool found = false; - if (iter->Valid() && user_comparator()->Compare(key, iter->key()) == 0) { - Slice v = iter->value(); - value->assign(v.data(), v.size()); - found = true; - } - // Non-OK iterator status trumps everything else - Status result = iter->status(); - if (result.ok() && !found) { - result = Status::NotFound(Slice()); // Use an empty error message for speed + Status s; + MutexLock l(&mutex_); + SequenceNumber snapshot; + if (options.snapshot != NULL) { + snapshot = reinterpret_cast(options.snapshot)->number_; + } else { + snapshot = versions_->LastSequence(); } - delete iter; - return result; + + // First look in the memtable, then in the immutable memtable (if any). 
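
The WriteLevel0Table() change above places a freshly flushed table in the deepest level whose files it does not overlap, instead of always dropping it into level 0; that is what later removes the trivial move compactions mentioned in the commit message. A simplified standalone model of the placement loop (plain string ranges instead of InternalKeys):

    #include <cstdio>
    #include <string>
    #include <vector>

    struct Range { std::string smallest, largest; };

    static bool Overlaps(const std::vector<Range>& files,
                         const std::string& lo, const std::string& hi) {
      for (size_t i = 0; i < files.size(); i++) {
        if (!(hi < files[i].smallest || files[i].largest < lo)) return true;
      }
      return false;
    }

    // Mirror of the placement loop: start at level 0 and, if level 0 is
    // clear, keep pushing down while the next level is still overlap-free.
    static int PickLevel(const std::vector<std::vector<Range> >& levels,
                         const std::string& lo, const std::string& hi) {
      int level = 0;
      if (!Overlaps(levels[0], lo, hi)) {
        while (level + 1 < (int)levels.size() &&
               !Overlaps(levels[level + 1], lo, hi)) {
          level++;
        }
      }
      return level;
    }

    int main() {
      std::vector<std::vector<Range> > levels(4);
      Range r = { "m", "p" };
      levels[2].push_back(r);                        // only level 2 holds "m".."p"
      printf("%d\n", PickLevel(levels, "f", "z"));   // 1: level 2 overlaps, stop above it
      printf("%d\n", PickLevel(levels, "a", "c"));   // 3: nothing overlaps, falls to the bottom
      return 0;
    }
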
+ LookupKey lkey(key, snapshot); + if (mem_->Get(lkey, value, &s)) { + return s; + } + if (imm_ != NULL && imm_->Get(lkey, value, &s)) { + return s; + } + + // Not in memtable(s); try live files in level order + Version* current = versions_->current(); + current->Ref(); + Version::GetStats stats; + { // Unlock while reading from files + mutex_.Unlock(); + s = current->Get(options, lkey, value, &stats); + mutex_.Lock(); + } + if (current->UpdateStats(stats)) { + MaybeScheduleCompaction(); + } + current->Unref(); + return s; } Iterator* DBImpl::NewIterator(const ReadOptions& options) { @@ -1050,18 +1120,10 @@ Status DBImpl::MakeRoomForWrite(bool force) { if (!s.ok()) { break; } - VersionEdit edit; - edit.SetPrevLogNumber(versions_->LogNumber()); - edit.SetLogNumber(new_log_number); - s = versions_->LogAndApply(&edit); - if (!s.ok()) { - delete lfile; - env_->DeleteFile(LogFileName(dbname_, new_log_number)); - break; - } delete log_; delete logfile_; logfile_ = lfile; + logfile_number_ = new_log_number; log_ = new log::Writer(lfile); imm_ = mem_; has_imm_.Release_Store(imm_); @@ -1183,6 +1245,7 @@ Status DB::Open(const Options& options, const std::string& dbname, if (s.ok()) { edit.SetLogNumber(new_log_number); impl->logfile_ = lfile; + impl->logfile_number_ = new_log_number; impl->log_ = new log::Writer(lfile); s = impl->versions_->LogAndApply(&edit); } diff --git a/db/db_impl.h b/db/db_impl.h index 84ce154..f11ea55 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -85,7 +85,7 @@ class DBImpl : public DB { VersionEdit* edit, SequenceNumber* max_sequence); - Status WriteLevel0Table(MemTable* mem, VersionEdit* edit); + Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base); Status MakeRoomForWrite(bool force /* compact even if there is room? */); @@ -124,6 +124,7 @@ class DBImpl : public DB { MemTable* imm_; // Memtable being compacted port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_ WritableFile* logfile_; + uint64_t logfile_number_; log::Writer* log_; SnapshotList snapshots_; diff --git a/db/db_test.cc b/db/db_test.cc index 42e70cf..d5d60cd 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -21,15 +21,57 @@ static std::string RandomString(Random* rnd, int len) { return r; } +// Special Env used to delay background operations +class SpecialEnv : public EnvWrapper { + public: + // sstable Sync() calls are blocked while this pointer is non-NULL. 
+ port::AtomicPointer delay_sstable_sync_; + + explicit SpecialEnv(Env* base) : EnvWrapper(base) { + delay_sstable_sync_.Release_Store(NULL); + } + + Status NewWritableFile(const std::string& f, WritableFile** r) { + class SSTableFile : public WritableFile { + private: + SpecialEnv* env_; + WritableFile* base_; + + public: + SSTableFile(SpecialEnv* env, WritableFile* base) + : env_(env), + base_(base) { + } + Status Append(const Slice& data) { return base_->Append(data); } + Status Close() { return base_->Close(); } + Status Flush() { return base_->Flush(); } + Status Sync() { + while (env_->delay_sstable_sync_.Acquire_Load() != NULL) { + env_->SleepForMicroseconds(100000); + } + return base_->Sync(); + } + }; + + Status s = target()->NewWritableFile(f, r); + if (s.ok()) { + if (strstr(f.c_str(), ".sst") != NULL) { + *r = new SSTableFile(this, *r); + } + } + return s; + } +}; + class DBTest { public: std::string dbname_; - Env* env_; + SpecialEnv* env_; DB* db_; Options last_options_; - DBTest() : env_(Env::Default()) { + DBTest() : env_(new SpecialEnv(Env::Default())) { dbname_ = test::TmpDir() + "/db_test"; DestroyDB(dbname_, Options()); db_ = NULL; @@ -39,6 +81,7 @@ class DBTest { ~DBTest() { delete db_; DestroyDB(dbname_, Options()); + delete env_; } DBImpl* dbfull() { @@ -142,6 +185,14 @@ class DBTest { return atoi(property.c_str()); } + int TotalTableFiles() { + int result = 0; + for (int level = 0; level < config::kNumLevels; level++) { + result += NumTableFilesAtLevel(level); + } + return result; + } + uint64_t Size(const Slice& start, const Slice& limit) { Range r(start, limit); uint64_t size; @@ -162,6 +213,16 @@ class DBTest { } } + // Prevent pushing of new sstables into deeper levels by adding + // tables that cover a specified range to all levels. + void FillLevels(const std::string& smallest, const std::string& largest) { + for (int level = 0; level < config::kNumLevels; level++) { + Put(smallest, "begin"); + Put(largest, "end"); + dbfull()->TEST_CompactMemTable(); + } + } + void DumpFileCounts(const char* label) { fprintf(stderr, "---\n%s:\n", label); fprintf(stderr, "maxoverlap: %lld\n", @@ -209,6 +270,80 @@ TEST(DBTest, PutDeleteGet) { ASSERT_EQ("NOT_FOUND", Get("foo")); } +TEST(DBTest, GetFromImmutableLayer) { + Options options; + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put("k1", std::string(100000, 'x')); // Fill memtable + Put("k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls +} + +TEST(DBTest, GetFromVersions) { + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("v1", Get("foo")); +} + +TEST(DBTest, GetSnapshot) { + // Try with both a short key and a long key + for (int i = 0; i < 2; i++) { + std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); + ASSERT_OK(Put(key, "v1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Put(key, "v2")); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + db_->ReleaseSnapshot(s1); + } +} + +TEST(DBTest, GetLevel0Ordering) { + // Check that we process level-0 files in correct order. 
The code + // below generates two level-0 files where the earlier one comes + // before the later one in the level-0 file list since the earlier + // one has a smaller "smallest" key. + ASSERT_OK(Put("bar", "b")); + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_CompactMemTable(); + ASSERT_OK(Put("foo", "v2")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("v2", Get("foo")); +} + +TEST(DBTest, GetOrderedByLevels) { + ASSERT_OK(Put("foo", "v1")); + Compact("a", "z"); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("v2", Get("foo")); +} + +TEST(DBTest, GetPicksCorrectFile) { + // Arrange to have multiple files in a non-level-0 level. + ASSERT_OK(Put("a", "va")); + Compact("a", "b"); + ASSERT_OK(Put("x", "vx")); + Compact("x", "y"); + ASSERT_OK(Put("f", "vf")); + Compact("f", "g"); + ASSERT_EQ("va", Get("a")); + ASSERT_EQ("vf", Get("f")); + ASSERT_EQ("vx", Get("x")); +} + TEST(DBTest, IterEmpty) { Iterator* iter = db_->NewIterator(ReadOptions()); @@ -413,6 +548,27 @@ TEST(DBTest, RecoveryWithEmptyLog) { ASSERT_EQ("v3", Get("foo")); } +// Check that writes done during a memtable compaction are recovered +// if the database is shutdown during the memtable compaction. +TEST(DBTest, RecoverDuringMemtableCompaction) { + Options options; + options.env = env_; + options.write_buffer_size = 1000000; + Reopen(&options); + + // Trigger a long memtable compaction and reopen the database during it + ASSERT_OK(Put("foo", "v1")); // Goes to 1st log file + ASSERT_OK(Put("big1", std::string(10000000, 'x'))); // Fills memtable + ASSERT_OK(Put("big2", std::string(1000, 'y'))); // Triggers compaction + ASSERT_OK(Put("bar", "v2")); // Goes to new log file + + Reopen(&options); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ(std::string(10000000, 'x'), Get("big1")); + ASSERT_EQ(std::string(1000, 'y'), Get("big2")); +} + static std::string Key(int i) { char buf[100]; snprintf(buf, sizeof(buf), "key%06d", i); @@ -426,11 +582,11 @@ TEST(DBTest, MinorCompactionsHappen) { const int N = 500; - int starting_num_tables = NumTableFilesAtLevel(0); + int starting_num_tables = TotalTableFiles(); for (int i = 0; i < N; i++) { ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); } - int ending_num_tables = NumTableFilesAtLevel(0); + int ending_num_tables = TotalTableFiles(); ASSERT_GT(ending_num_tables, starting_num_tables); for (int i = 0; i < N; i++) { @@ -499,6 +655,8 @@ TEST(DBTest, SparseMerge) { options.compression = kNoCompression; Reopen(&options); + FillLevels("A", "Z"); + // Suppose there is: // small amount of data with prefix A // large amount of data with prefix B @@ -514,7 +672,8 @@ TEST(DBTest, SparseMerge) { Put(key, value); } Put("C", "vc"); - Compact("", "z"); + dbfull()->TEST_CompactMemTable(); + dbfull()->TEST_CompactRange(0, "A", "Z"); // Make sparse update Put("A", "va2"); @@ -675,6 +834,8 @@ TEST(DBTest, Snapshot) { TEST(DBTest, HiddenValuesAreRemoved) { Random rnd(301); + FillLevels("a", "z"); + std::string big = RandomString(&rnd, 50000); Put("foo", big); Put("pastfoo", "v"); @@ -702,40 +863,54 @@ TEST(DBTest, HiddenValuesAreRemoved) { TEST(DBTest, DeletionMarkers1) { Put("foo", "v1"); ASSERT_OK(dbfull()->TEST_CompactMemTable()); - dbfull()->TEST_CompactRange(0, "", "z"); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file + const int last = config::kNumLevels - 1; + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // 
foo => v1 is now in last level + + // Place a table at level last-1 to prevent merging with preceding mutation + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); + ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); + Delete("foo"); Put("foo", "v2"); ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - dbfull()->TEST_CompactRange(0, "", "z"); + dbfull()->TEST_CompactRange(last-2, "", "z"); // DEL eliminated, but v1 remains because we aren't compacting that level // (DEL can be eliminated because v2 hides v1). ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(1, "", "z"); - // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. - // (as is v1). + dbfull()->TEST_CompactRange(last-1, "", "z"); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); } TEST(DBTest, DeletionMarkers2) { Put("foo", "v1"); ASSERT_OK(dbfull()->TEST_CompactMemTable()); - dbfull()->TEST_CompactRange(0, "", "z"); - dbfull()->TEST_CompactRange(1, "", "z"); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file + const int last = config::kNumLevels - 1; + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level + + // Place a table at level last-1 to prevent merging with preceding mutation + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); + ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); + Delete("foo"); ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(0, "", "z"); - // DEL kept: L2 file overlaps + dbfull()->TEST_CompactRange(last-2, "", "z"); + // DEL kept: "last" file overlaps ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(1, "", "z"); - // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed. - // (as is v1). + dbfull()->TEST_CompactRange(last-1, "", "z"); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). 
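
Both DeletionMarkers tests above pin down the same invariant: a DEL entry may be dropped during compaction only when the output level is the base level for that key, i.e. no deeper level could still hold an older value the marker must keep hidden. A tiny sketch of that predicate; CanDropTombstone is a hypothetical helper, not the real compaction code:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct Range { std::string smallest, largest; };

    // A deletion marker being compacted into 'output_level' can be elided
    // only if no level below the output overlaps the user key.
    static bool CanDropTombstone(const std::vector<std::vector<Range> >& levels,
                                 int output_level, const std::string& user_key) {
      for (size_t lvl = output_level + 1; lvl < levels.size(); lvl++) {
        for (size_t i = 0; i < levels[lvl].size(); i++) {
          if (levels[lvl][i].smallest <= user_key &&
              user_key <= levels[lvl][i].largest) {
            return false;  // an older value may live down there; keep DEL
          }
        }
      }
      return true;
    }

    int main() {
      std::vector<std::vector<Range> > levels(3);
      Range r = { "a", "z" };
      levels[2].push_back(r);                              // old data at the bottom
      printf("%d\n", CanDropTombstone(levels, 1, "foo"));  // 0: level 2 overlaps, keep DEL
      printf("%d\n", CanDropTombstone(levels, 2, "foo"));  // 1: base level, drop it
      return 0;
    }
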
ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); } diff --git a/db/dbformat.cc b/db/dbformat.cc index c12c138..af2e077 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -84,4 +84,23 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const { } } +LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { + size_t usize = user_key.size(); + size_t needed = usize + 13; // A conservative estimate + char* dst; + if (needed <= sizeof(space_)) { + dst = space_; + } else { + dst = new char[needed]; + } + start_ = dst; + dst = EncodeVarint32(dst, usize + 8); + kstart_ = dst; + memcpy(dst, user_key.data(), usize); + dst += usize; + EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); + dst += 8; + end_ = dst; +} + } diff --git a/db/dbformat.h b/db/dbformat.h index 89c4afb..97491bc 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -160,6 +160,46 @@ inline bool ParseInternalKey(const Slice& internal_key, return (c <= static_cast(kTypeValue)); } +// A helper class useful for DBImpl::Get() +class LookupKey { + public: + // Initialize *this for looking up user_key at a snapshot with + // the specified sequence number. + LookupKey(const Slice& user_key, SequenceNumber sequence); + + ~LookupKey(); + + // Return a key suitable for lookup in a MemTable. + Slice memtable_key() const { return Slice(start_, end_ - start_); } + + // Return an internal key (suitable for passing to an internal iterator) + Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } + + // Return the user key + Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } + + private: + // We construct a char array of the form: + // klength varint32 <-- start_ + // userkey char[klength] <-- kstart_ + // tag uint64 + // <-- end_ + // The array is a suitable MemTable key. + // The suffix starting with "userkey" can be used as an InternalKey. + const char* start_; + const char* kstart_; + const char* end_; + char space_[200]; // Avoid allocation for short keys + + // No copying allowed + LookupKey(const LookupKey&); + void operator=(const LookupKey&); +}; + +inline LookupKey::~LookupKey() { + if (start_ != space_) delete[] start_; +} + } #endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/db/memtable.cc b/db/memtable.cc index 687900a..4555abb 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -105,4 +105,41 @@ void MemTable::Add(SequenceNumber s, ValueType type, table_.Insert(buf); } +bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) { + Slice memkey = key.memtable_key(); + Table::Iterator iter(&table_); + iter.Seek(memkey.data()); + if (iter.Valid()) { + // entry format is: + // klength varint32 + // userkey char[klength] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. 
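
A LookupKey packs [varint32 klength | user key | 8-byte tag] into a single buffer so the same bytes can serve as a memtable key, an internal key, or a user key through the three accessors. A standalone illustration of the resulting byte layout for a short key, assuming the tag packing (sequence << 8 | type) and kValueTypeForSeek == 0x1 from dbformat:

    #include <cstdio>
    #include <cstring>
    #include <stdint.h>
    #include <string>

    int main() {
      const std::string user_key = "foo";
      const uint64_t sequence = 5;
      const uint8_t kTypeValue = 0x1;          // kValueTypeForSeek

      char buf[32];
      char* dst = buf;
      *dst++ = (char)(user_key.size() + 8);    // klength as a varint32 (fits in 1 byte here)
      memcpy(dst, user_key.data(), user_key.size());
      dst += user_key.size();
      uint64_t tag = (sequence << 8) | kTypeValue;
      for (int i = 0; i < 8; i++) {            // tag as a little-endian fixed64
        *dst++ = (char)(tag >> (8 * i));
      }

      // memtable_key = the whole buffer; internal_key skips the length byte;
      // user_key additionally drops the trailing 8-byte tag.
      for (char* p = buf; p < dst; p++) printf("%02x ", (unsigned char)*p);
      printf("\n");   // 0b 66 6f 6f 01 05 00 00 00 00 00 00
      return 0;
    }
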
+ const char* entry = iter.key(); + uint32_t key_length; + const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length); + if (comparator_.comparator.user_comparator()->Compare( + Slice(key_ptr, key_length - 8), + key.user_key()) == 0) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + switch (static_cast(tag & 0xff)) { + case kTypeValue: { + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + value->assign(v.data(), v.size()); + return true; + } + case kTypeDeletion: + *s = Status::NotFound(Slice()); + return true; + } + } + } + return false; +} + } diff --git a/db/memtable.h b/db/memtable.h index 2e9bd61..1898b5e 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -57,6 +57,12 @@ class MemTable { const Slice& key, const Slice& value); + // If memtable contains a value for key, store it in *value and return true. + // If memtable contains a deletion for key, store a NotFound() error + // in *status and return true. + // Else, return false. + bool Get(const LookupKey& key, std::string* value, Status* s); + private: ~MemTable(); // Private since only Unref() should be used to delete it diff --git a/db/repair.cc b/db/repair.cc index 4b57169..ae1b136 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -212,14 +212,12 @@ class Repairer { } delete lfile; - // We ignore any version edits generated by the conversion to a Table + // Do not record a version edit for this conversion to a Table // since ExtractMetaData() will also generate edits. - VersionEdit skipped; FileMetaData meta; meta.number = next_file_number_++; Iterator* iter = mem->NewIterator(); - status = BuildTable(dbname_, env_, options_, table_cache_, iter, - &meta, &skipped); + status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); delete iter; mem->Unref(); mem = NULL; diff --git a/db/version_edit.h b/db/version_edit.h index ab874da..a069893 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -16,12 +16,13 @@ class VersionSet; struct FileMetaData { int refs; + int allowed_seeks; // Seeks allowed until compaction uint64_t number; uint64_t file_size; // File size in bytes InternalKey smallest; // Smallest internal key served by table InternalKey largest; // Largest internal key served by table - FileMetaData() : refs(0), file_size(0) { } + FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { } }; class VersionEdit { diff --git a/db/version_set.cc b/db/version_set.cc index f64ac8d..54342e4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -75,6 +75,37 @@ Version::~Version() { } } +int FindFile(const InternalKeyComparator& icmp, + const std::vector& files, + const Slice& key) { + uint32_t left = 0; + uint32_t right = files.size(); + while (left < right) { + uint32_t mid = (left + right) / 2; + const FileMetaData* f = files[mid]; + if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) { + // Key at "mid.largest" is < "target". Therefore all + // files at or before "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "mid.largest" is >= "target". Therefore all files + // after "mid" are uninteresting. + right = mid; + } + } + return right; +} + +bool SomeFileOverlapsRange( + const InternalKeyComparator& icmp, + const std::vector& files, + const InternalKey& smallest, + const InternalKey& largest) { + const int index = FindFile(icmp, files, smallest.Encode()); + return ((index < files.size()) && + icmp.Compare(largest, files[index]->smallest) >= 0); +} + // An internal iterator. 
For a given version/level pair, yields // information about the files in the level. For a given entry, key() // is the largest key that occurs in the file, and value() is an @@ -92,22 +123,7 @@ class Version::LevelFileNumIterator : public Iterator { return index_ < flist_->size(); } virtual void Seek(const Slice& target) { - uint32_t left = 0; - uint32_t right = flist_->size() - 1; - while (left < right) { - uint32_t mid = (left + right) / 2; - int cmp = icmp_.Compare((*flist_)[mid]->largest.Encode(), target); - if (cmp < 0) { - // Key at "mid.largest" is < than "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - index_ = left; + index_ = FindFile(icmp_, *flist_, target); } virtual void SeekToFirst() { index_ = 0; } virtual void SeekToLast() { @@ -185,6 +201,144 @@ void Version::AddIterators(const ReadOptions& options, } } +// If "*iter" points at a value or deletion for user_key, store +// either the value, or a NotFound error and return true. +// Else return false. +static bool GetValue(Iterator* iter, const Slice& user_key, + std::string* value, + Status* s) { + if (!iter->Valid()) { + return false; + } + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter->key(), &parsed_key)) { + *s = Status::Corruption("corrupted key for ", user_key); + return true; + } + if (parsed_key.user_key != user_key) { + return false; + } + switch (parsed_key.type) { + case kTypeDeletion: + *s = Status::NotFound(Slice()); // Use an empty error message for speed + break; + case kTypeValue: { + Slice v = iter->value(); + value->assign(v.data(), v.size()); + break; + } + } + return true; +} + +static bool NewestFirst(FileMetaData* a, FileMetaData* b) { + return a->number > b->number; +} + +Status Version::Get(const ReadOptions& options, + const LookupKey& k, + std::string* value, + GetStats* stats) { + Slice ikey = k.internal_key(); + Slice user_key = k.user_key(); + const Comparator* ucmp = vset_->icmp_.user_comparator(); + Status s; + + stats->seek_file = NULL; + stats->seek_file_level = -1; + FileMetaData* last_file_read = NULL; + + // We can search level-by-level since entries never hop across + // levels. Therefore we are guaranteed that if we find data + // in an smaller level, later levels are irrelevant. + std::vector tmp; + FileMetaData* tmp2; + for (int level = 0; level < config::kNumLevels; level++) { + size_t num_files = files_[level].size(); + if (num_files == 0) continue; + + // Get the list of files to search in this level + FileMetaData* const* files = &files_[level][0]; + if (level == 0) { + // Level-0 files may overlap each other. Find all files that + // overlap user_key and process them in order from newest to oldest. + tmp.reserve(num_files); + for (int i = 0; i < num_files; i++) { + FileMetaData* f = files[i]; + if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 && + ucmp->Compare(user_key, f->largest.user_key()) <= 0) { + tmp.push_back(f); + } + } + if (tmp.empty()) continue; + + std::sort(tmp.begin(), tmp.end(), NewestFirst); + files = &tmp[0]; + num_files = tmp.size(); + } else { + // Binary search to find earliest index whose largest key >= ikey. 
+ uint32_t index = FindFile(vset_->icmp_, files_[level], ikey); + if (index >= num_files) { + files = NULL; + num_files = 0; + } else { + tmp2 = files[index]; + if (ucmp->Compare(user_key, tmp2->smallest.user_key()) < 0) { + // All of "tmp2" is past any data for user_key + files = NULL; + num_files = 0; + } else { + files = &tmp2; + num_files = 1; + } + } + } + + for (int i = 0; i < num_files; ++i) { + if (last_file_read != NULL && stats->seek_file == NULL) { + // We have had more than one seek for this read. Charge the 1st file. + stats->seek_file = last_file_read; + stats->seek_file_level = (i == 0 ? level - 1 : level); + } + + FileMetaData* f = files[i]; + last_file_read = f; + + Iterator* iter = vset_->table_cache_->NewIterator( + options, + f->number, + f->file_size); + iter->Seek(ikey); + const bool done = GetValue(iter, user_key, value, &s); + if (!iter->status().ok()) { + s = iter->status(); + delete iter; + return s; + } else { + delete iter; + if (done) { + return s; + } + } + } + } + + return Status::NotFound(Slice()); // Use an empty error message for speed +} + +bool Version::UpdateStats(const GetStats& stats) { + FileMetaData* f = stats.seek_file; + if (f != NULL) { + f->allowed_seeks--; + if (f->allowed_seeks <= 0 && file_to_compact_ == NULL) { + file_to_compact_ = f; + file_to_compact_level_ = stats.seek_file_level; + return true; + } + } + return false; +} + void Version::Ref() { ++refs_; } @@ -198,13 +352,22 @@ void Version::Unref() { } } +bool Version::OverlapInLevel(int level, + const InternalKey& smallest, + const InternalKey& largest) { + return SomeFileOverlapsRange(vset_->icmp_, files_[level], smallest, largest); +} + std::string Version::DebugString() const { std::string r; for (int level = 0; level < config::kNumLevels; level++) { - // E.g., level 1: 17:123['a' .. 'd'] 20:43['e' .. 'g'] - r.append("level "); + // E.g., + // --- level 1 --- + // 17:123['a' .. 'd'] + // 20:43['e' .. 'g'] + r.append("--- level "); AppendNumberTo(&r, level); - r.push_back(':'); + r.append(" ---\n"); const std::vector& files = files_[level]; for (size_t i = 0; i < files.size(); i++) { r.push_back(' '); @@ -215,9 +378,8 @@ std::string Version::DebugString() const { AppendEscapedStringTo(&r, files[i]->smallest.Encode()); r.append("' .. '"); AppendEscapedStringTo(&r, files[i]->largest.Encode()); - r.append("']"); + r.append("']\n"); } - r.push_back('\n'); } return r; } @@ -305,6 +467,23 @@ class VersionSet::Builder { const int level = edit->new_files_[i].first; FileMetaData* f = new FileMetaData(edit->new_files_[i].second); f->refs = 1; + + // We arrange to automatically compact this file after + // a certain number of seeks. Let's assume: + // (1) One seek costs 10ms + // (2) Writing or reading 1MB costs 10ms (100MB/s) + // (3) A compaction of 1MB does 25MB of IO: + // 1MB read from this level + // 10-12MB read from next level (boundaries may be misaligned) + // 10-12MB written to next level + // This implies that 25 seeks cost the same as the compaction + // of 1MB of data. I.e., one seek costs approximately the + // same as the compaction of 40KB of data. We are a little + // conservative and allow approximately one seek for every 16KB + // of data before triggering a compaction. 
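
Checking the arithmetic in the comment above: compacting 1MB at 25MB of IO and 10ms per MB costs 250ms, the same as 25 seeks, so one seek is worth roughly 40KB of compaction work; scaling down for safety yields the 16KB divisor in the assignment that follows. A quick standalone restatement:

    #include <cstdio>
    #include <stdint.h>

    static int AllowedSeeks(uint64_t file_size) {
      int allowed = (int)(file_size / 16384);   // one seek per 16KB of file
      return (allowed < 100) ? 100 : allowed;   // floor so tiny files are not victimized
    }

    int main() {
      // 25 seeks (10ms each) cost as much as a 1MB compaction doing 25MB of IO,
      // i.e. ~40KB of data per seek; 16KB is the conservative choice.
      printf("%d\n", AllowedSeeks(2 * 1048576));  // 2MB file: 128 seeks allowed
      printf("%d\n", AllowedSeeks(100 * 1024));   // small file: clamped to 100
      return 0;
    }
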
+ f->allowed_seeks = (f->file_size / 16384); + if (f->allowed_seeks < 100) f->allowed_seeks = 100; + levels_[level].deleted_files.erase(f->number); levels_[level].added_files->insert(f); } @@ -363,8 +542,14 @@ class VersionSet::Builder { if (levels_[level].deleted_files.count(f->number) > 0) { // File is deleted: do nothing } else { + std::vector* files = &v->files_[level]; + if (level > 0 && !files->empty()) { + // Must not overlap + assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest, + f->smallest) < 0); + } f->refs++; - v->files_[level].push_back(f); + files->push_back(f); } } }; @@ -749,7 +934,7 @@ int64_t VersionSet::NumLevelBytes(int level) const { int64_t VersionSet::MaxNextLevelOverlappingBytes() { int64_t result = 0; std::vector overlaps; - for (int level = 0; level < config::kNumLevels - 1; level++) { + for (int level = 1; level < config::kNumLevels - 1; level++) { for (size_t i = 0; i < current_->files_[level].size(); i++) { const FileMetaData* f = current_->files_[level][i]; GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); @@ -854,31 +1039,43 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } Compaction* VersionSet::PickCompaction() { - if (!NeedsCompaction()) { + Compaction* c; + int level; + + // We prefer compactions triggered by too much data in a level over + // the compactions triggered by seeks. + const bool size_compaction = (current_->compaction_score_ >= 1); + const bool seek_compaction = (current_->file_to_compact_ != NULL); + if (size_compaction) { + level = current_->compaction_level_; + assert(level >= 0); + assert(level+1 < config::kNumLevels); + c = new Compaction(level); + + // Pick the first file that comes after compact_pointer_[level] + for (size_t i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (compact_pointer_[level].empty() || + icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { + c->inputs_[0].push_back(f); + break; + } + } + if (c->inputs_[0].empty()) { + // Wrap-around to the beginning of the key space + c->inputs_[0].push_back(current_->files_[level][0]); + } + } else if (seek_compaction) { + level = current_->file_to_compact_level_; + c = new Compaction(level); + c->inputs_[0].push_back(current_->file_to_compact_); + } else { return NULL; } - const int level = current_->compaction_level_; - assert(level >= 0); - assert(level+1 < config::kNumLevels); - Compaction* c = new Compaction(level); c->input_version_ = current_; c->input_version_->Ref(); - // Pick the first file that comes after compact_pointer_[level] - for (size_t i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (compact_pointer_[level].empty() || - icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { - c->inputs_[0].push_back(f); - break; - } - } - if (c->inputs_[0].empty()) { - // Wrap-around to the beginning of the key space - c->inputs_[0].push_back(current_->files_[level][0]); - } - // Files in level 0 may overlap each other, so pick up all overlapping ones if (level == 0) { InternalKey smallest, largest; diff --git a/db/version_set.h b/db/version_set.h index 2bac5e2..f00c35a 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -35,6 +35,21 @@ class Version; class VersionSet; class WritableFile; +// Return the smallest index i such that files[i]->largest >= key. +// Return files.size() if there is no such file. +// REQUIRES: "files" contains a sorted list of non-overlapping files. 
+extern int FindFile(const InternalKeyComparator& icmp, + const std::vector& files, + const Slice& key); + +// Returns true iff some file in "files" overlaps some part of +// [smallest,largest]. +extern bool SomeFileOverlapsRange( + const InternalKeyComparator& icmp, + const std::vector& files, + const InternalKey& smallest, + const InternalKey& largest); + class Version { public: // Append to *iters a sequence of iterators that will @@ -42,11 +57,34 @@ class Version { // REQUIRES: This version has been saved (see VersionSet::SaveTo) void AddIterators(const ReadOptions&, std::vector* iters); + // Lookup the value for key. If found, store it in *val and + // return OK. Else return a non-OK status. Fills *stats. + // REQUIRES: lock is not held + struct GetStats { + FileMetaData* seek_file; + int seek_file_level; + }; + Status Get(const ReadOptions&, const LookupKey& key, std::string* val, + GetStats* stats); + + // Adds "stats" into the current state. Returns true if a new + // compaction may need to be triggered, false otherwise. + // REQUIRES: lock is held + bool UpdateStats(const GetStats& stats); + // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); void Unref(); + // Returns true iff some file in the specified level overlaps + // some part of [smallest,largest]. + bool OverlapInLevel(int level, + const InternalKey& smallest, + const InternalKey& largest); + + int NumFiles(int level) const { return files_[level].size(); } + // Return a human readable string that describes this version's contents. std::string DebugString() const; @@ -65,6 +103,10 @@ class Version { // List of files per level std::vector files_[config::kNumLevels]; + // Next file to compact based on seek stats. + FileMetaData* file_to_compact_; + int file_to_compact_level_; + // Level that should be compacted next and its compaction score. // Score < 1 means compaction is not strictly needed. These fields // are initialized by Finalize(). @@ -73,6 +115,8 @@ class Version { explicit Version(VersionSet* vset) : vset_(vset), next_(this), prev_(this), refs_(0), + file_to_compact_(NULL), + file_to_compact_level_(-1), compaction_score_(-1), compaction_level_(-1) { } @@ -158,7 +202,10 @@ class VersionSet { Iterator* MakeInputIterator(Compaction* c); // Returns true iff some level needs a compaction. - bool NeedsCompaction() const { return current_->compaction_score_ >= 1; } + bool NeedsCompaction() const { + Version* v = current_; + return (v->compaction_score_ >= 1) || (v->file_to_compact_ != NULL); + } // Add all files listed in any live version to *live. // May also mutate some internal state. 
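
FindFile() is the unified binary search referred to in the commit message: an upper-bound search over a sorted, non-overlapping file list that returns the first file whose largest key is >= the target, or files.size() when the key is past all data (the fix for needlessly searching the last file). A minimal standalone version over plain string ranges:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct F { std::string smallest, largest; };

    // Smallest index i with files[i].largest >= key; files.size() if none.
    static size_t FindFile(const std::vector<F>& files, const std::string& key) {
      size_t left = 0, right = files.size();
      while (left < right) {
        size_t mid = (left + right) / 2;
        if (files[mid].largest < key) left = mid + 1;  // mid and everything before it are too small
        else right = mid;                              // mid could still be the answer
      }
      return right;
    }

    int main() {
      std::vector<F> files;
      F a = { "150", "200" }, b = { "300", "350" };
      files.push_back(a); files.push_back(b);
      printf("%zu\n", FindFile(files, "100"));  // 0: lands before the first file
      printf("%zu\n", FindFile(files, "250"));  // 1: between the two files
      printf("%zu\n", FindFile(files, "400"));  // 2: past all data -> files.size()
      return 0;
    }
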
diff --git a/port/port_posix.h b/port/port_posix.h index c158db1..d0b0615 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -9,6 +9,9 @@ #include #include +#ifdef SNAPPY +#include +#endif #include #include #include @@ -72,15 +75,30 @@ class AtomicPointer { } }; -// TODO(gabor): Implement actual compress inline bool Snappy_Compress(const char* input, size_t input_length, std::string* output) { +#ifdef SNAPPY + output->resize(snappy::MaxCompressedLength(input_length)); + size_t outlen; + snappy::RawCompress(input, input_length, &(*output)[0], &outlen); + output->resize(outlen); + return true; +#endif + return false; } -// TODO(gabor): Implement actual uncompress inline bool Snappy_Uncompress(const char* input_data, size_t input_length, std::string* output) { +#ifdef SNAPPY + size_t ulength; + if (!snappy::GetUncompressedLength(input_data, ulength, &ulength)) { + return false; + } + output->resize(ulength); + return snappy::RawUncompress(input_data, input_length, &(*output)[0]); +#endif + return false; } diff --git a/table/table_test.cc b/table/table_test.cc index cf2bae0..10d08fc 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -727,11 +727,15 @@ TEST(Harness, RandomizedLongDB) { Test(&rnd); // We must have created enough data to force merging - std::string l0_files, l1_files; - ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level0", &l0_files)); - ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level1", &l1_files)); - ASSERT_GT(atoi(l0_files.c_str()) + atoi(l1_files.c_str()), 0); - + int files = 0; + for (int level = 0; level < config::kNumLevels; level++) { + std::string value; + char name[100]; + snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); + ASSERT_TRUE(db()->GetProperty(name, &value)); + files += atoi(value.c_str()); + } + ASSERT_GT(files, 0); } class MemTableTest { }; -- cgit v1.2.3 From 70b7ec1b3b394d7b79291edf4928d1094b6a760d Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Wed, 22 Jun 2011 18:45:39 +0000 Subject: Fixing issue 11: version_set_test.cc was missing git-svn-id: http://leveldb.googlecode.com/svn/trunk@33 62dab493-f737-651d-591e-8d6aee1b9529 --- db/version_set_test.cc | 115 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 db/version_set_test.cc diff --git a/db/version_set_test.cc b/db/version_set_test.cc new file mode 100644 index 0000000..eae2a80 --- /dev/null +++ b/db/version_set_test.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +class FindFileTest { + public: + std::vector<FileMetaData*> files_; + + ~FindFileTest() { + for (int i = 0; i < files_.size(); i++) { + delete files_[i]; + } + } + + void Add(const char* smallest, const char* largest) { + FileMetaData* f = new FileMetaData; + f->number = files_.size() + 1; + f->smallest = InternalKey(smallest, 100, kTypeValue); + f->largest = InternalKey(largest, 100, kTypeValue); + files_.push_back(f); + } + + int Find(const char* key) { + InternalKey target(key, 100, kTypeValue); + InternalKeyComparator cmp(BytewiseComparator()); + return FindFile(cmp, files_, target.Encode()); + } + + bool Overlaps(const char* smallest, const char* largest) { + InternalKey s(smallest, 100, kTypeValue); + InternalKey l(largest, 100, kTypeValue); + InternalKeyComparator cmp(BytewiseComparator()); + return SomeFileOverlapsRange(cmp, files_, s, l); + } +}; + +TEST(FindFileTest, Empty) { + ASSERT_EQ(0, Find("foo")); + ASSERT_TRUE(! Overlaps("a", "z")); +} + +TEST(FindFileTest, Single) { + Add("p", "q"); + ASSERT_EQ(0, Find("a")); + ASSERT_EQ(0, Find("p")); + ASSERT_EQ(0, Find("p1")); + ASSERT_EQ(0, Find("q")); + ASSERT_EQ(1, Find("q1")); + ASSERT_EQ(1, Find("z")); + + ASSERT_TRUE(! Overlaps("a", "b")); + ASSERT_TRUE(! Overlaps("z1", "z2")); + ASSERT_TRUE(Overlaps("a", "p")); + ASSERT_TRUE(Overlaps("a", "q")); + ASSERT_TRUE(Overlaps("a", "z")); + ASSERT_TRUE(Overlaps("p", "p1")); + ASSERT_TRUE(Overlaps("p", "q")); + ASSERT_TRUE(Overlaps("p", "z")); + ASSERT_TRUE(Overlaps("p1", "p2")); + ASSERT_TRUE(Overlaps("p1", "z")); + ASSERT_TRUE(Overlaps("q", "q")); + ASSERT_TRUE(Overlaps("q", "q1")); +} + + +TEST(FindFileTest, Multiple) { + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_EQ(0, Find("100")); + ASSERT_EQ(0, Find("150")); + ASSERT_EQ(0, Find("151")); + ASSERT_EQ(0, Find("199")); + ASSERT_EQ(0, Find("200")); + ASSERT_EQ(1, Find("201")); + ASSERT_EQ(1, Find("249")); + ASSERT_EQ(1, Find("250")); + ASSERT_EQ(2, Find("251")); + ASSERT_EQ(2, Find("299")); + ASSERT_EQ(2, Find("300")); + ASSERT_EQ(2, Find("349")); + ASSERT_EQ(2, Find("350")); + ASSERT_EQ(3, Find("351")); + ASSERT_EQ(3, Find("400")); + ASSERT_EQ(3, Find("450")); + ASSERT_EQ(4, Find("451")); + + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! Overlaps("251", "299")); + ASSERT_TRUE(! Overlaps("451", "500")); + ASSERT_TRUE(! Overlaps("351", "399")); + + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); +} + +} + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} -- cgit v1.2.3

From 9cb7b73e2668626608a4a8706d69d3e7000cebfc Mon Sep 17 00:00:00 2001
From: "gabor@google.com"
Date: Wed, 29 Jun 2011 00:30:50 +0000
Subject: Platform detection during build, plus compatibility patches for machines without <cstdatomic>.

This revision adds two major changes:

1. build_detect_platform, which generates build_config.mk with platform-dependent flags for the build process
2.
/port/atomic_pointer.h with an AtomicPointer implementation for platforms without <cstdatomic>

Some of this code is loosely based on patches submitted to the LevelDB mailing list at https://groups.google.com/forum/#!forum/leveldb Tip of the hat to Dave Smith and Edouard A, who both sent patches.

The presence of Snappy (http://code.google.com/p/snappy/) and cstdatomic are now both detected in the build_detect_platform script (1.), which gets executed during make. For (2.), instead of broadly importing atomicops_* from Chromium or the Google performance tools, we chose to just implement AtomicPointer and the limited atomic load and store operations it needs. This resulted in much less code and fewer files - everything is contained in atomic_pointer.h.

git-svn-id: http://leveldb.googlecode.com/svn/trunk@34 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 55 ++++++------- build_detect_platform | 69 ++++++++++++++++ port/atomic_pointer.h | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++ port/port.h | 2 - port/port_posix.h | 63 ++++++++------- util/cache.cc | 153 ++++++++++++++++++++++-------------- 6 files changed, 431 insertions(+), 124 deletions(-) create mode 100644 build_detect_platform create mode 100644 port/atomic_pointer.h diff --git a/Makefile b/Makefile index 84f77ab..0537762 100644 --- a/Makefile +++ b/Makefile @@ -13,37 +13,32 @@ OPT = -O2 -DNDEBUG # (A) Production use (optimized mode) # OPT = -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols #----------------------------------------------- +# detect what platform we're building on +$(shell sh ./build_detect_platform) +# this file is generated by build_detect_platform to set build flags +include build_config.mk -UNAME := $(shell uname) - -ifeq ($(UNAME), Darwin) -# To build for iOS, set PLATFORM=IOS. -ifndef PLATFORM -PLATFORM=OSX -endif # PLATFORM -PLATFORM_CFLAGS = -DLEVELDB_PLATFORM_OSX -PORT_MODULE = port_osx.o -else # UNAME -PLATFORM_CFLAGS = -DLEVELDB_PLATFORM_POSIX -std=c++0x -PORT_MODULE = port_posix.o -endif # UNAME - -# Set 'SNAPPY' to 1 if you have the Snappy compression library -# installed and want to enable its use in LevelDB +# If Snappy is installed, add compilation and linker flags # (see http://code.google.com/p/snappy/) -SNAPPY=0 - -ifeq ($(SNAPPY), 0) +ifeq ($(SNAPPY), 1) +SNAPPY_CFLAGS=-DSNAPPY +SNAPPY_LDFLAGS=-lsnappy +else SNAPPY_CFLAGS= SNAPPY_LDFLAGS= +endif + +# If Google Perf Tools are installed, add compilation and linker flags +# (see http://code.google.com/p/google-perftools/) +ifeq ($(GOOGLE_PERFTOOLS), 1) +GOOGLE_PERFTOOLS_LDFLAGS=-ltcmalloc else -SNAPPY_CFLAGS=-DSNAPPY -SNAPPY_LDFLAGS=-lsnappy endif -CFLAGS = -c -I. -I./include $(PLATFORM_CFLAGS) $(OPT) $(SNAPPY_CFLAGS) +CFLAGS = -c -I. -I./include $(PORT_CFLAGS) $(PLATFORM_CCFLAGS) $(OPT) $(SNAPPY_CFLAGS) -LDFLAGS=-lpthread $(SNAPPY_LDFLAGS) +LDFLAGS=$(PLATFORM_LDFLAGS) $(SNAPPY_LDFLAGS) $(GOOGLE_PERFTOOLS_LDFLAGS) LIBOBJECTS = \ ./db/builder.o \ @@ -59,7 +54,7 @@ LIBOBJECTS = \ ./db/version_edit.o \ ./db/version_set.o \ ./db/write_batch.o \ - ./port/$(PORT_MODULE) \ + ./port/port_posix.o \ ./table/block.o \ ./table/block_builder.o \ ./table/format.o \ @@ -105,19 +100,15 @@ PROGRAMS = db_bench $(TESTS) LIBRARY = libleveldb.a -ifeq ($(PLATFORM), IOS) -# Only XCode can build executable applications for iOS.
all: $(LIBRARY) -else -all: $(PROGRAMS) $(LIBRARY) -endif -check: $(TESTS) +check: $(PROGRAMS) $(TESTS) for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done clean: -rm -f $(PROGRAMS) $(LIBRARY) */*.o ios-x86/*/*.o ios-arm/*/*.o - -rmdir -p ios-x86/* ios-arm/* + -rm -rf ios-x86/* ios-arm/* + -rm build_config.mk $(LIBRARY): $(LIBOBJECTS) rm -f $@ @@ -188,5 +179,3 @@ else $(CC) $(CFLAGS) $< -o $@ endif -# TODO(gabor): dependencies for .o files -# TODO(gabor): Build library diff --git a/build_detect_platform b/build_detect_platform new file mode 100644 index 0000000..f23068a --- /dev/null +++ b/build_detect_platform @@ -0,0 +1,69 @@ +#!/bin/sh + +# Detects OS we're compiling on and generates build_config.mk, +# which in turn gets read while processing Makefile. + +# build_config.mk will set the following variables: +# - PORT_CFLAGS will either set: +# -DLEVELDB_PLATFORM_POSIX if cstdatomic is present +# -DLEVELDB_PLATFORM_NOATOMIC if it is not +# - PLATFORM_CFLAGS with compiler flags for the platform +# - PLATFORM_LDFLAGS with linker flags for the platform + +# Delete existing build_config.mk +rm -f build_config.mk + +# Detect OS +case `uname -s` in + Darwin) + PLATFORM=OS_MACOSX + echo "PLATFORM_CFLAGS=-pthread -DOS_MACOSX" >> build_config.mk + echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk + ;; + Linux) + PLATFORM=OS_LINUX + echo "PLATFORM_CFLAGS=-pthread -DOS_LINUX" >> build_config.mk + echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk + ;; + SunOS) + PLATFORM=OS_SOLARIS + echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_SOLARIS" >> build_config.mk + echo "PLATFORM_LDFLAGS=-lpthread -lrt" >> build_config.mk + ;; + *) + echo "Unknown platform!" + exit 1 esac + +echo "PLATFORM=$PLATFORM" >> build_config.mk + +# On GCC, use libc's memcmp, not GCC's memcmp +PORT_CFLAGS="-fno-builtin-memcmp" + +# Detect C++0x -- this determines whether we'll use port_noatomic.h +# or port_posix.h by: +# 1. Trying to compile with -std=c++0x and including <cstdatomic>. +# 2. If g++ returns error code, we know to use port_posix.h +g++ $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <cstdatomic> + int main() {} +EOF +if [ "$?" = 0 ]; then + PORT_CFLAGS+=" -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT -std=c++0x" +else + PORT_CFLAGS+=" -DLEVELDB_PLATFORM_POSIX" +fi + +# Test whether Snappy library is installed +# http://code.google.com/p/snappy/ +g++ $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <snappy.h> + int main() {} +EOF +if [ "$?" = 0 ]; then + echo "SNAPPY=1" >> build_config.mk +else + echo "SNAPPY=0" >> build_config.mk +fi + +echo "PORT_CFLAGS=$PORT_CFLAGS" >> build_config.mk diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h new file mode 100644 index 0000000..3bae007 --- /dev/null +++ b/port/atomic_pointer.h @@ -0,0 +1,213 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// AtomicPointer provides storage for a lock-free pointer.
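
Before the platform cases below, it may help to see the contract the file implements: Release_Store publishes a pointer such that any thread that Acquire_Loads it also sees every write made before the store. A standalone sketch of that pairing, using C++11 std::atomic and std::thread as stand-ins for the port layer:

    #include <atomic>
    #include <cstdio>
    #include <thread>

    // Same contract as port::AtomicPointer: the release store publishes the
    // pointer, and an acquire load of it makes the earlier plain write to
    // 'payload' visible to the reader.
    static std::atomic<void*> ptr(nullptr);
    static int payload = 0;

    int main() {
      std::thread writer([] {
        payload = 42;                                    // plain write...
        ptr.store(&payload, std::memory_order_release);  // ...published by the store
      });
      void* p;
      while ((p = ptr.load(std::memory_order_acquire)) == nullptr) { }
      printf("%d\n", *static_cast<int*>(p));             // guaranteed to print 42
      writer.join();
      return 0;
    }
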
+// Platform-dependent implementation of AtomicPointer: +// - If cstdatomic is present (on newer versions of gcc, it is), we use +// a cstdatomic-based AtomicPointer +// - If it is not, we define processor-dependent AtomicWord operations, +// and then use them to build AtomicPointer +// +// This code is based on atomicops-internals-* in Google's perftools: +// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase + +#ifndef PORT_ATOMIC_POINTER_H_ +#define PORT_ATOMIC_POINTER_H_ + +#ifdef LEVELDB_CSTDATOMIC_PRESENT + +/////////////////////////////////////////////////////////////////////////////// +// WE HAVE <cstdatomic> +// Use a <cstdatomic>-based AtomicPointer + +#include <cstdatomic> +#include <stdint.h> + +namespace leveldb { +namespace port { + +// Storage for a lock-free pointer +class AtomicPointer { + private: + std::atomic<void*> rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* v) : rep_(v) { } + inline void* Acquire_Load() const { + return rep_.load(std::memory_order_acquire); + } + inline void Release_Store(void* v) { + rep_.store(v, std::memory_order_release); + } + inline void* NoBarrier_Load() const { + return rep_.load(std::memory_order_relaxed); + } + inline void NoBarrier_Store(void* v) { + rep_.store(v, std::memory_order_relaxed); + } +}; + +} // namespace leveldb::port +} // namespace leveldb + +#else +/////////////////////////////////////////////////////////////////////////////// +// NO <cstdatomic> +// The entire rest of this file covers that case + +#if defined(_M_X64) || defined(__x86_64__) +#define ARCH_CPU_X86_FAMILY 1 +#elif defined(_M_IX86) || defined(__i386__) || defined(__i386) +#define ARCH_CPU_X86_FAMILY 1 +#elif defined(__ARMEL__) +#define ARCH_CPU_ARM_FAMILY 1 +#else +#warning Please add support for your architecture in atomicpointer.h +#endif + +namespace leveldb { +namespace port { +namespace internal { + +// AtomicWord is a machine-sized pointer. +typedef intptr_t AtomicWord; + +} // namespace leveldb::port::internal +} // namespace leveldb::port +} // namespace leveldb + +// Include our platform specific implementation. +/////////////////////////////////////////////////////////////////////////////// +// Windows on x86 +#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY) + +// void MemoryBarrier(void) macro is defined in windows.h: +// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx +// Including windows.h here; MemoryBarrier() gets used below. +#include <windows.h> + +/////////////////////////////////////////////////////////////////////////////// +// Mac OS on x86 +#elif defined(OS_MACOSX) && defined(ARCH_CPU_X86_FAMILY) + +#include <libkern/OSAtomic.h> + +namespace leveldb { +namespace port { +namespace internal { + +inline void MemoryBarrier() { +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on + // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
+ __asm__ __volatile__("" : : : "memory"); +#else + OSMemoryBarrier(); +#endif +} + +} // namespace leveldb::port::internal +} // namespace leveldb::port +} // namespace leveldb + +/////////////////////////////////////////////////////////////////////////////// +// Any x86 CPU +#elif defined(ARCH_CPU_X86_FAMILY) + +namespace leveldb { +namespace port { +namespace internal { + +inline void MemoryBarrier() { + __asm__ __volatile__("" : : : "memory"); +} + +} // namespace leveldb::port::internal +} // namespace leveldb::port +} // namespace leveldb + +#undef ATOMICOPS_COMPILER_BARRIER + +/////////////////////////////////////////////////////////////////////////////// +// ARM +#elif defined(ARCH_CPU_ARM_FAMILY) + +namespace leveldb { +namespace port { +namespace internal { + +typedef void (*LinuxKernelMemoryBarrierFunc)(void); +LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier __attribute__((weak)) = + (LinuxKernelMemoryBarrierFunc) 0xffff0fa0; + +inline void MemoryBarrier() { + pLinuxKernelMemoryBarrier(); +} + +} // namespace leveldb::port::internal +} // namespace leveldb::port +} // namespace leveldb + +#else +#error "Atomic operations are not supported on your platform" +#endif + +/////////////////////////////////////////////////////////////////////////////// +// Implementation of AtomicPointer based on MemoryBarriers above + +namespace leveldb { +namespace port { +namespace internal { + +// Atomic operations using per-system MemoryBarrier()s + +inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) { + AtomicWord value = *ptr; + MemoryBarrier(); + return value; +} + +inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) { + MemoryBarrier(); + *ptr = value; +} + +inline AtomicWord NoBarrier_Load(volatile const AtomicWord* ptr) { + return *ptr; +} + +inline void NoBarrier_Store(volatile AtomicWord* ptr, AtomicWord value) { + *ptr = value; +} + +} // namespace leveldb::port::internal + +// AtomicPointer definition for systems without <cstdatomic>.
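Note the asymmetry in the primitives above before the class that uses them: the fence comes after the load in Acquire_Load but before the store in Release_Store. That pairing is what makes pointer publication safe. A runnable sketch of the same pattern (assumes gcc on x86, where a compiler-only fence suffices because the hardware preserves load and store order; all names here are illustrative, not part of the patch):

  #include <pthread.h>
  #include <stdio.h>

  static int data = 0;
  static volatile int ready = 0;   // stands in for the AtomicWord slot

  // Compiler-only fence, matching the x86 MemoryBarrier() above.
  inline void CompilerBarrier() { __asm__ __volatile__("" : : : "memory"); }

  void* Producer(void*) {
    data = 42;          // payload write...
    CompilerBarrier();  // ...fenced BEFORE the flag store (release side)
    ready = 1;
    return NULL;
  }

  void* Consumer(void*) {
    while (ready == 0) { }  // spin until the flag is published
    CompilerBarrier();      // fence AFTER the flag load (acquire side)
    printf("data = %d\n", data);  // prints 42 given the barrier pairing
    return NULL;
  }

  int main() {
    pthread_t a, b;
    pthread_create(&a, NULL, Producer, NULL);
    pthread_create(&b, NULL, Consumer, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
  }

If either fence moved to the other side of its memory access, the consumer could observe ready == 1 while still reading a stale data.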
+class AtomicPointer { + private: + typedef internal::AtomicWord Rep; + Rep rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* p) : rep_(reinterpret_cast<Rep>(p)) {} + inline void* Acquire_Load() const { + return reinterpret_cast<void*>(internal::Acquire_Load(&rep_)); + } + inline void Release_Store(void* v) { + internal::Release_Store(&rep_, reinterpret_cast<Rep>(v)); + } + inline void* NoBarrier_Load() const { + return reinterpret_cast<void*>(internal::NoBarrier_Load(&rep_)); + } + inline void NoBarrier_Store(void* v) { + internal::NoBarrier_Store(&rep_, reinterpret_cast<Rep>(v)); + } +}; + +} // namespace leveldb::port +} // namespace leveldb + +#endif // LEVELDB_CSTDATOMIC_PRESENT + +#endif // PORT_ATOMIC_POINTER_H_ diff --git a/port/port.h b/port/port.h index e35db23..816826b 100644 --- a/port/port.h +++ b/port/port.h @@ -16,8 +16,6 @@ # include "port/port_chromium.h" #elif defined(LEVELDB_PLATFORM_ANDROID) # include "port/port_android.h" -#elif defined(LEVELDB_PLATFORM_OSX) -# include "port/port_osx.h" #endif #endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/port/port_posix.h b/port/port_posix.h index d0b0615..3f329f0 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -7,20 +7,46 @@ #ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ #define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ -#include <endian.h> +#if defined(OS_MACOSX) + #include <machine/endian.h> +#elif defined(OS_SOLARIS) + #include <sys/isa_defs.h> + #ifdef _LITTLE_ENDIAN + #define LITTLE_ENDIAN + #else + #define BIG_ENDIAN + #endif +#else + #include <endian.h> +#endif #include <pthread.h> #ifdef SNAPPY #include <snappy.h> #endif #include <stdint.h> #include <string> -#include <cstdatomic> -#include <cstring> +#include "port/atomic_pointer.h" + +#ifdef LITTLE_ENDIAN +#define IS_LITTLE_ENDIAN true +#else +#define IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) +#endif + +#if defined(OS_MACOSX) || defined(OS_SOLARIS) +#define fread_unlocked fread +#define fwrite_unlocked fwrite +#define fflush_unlocked fflush +#endif + +#if defined(OS_MACOSX) +#define fdatasync fsync +#endif namespace leveldb { namespace port { -static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN); +static const bool kLittleEndian = IS_LITTLE_ENDIAN; class CondVar; @@ -54,29 +80,8 @@ class CondVar { Mutex* mu_; }; -// Storage for a lock-free pointer -class AtomicPointer { - private: - std::atomic<void*> rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - return rep_.load(std::memory_order_acquire); - } - inline void Release_Store(void* v) { - rep_.store(v, std::memory_order_release); - } - inline void* NoBarrier_Load() const { - return rep_.load(std::memory_order_relaxed); - } - inline void NoBarrier_Store(void* v) { - rep_.store(v, std::memory_order_relaxed); - } -}; - inline bool Snappy_Compress(const char* input, size_t input_length, - std::string* output) { + ::std::string* output) { #ifdef SNAPPY output->resize(snappy::MaxCompressedLength(input_length)); size_t outlen; @@ -89,7 +94,7 @@ inline bool Snappy_Compress(const char* input, size_t input_length, } inline bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output) { + ::std::string* output) { #ifdef SNAPPY size_t ulength; if (!snappy::GetUncompressedLength(input_data, ulength, &ulength)) { @@ -106,7 +111,7 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { return false; } -} -} +} // namespace port +} // namespace leveldb #endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ diff --git a/util/cache.cc b/util/cache.cc index 968e6a0..5829b79 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -2,17 +2,9
@@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) -#include <unordered_set> -#elif defined(LEVELDB_PLATFORM_OSX) -#include <ext/hash_set> -#elif defined(LEVELDB_PLATFORM_CHROMIUM) -#include "base/hash_tables.h" -#else -#include <hash_set> // TODO(sanjay): Switch to unordered_set when possible. -#endif - #include <assert.h> +#include <stdio.h> +#include <stdlib.h> #include "leveldb/cache.h" #include "port/port.h" @@ -33,6 +25,7 @@ namespace { struct LRUHandle { void* value; void (*deleter)(const Slice&, void* value); + LRUHandle* next_hash; LRUHandle* next; LRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? @@ -51,43 +44,93 @@ struct LRUHandle { } }; -// Pick a platform specific hash_set instantiation -#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN) - // Microsoft's hash_set deviates from the standard. See - // http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx - // for details. Basically the 2 param () operator is a less than and - // the 1 param () operator is a hash function. - struct HandleHashCompare : public stdext::hash_compare<LRUHandle*> { - size_t operator() (LRUHandle* h) const { - Slice k = h->key(); - return Hash(k.data(), k.size(), 0); +// We provide our own simple hash table since it removes a whole bunch +// of porting hacks and is also faster than some of the built-in hash +// table implementations in some of the compiler/runtime combinations +// we have tested. E.g., readrandom speeds up by ~5% over the g++ +// 4.4.3's builtin hashtable. +class HandleTable { + public: + HandleTable() : length_(0), elems_(0), list_(NULL) { Resize(); } + ~HandleTable() { delete[] list_; } + + LRUHandle* Lookup(LRUHandle* h) { + return *FindPointer(h); + } + + LRUHandle* Insert(LRUHandle* h) { + LRUHandle** ptr = FindPointer(h); + LRUHandle* old = *ptr; + h->next_hash = (old == NULL ? NULL : old->next_hash); + *ptr = h; + if (old == NULL) { + ++elems_; + if (elems_ > length_) { + // Since each cache entry is fairly large, we aim for a small + // average linked list length (<= 1). + Resize(); + } } - bool operator() (LRUHandle* a, LRUHandle* b) const { - return a->key().compare(b->key()) < 0; + return old; + } + + LRUHandle* Remove(LRUHandle* h) { + LRUHandle** ptr = FindPointer(h); + LRUHandle* result = *ptr; + if (result != NULL) { + *ptr = result->next_hash; + --elems_; } - }; - typedef base::hash_set<LRUHandle*, HandleHashCompare> HandleTable; -#else - struct HandleHash { - inline size_t operator()(LRUHandle* h) const { - Slice k = h->key(); - return Hash(k.data(), k.size(), 0); + return result; + } + + private: + // The table consists of an array of buckets where each bucket is + // a linked list of cache entries that hash into the bucket. + uint32_t length_; + uint32_t elems_; + LRUHandle** list_; + + // Return a pointer to slot that points to a cache entry that + // matches *h. If there is no such cache entry, return a pointer to + // the trailing slot in the corresponding linked list.
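The FindPointer helper that follows is the heart of this table: it returns the address of the link that points at the matching entry, so insertion and removal need no special case for the bucket head. A standalone sketch of the idiom (illustrative only; Node, FindSlot and Remove are hypothetical names, not part of the patch):

  #include <stddef.h>

  struct Node { int key; Node* next; };

  // Walk the chain by link address rather than by node, so the returned
  // slot can be overwritten directly whether it is the bucket head or an
  // interior link.
  Node** FindSlot(Node** head, int key) {
    Node** ptr = head;
    while (*ptr != NULL && (*ptr)->key != key) {
      ptr = &(*ptr)->next;
    }
    return ptr;  // either the matching link or the trailing NULL link
  }

  void Remove(Node** head, int key) {
    Node** slot = FindSlot(head, key);
    if (*slot != NULL) {
      *slot = (*slot)->next;  // unlink with one store; no prev pointer needed
    }
  }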
+ LRUHandle** FindPointer(LRUHandle* h) { + Slice key = h->key(); + uint32_t hash = Hash(key.data(), key.size(), 0); + LRUHandle** ptr = &list_[hash & (length_ - 1)]; + while (*ptr != NULL && key != (*ptr)->key()) { + ptr = &(*ptr)->next_hash; } - }; + return ptr; + } - struct HandleEq { - inline bool operator()(LRUHandle* a, LRUHandle* b) const { - return a->key() == b->key(); + void Resize() { + uint32_t new_length = 4; + while (new_length < elems_) { + new_length *= 2; + } + LRUHandle** new_list = new LRUHandle*[new_length]; + memset(new_list, 0, sizeof(new_list[0]) * new_length); + uint32_t count = 0; + for (int i = 0; i < length_; i++) { + LRUHandle* h = list_[i]; + while (h != NULL) { + LRUHandle* next = h->next_hash; + Slice key = h->key(); + uint32_t hash = Hash(key.data(), key.size(), 0); + LRUHandle** ptr = &new_list[hash & (new_length - 1)]; + h->next_hash = *ptr; + *ptr = h; + h = next; + count++; + } } - }; -# if defined(LEVELDB_PLATFORM_CHROMIUM) - typedef base::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable; -# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID) - typedef std::unordered_set<LRUHandle*, HandleHash, HandleEq> HandleTable; -# else - typedef __gnu_cxx::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable; -# endif -#endif + assert(elems_ == count); + delete[] list_; + list_ = new_list; + length_ = new_length; + } +}; class LRUCache : public Cache { public: @@ -132,7 +175,6 @@ LRUCache::LRUCache(size_t capacity) } LRUCache::~LRUCache() { - table_.clear(); for (LRUHandle* e = lru_.next; e != &lru_; ) { LRUHandle* next = e->next; assert(e->refs == 1); // Error if caller has an unreleased handle @@ -170,16 +212,13 @@ Cache::Handle* LRUCache::Lookup(const Slice& key) { LRUHandle dummy; dummy.next = &dummy; dummy.value = const_cast<Slice*>(&key); - HandleTable::iterator iter = table_.find(&dummy); - if (iter == table_.end()) { - return NULL; - } else { - LRUHandle* e = const_cast<LRUHandle*>(*iter); + LRUHandle* e = table_.Lookup(&dummy); + if (e != NULL) { e->refs++; LRU_Remove(e); LRU_Append(e); - return reinterpret_cast<Handle*>(e); } + return reinterpret_cast<Handle*>(e); } void* LRUCache::Value(Handle* handle) { @@ -206,20 +245,16 @@ Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge, LRU_Append(e); usage_ += charge; - std::pair<HandleTable::iterator, bool> p = table_.insert(e); - if (!p.second) { - // Kill existing entry - LRUHandle* old = const_cast<LRUHandle*>(*(p.first)); + LRUHandle* old = table_.Insert(e); + if (old != NULL) { LRU_Remove(old); - table_.erase(p.first); - table_.insert(e); Unref(old); } while (usage_ > capacity_ && lru_.next != &lru_) { LRUHandle* old = lru_.next; LRU_Remove(old); - table_.erase(old); + table_.Remove(old); Unref(old); } @@ -232,11 +267,9 @@ void LRUCache::Erase(const Slice& key) { LRUHandle dummy; dummy.next = &dummy; dummy.value = const_cast<Slice*>(&key); - HandleTable::iterator iter = table_.find(&dummy); - if (iter != table_.end()) { - LRUHandle* e = const_cast<LRUHandle*>(*iter); + LRUHandle* e = table_.Remove(&dummy); + if (e != NULL) { LRU_Remove(e); - table_.erase(iter); Unref(e); } } -- cgit v1.2.3 From bb1914e5b77bd14283e80d411feccffa26488003 Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Wed, 29 Jun 2011 22:53:17 +0000 Subject: Fixing Makefile issue reported in Issue 15 (misspelled flag) git-svn-id: http://leveldb.googlecode.com/svn/trunk@35 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 3 +- build_detect_platform | 8 ++-- port/port_osx.cc | 50 -------------------- port/port_osx.h | 125 -------------------------------------------------- 4 files changed, 6 insertions(+), 180 deletions(-) delete mode 100644 port/port_osx.cc delete mode 100644
port/port_osx.h diff --git a/Makefile b/Makefile index 0537762..ca0dabc 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ else GOOGLE_PERFTOOLS_LDFLAGS= endif -CFLAGS = -c -I. -I./include $(PORT_CFLAGS) $(PLATFORM_CCFLAGS) $(OPT) $(SNAPPY_CFLAGS) +CFLAGS = -c -I. -I./include $(PORT_CFLAGS) $(PLATFORM_CFLAGS) $(OPT) $(SNAPPY_CFLAGS) LDFLAGS=$(PLATFORM_LDFLAGS) $(SNAPPY_LDFLAGS) $(GOOGLE_PERFTOOLS_LDFLAGS) @@ -168,6 +168,7 @@ ifeq ($(PLATFORM), IOS) SIMULATORROOT=/Developer/Platforms/iPhoneSimulator.platform/Developer DEVICEROOT=/Developer/Platforms/iPhoneOS.platform/Developer IOSVERSION=$(shell defaults read /Developer/Platforms/iPhoneOS.platform/version CFBundleShortVersionString) + .cc.o: mkdir -p ios-x86/$(dir $@) $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 $< -o ios-x86/$@ diff --git a/build_detect_platform b/build_detect_platform index f23068a..bb108f7 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -17,8 +17,8 @@ rm -f build_config.mk case `uname -s` in Darwin) PLATFORM=OS_MACOSX - echo "PLATFORM_CFLAGS=-pthread -DOS_MACOSX" >> build_config.mk - echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk + echo "PLATFORM_CFLAGS=-DOS_MACOSX" >> build_config.mk + echo "PLATFORM_LDFLAGS=" >> build_config.mk ;; Linux) PLATFORM=OS_LINUX @@ -49,9 +49,9 @@ g++ $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null < -#include -#include -#include "util/logging.h" - -namespace leveldb { -namespace port { - -static void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - abort(); - } -} - -Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } - -Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } - -void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } - -void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } - -CondVar::CondVar(Mutex* mu) - : mu_(mu) { - PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); -} - -CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } - -void CondVar::Wait() { - PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); -} - -void CondVar::Signal() { - PthreadCall("signal", pthread_cond_signal(&cv_)); -} - -void CondVar::SignalAll() { - PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); -} - -} -} diff --git a/port/port_osx.h b/port/port_osx.h deleted file mode 100644 index 5524c6c..0000000 --- a/port/port_osx.h +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -#ifndef STORAGE_LEVELDB_PORT_PORT_OSX_H_ -#define STORAGE_LEVELDB_PORT_PORT_OSX_H_ - -#include -#include -#include -#include - -#include - -namespace leveldb { - -// The following 4 methods implemented here for the benefit of env_posix.cc. 
-inline size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { - return fread(a, b, c, d); -} - -inline size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { - return fwrite(a, b, c, d); -} - -inline int fflush_unlocked(FILE *f) { - return fflush(f); -} - -inline int fdatasync(int fd) { - return fsync(fd); -} - -namespace port { - -static const bool kLittleEndian = (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN); - -// ------------------ Threading ------------------- - -// A Mutex represents an exclusive lock. -class Mutex { - public: - Mutex(); - ~Mutex(); - - void Lock(); - void Unlock(); - void AssertHeld() { } - - private: - friend class CondVar; - pthread_mutex_t mu_; - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - - void Wait(); - void Signal(); - void SignalAll(); - - private: - pthread_cond_t cv_; - Mutex* mu_; -}; - -inline void MemoryBarrier() { -#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) - // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on - // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. - __asm__ __volatile__("" : : : "memory"); -#else - OSMemoryBarrier(); -#endif -} - -class AtomicPointer { - private: - void* ptr_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* p) : ptr_(p) {} - inline void* Acquire_Load() const { - void* ptr = ptr_; - MemoryBarrier(); - return ptr; - } - inline void Release_Store(void* v) { - MemoryBarrier(); - ptr_ = v; - } - inline void* NoBarrier_Load() const { - return ptr_; - } - inline void NoBarrier_Store(void* v) { - ptr_ = v; - } -}; - -inline bool Snappy_Compress(const char* input, size_t input_length, - std::string* output) { - return false; -} - -inline bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output) { - return false; -} - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_OSX_H_ -- cgit v1.2.3 From e67cd63d8a779971247078b2715b6ae7b37dbc2e Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Thu, 30 Jun 2011 23:17:03 +0000 Subject: Fixed a snappy compression wrapper bug (passing wrong variable). Change atomic_pointer.h to prefer a memory barrier based implementation over a <cstdatomic> based implementation for the following reasons: (1) On an x86-32-bit gcc-4.4 build, <cstdatomic> was corrupting the AtomicPointer. (2) On an x86-64-bit gcc build, a <cstdatomic> based acquire-load takes ~15ns as opposed to the ~1ns for a memory-barrier based implementation. Fixes issue 9 (corruption_test fails) http://code.google.com/p/leveldb/issues/detail?id=9 Fixes issue 16 (CorruptionTest.MissingDescriptor fails) http://code.google.com/p/leveldb/issues/detail?id=16 git-svn-id: http://leveldb.googlecode.com/svn/trunk@36 62dab493-f737-651d-591e-8d6aee1b9529 --- port/atomic_pointer.h | 208 ++++++++++++++++---------------------------------- port/port_posix.h | 2 +- 2 files changed, 66 insertions(+), 144 deletions(-) diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h index 3bae007..7659840 100644 --- a/port/atomic_pointer.h +++ b/port/atomic_pointer.h @@ -4,57 +4,31 @@ // AtomicPointer provides storage for a lock-free pointer.
// Platform-dependent implementation of AtomicPointer: +// - If the platform provides a cheap barrier, we use it with raw pointers // - If cstdatomic is present (on newer versions of gcc, it is), we use -// a cstdatomic-based AtomicPointer -// - If it is not, we define processor-dependent AtomicWord operations, -// and then use them to build AtomicPointer -// +// a cstdatomic-based AtomicPointer. However we prefer the memory +// barrier based version, because at least on a gcc 4.4 32-bit build +// on linux, we have encountered a buggy +// <cstdatomic> implementation. Also, some <cstdatomic> implementations are much +// slower than a memory-barrier based implementation (~16ns for +// <cstdatomic> based acquire-load vs. ~1ns for a barrier based +// acquire-load). // This code is based on atomicops-internals-* in Google's perftools: // http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase #ifndef PORT_ATOMIC_POINTER_H_ #define PORT_ATOMIC_POINTER_H_ +#include <stdint.h> #ifdef LEVELDB_CSTDATOMIC_PRESENT - -/////////////////////////////////////////////////////////////////////////////// -// WE HAVE <cstdatomic> -// Use a <cstdatomic>-based AtomicPointer - #include <cstdatomic> -#include <stdint.h> - -namespace leveldb { -namespace port { - -// Storage for a lock-free pointer -class AtomicPointer { - private: - std::atomic<void*> rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - return rep_.load(std::memory_order_acquire); - } - inline void Release_Store(void* v) { - rep_.store(v, std::memory_order_release); - } - inline void* NoBarrier_Load() const { - return rep_.load(std::memory_order_relaxed); - } - inline void NoBarrier_Store(void* v) { - rep_.store(v, std::memory_order_relaxed); - } -}; - -} // namespace leveldb::port -} // namespace leveldb - -#else -/////////////////////////////////////////////////////////////////////////////// -// NO <cstdatomic> -// The entire rest of this file covers that case +#endif +#ifdef OS_WIN +#include <windows.h> +#endif +#ifdef OS_MACOSX +#include <libkern/OSAtomic.h> +#endif #if defined(_M_X64) || defined(__x86_64__) #define ARCH_CPU_X86_FAMILY 1 @@ -62,152 +36,100 @@ class AtomicPointer { #define ARCH_CPU_X86_FAMILY 1 #elif defined(__ARMEL__) #define ARCH_CPU_ARM_FAMILY 1 -#else -#warning Please add support for your architecture in atomicpointer.h #endif namespace leveldb { namespace port { -namespace internal { - -// AtomicWord is a machine-sized pointer. -typedef intptr_t AtomicWord; - -} // namespace leveldb::port::internal -} // namespace leveldb::port -} // namespace leveldb -// Include our platform specific implementation. -/////////////////////////////////////////////////////////////////////////////// +// Define MemoryBarrier() if available // Windows on x86 #if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY) - -// void MemoryBarrier(void) macro is defined in windows.h: +// windows.h already provides a MemoryBarrier(void) macro // http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx -// Including windows.h here; MemoryBarrier() gets used below.
-#include <windows.h> - -/////////////////////////////////////////////////////////////////////////////// -// Mac OS on x86 -#elif defined(OS_MACOSX) && defined(ARCH_CPU_X86_FAMILY) - -#include <libkern/OSAtomic.h> - -namespace leveldb { -namespace port { -namespace internal { +#define LEVELDB_HAVE_MEMORY_BARRIER +// Gcc on x86 +#elif defined(__GNUC__) && defined(ARCH_CPU_X86_FAMILY) inline void MemoryBarrier() { -#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. __asm__ __volatile__("" : : : "memory"); -#else - OSMemoryBarrier(); -#endif } +#define LEVELDB_HAVE_MEMORY_BARRIER -} // namespace leveldb::port::internal -} // namespace leveldb::port -} // namespace leveldb - -/////////////////////////////////////////////////////////////////////////////// -// Any x86 CPU -#elif defined(ARCH_CPU_X86_FAMILY) - -namespace leveldb { -namespace port { -namespace internal { - +// Mac OS +#elif defined(OS_MACOSX) inline void MemoryBarrier() { - __asm__ __volatile__("" : : : "memory"); + OSMemoryBarrier(); } +#define LEVELDB_HAVE_MEMORY_BARRIER -} // namespace leveldb::port::internal -} // namespace leveldb::port -} // namespace leveldb - -#undef ATOMICOPS_COMPILER_BARRIER - -/////////////////////////////////////////////////////////////////////////////// // ARM #elif defined(ARCH_CPU_ARM_FAMILY) - -namespace leveldb { -namespace port { -namespace internal { - typedef void (*LinuxKernelMemoryBarrierFunc)(void); LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier __attribute__((weak)) = (LinuxKernelMemoryBarrierFunc) 0xffff0fa0; - inline void MemoryBarrier() { pLinuxKernelMemoryBarrier(); } +#define LEVELDB_HAVE_MEMORY_BARRIER -} // namespace leveldb::port::internal -} // namespace leveldb::port -} // namespace leveldb - -#else -#error "Atomic operations are not supported on your platform" #endif -/////////////////////////////////////////////////////////////////////////////// -// Implementation of AtomicPointer based on MemoryBarriers above - -namespace leveldb { -namespace port { -namespace internal { - -// Atomic operations using per-system MemoryBarrier()s - -inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) { - AtomicWord value = *ptr; - MemoryBarrier(); - return value; -} - -inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) { - MemoryBarrier(); - *ptr = value; -} - -inline AtomicWord NoBarrier_Load(volatile const AtomicWord* ptr) { - return *ptr; -} - -inline void NoBarrier_Store(volatile AtomicWord* ptr, AtomicWord value) { - *ptr = value; -} - -} // namespace leveldb::port::internal +// AtomicPointer built using platform-specific MemoryBarrier() +#if defined(LEVELDB_HAVE_MEMORY_BARRIER) +class AtomicPointer { + private: + void* rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* p) : rep_(p) {} + inline void* NoBarrier_Load() const { return rep_; } + inline void NoBarrier_Store(void* v) { rep_ = v; } + inline void* Acquire_Load() const { + void* result = rep_; + MemoryBarrier(); + return result; + } + inline void Release_Store(void* v) { + MemoryBarrier(); + rep_ = v; + } +}; -// AtomicPointer definition for systems without <cstdatomic>.
+// AtomicPointer based on <cstdatomic> +#elif defined(LEVELDB_CSTDATOMIC_PRESENT) class AtomicPointer { private: - typedef internal::AtomicWord Rep; - Rep rep_; + std::atomic<void*> rep_; public: AtomicPointer() { } - explicit AtomicPointer(void* p) : rep_(reinterpret_cast<Rep>(p)) {} + explicit AtomicPointer(void* v) : rep_(v) { } inline void* Acquire_Load() const { - return reinterpret_cast<void*>(internal::Acquire_Load(&rep_)); + return rep_.load(std::memory_order_acquire); } inline void Release_Store(void* v) { - internal::Release_Store(&rep_, reinterpret_cast<Rep>(v)); + rep_.store(v, std::memory_order_release); } inline void* NoBarrier_Load() const { - return reinterpret_cast<void*>(internal::NoBarrier_Load(&rep_)); + return rep_.load(std::memory_order_relaxed); } inline void NoBarrier_Store(void* v) { - internal::NoBarrier_Store(&rep_, reinterpret_cast<Rep>(v)); + rep_.store(v, std::memory_order_relaxed); } }; +// We have neither MemoryBarrier(), nor <cstdatomic> +#else +#error Please implement AtomicPointer for this platform. + +#endif + +#undef LEVELDB_HAVE_MEMORY_BARRIER +#undef ARCH_CPU_X86_FAMILY +#undef ARCH_CPU_ARM_FAMILY + } // namespace leveldb::port } // namespace leveldb -#endif // LEVELDB_CSTDATOMIC_PRESENT - #endif // PORT_ATOMIC_POINTER_H_ diff --git a/port/port_posix.h b/port/port_posix.h index 3f329f0..2995026 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -97,7 +97,7 @@ inline bool Snappy_Uncompress(const char* input_data, size_t input_length, ::std::string* output) { #ifdef SNAPPY size_t ulength; - if (!snappy::GetUncompressedLength(input_data, ulength, &ulength)) { + if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) { return false; } output->resize(ulength); -- cgit v1.2.3 From 917b88dd720b6e658c1fd7812bc61c605f315124 Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Fri, 15 Jul 2011 00:20:57 +0000 Subject: Small tweaks and bugfixes for Issue 18 and 19. Slight tweak to the no-overlap optimization: only push to level 2 to reduce the amount of wasted space when the same small key range is being repeatedly overwritten. Fix for Issue 18: Avoid failure on Windows by avoiding deletion of lock file until the end of DestroyDB(). Fix for Issue 19: Disregard sequence numbers when checking for overlap in sstable ranges. This fixes issue 19: when writing the same key over and over again, we would generate a sequence of sstables that were never merged together since their sequence numbers were disjoint. Don't ignore map/unmap error checks. Miscellaneous fixes for small problems Sanjay found while diagnosing issue/9 and issue/16 (corruption_test failures). - log::Reader reports the record type when it finds an unexpected type. - log::Reader no longer reports an error when it encounters an expected zero record regardless of the setting of the "checksum" flag. - Added a missing forward declaration. - Documented a side-effect of larger write buffer sizes (longer recovery time).
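The Issue 19 fix above turns on a property of internal keys: they sort by user key ascending, then by sequence number descending, so two tables holding only key "k" at sequence ranges 200..101 and 100..1 have disjoint internal key ranges even though both cover the same user key. Overlap checks therefore have to compare user keys. A simplified standalone sketch of such a check (illustrative; not the actual SomeFileOverlapsRange, and UserRangesOverlap is a hypothetical name):

  #include "leveldb/comparator.h"
  #include "leveldb/slice.h"

  // Two closed user-key ranges [a_min,a_max] and [b_min,b_max] overlap
  // iff neither lies entirely before the other.
  bool UserRangesOverlap(const leveldb::Slice& a_min, const leveldb::Slice& a_max,
                         const leveldb::Slice& b_min, const leveldb::Slice& b_max,
                         const leveldb::Comparator* ucmp) {
    return ucmp->Compare(a_max, b_min) >= 0 &&
           ucmp->Compare(b_max, a_min) >= 0;
  }

Run against the two tables above with ucmp = leveldb::BytewiseComparator(), both user-key ranges are ["k","k"], so this reports overlap, which is what forces such tables to be considered together during compaction.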
git-svn-id: http://leveldb.googlecode.com/svn/trunk@37 62dab493-f737-651d-591e-8d6aee1b9529 --- db/corruption_test.cc | 2 +- db/db_impl.cc | 19 +++++++++------ db/db_test.cc | 23 ++++++++++++++++-- db/dbformat.h | 8 +++++++ db/log_reader.cc | 27 ++++++++++++--------- db/version_set.cc | 19 +++++++++------ db/version_set.h | 12 +++++----- db/version_set_test.cc | 21 +++++++++++----- db/write_batch_internal.h | 2 ++ include/leveldb/options.h | 2 ++ util/env_posix.cc | 61 ++++++++++++++++++++++++++++------------------- 11 files changed, 131 insertions(+), 65 deletions(-) diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 8015101..69fa03a 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -295,7 +295,7 @@ TEST(CorruptionTest, CompactionInputError) { Build(10); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); - const int last = config::kNumLevels - 1; + const int last = config::kMaxMemCompactLevel; ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last))); Corrupt(kTableFile, 100, 1); diff --git a/db/db_impl.cc b/db/db_impl.cc index 7556d5a..48056da 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -456,10 +456,13 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, // should not be added to the manifest. int level = 0; if (s.ok() && meta.file_size > 0) { - if (base != NULL && !base->OverlapInLevel(0, meta.smallest, meta.largest)) { - // Push to largest level we can without causing overlaps - while (level + 1 < config::kNumLevels && - !base->OverlapInLevel(level + 1, meta.smallest, meta.largest)) { + const Slice min_user_key = meta.smallest.user_key(); + const Slice max_user_key = meta.largest.user_key(); + if (base != NULL && !base->OverlapInLevel(0, min_user_key, max_user_key)) { + // Push the new sstable to a higher level if possible to reduce + // expensive manifest file ops. + while (level < config::kMaxMemCompactLevel && + !base->OverlapInLevel(level + 1, min_user_key, max_user_key)) { level++; } } @@ -1276,12 +1279,14 @@ Status DestroyDB(const std::string& dbname, const Options& options) { } FileLock* lock; - Status result = env->LockFile(LockFileName(dbname), &lock); + const std::string lockname = LockFileName(dbname); + Status result = env->LockFile(lockname, &lock); if (result.ok()) { uint64_t number; FileType type; for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { + if (ParseFileName(filenames[i], &number, &type) && + filenames[i] != lockname) { // Lock file will be deleted at end Status del = env->DeleteFile(dbname + "/" + filenames[i]); if (result.ok() && !del.ok()) { result = del; @@ -1289,7 +1294,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) { } } env->UnlockFile(lock); // Ignore error since state is already gone - env->DeleteFile(LockFileName(dbname)); + env->DeleteFile(lockname); env->DeleteDir(dbname); // Ignore error in case dir contains other files } return result; diff --git a/db/db_test.cc b/db/db_test.cc index d5d60cd..0ac29e6 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -650,6 +650,25 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { } } +TEST(DBTest, RepeatedWritesToSameKey) { + Options options; + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); + + // We must have at most one file per level except for level-0, + // which may have up to kL0_StopWritesTrigger files. 
+ const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger; + + Random rnd(301); + std::string value = RandomString(&rnd, 2 * options.write_buffer_size); + for (int i = 0; i < 5 * kMaxFiles; i++) { + Put("key", value); + ASSERT_LE(TotalTableFiles(), kMaxFiles); + fprintf(stderr, "after %d: %d files\n", int(i+1), TotalTableFiles()); + } +} + TEST(DBTest, SparseMerge) { Options options; options.compression = kNoCompression; @@ -863,7 +882,7 @@ TEST(DBTest, HiddenValuesAreRemoved) { TEST(DBTest, DeletionMarkers1) { Put("foo", "v1"); ASSERT_OK(dbfull()->TEST_CompactMemTable()); - const int last = config::kNumLevels - 1; + const int last = config::kMaxMemCompactLevel; ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level // Place a table at level last-1 to prevent merging with preceding mutation @@ -891,7 +910,7 @@ TEST(DBTest, DeletionMarkers1) { TEST(DBTest, DeletionMarkers2) { Put("foo", "v1"); ASSERT_OK(dbfull()->TEST_CompactMemTable()); - const int last = config::kNumLevels - 1; + const int last = config::kMaxMemCompactLevel; ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level // Place a table at level last-1 to prevent merging with preceding mutation diff --git a/db/dbformat.h b/db/dbformat.h index 97491bc..ec1d193 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -29,6 +29,14 @@ static const int kL0_SlowdownWritesTrigger = 8; // Maximum number of level-0 files. We stop writes at this point. static const int kL0_StopWritesTrigger = 12; +// Maximum level to which a new compacted memtable is pushed if it +// does not create overlap. We try to push to level 2 to avoid the +// relatively expensive level 0=>1 compactions and to avoid some +// expensive manifest file operations. We do not push all the way to +// the largest level since that can generate a lot of wasted disk +// space if the same key space is being repeatedly overwritten. +static const int kMaxMemCompactLevel = 2; + } class InternalKey; diff --git a/db/log_reader.cc b/db/log_reader.cc index 8721071..fcb3aa7 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -4,6 +4,7 @@ #include "db/log_reader.h" +#include #include "leveldb/env.h" #include "util/coding.h" #include "util/crc32c.h" @@ -72,7 +73,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) { Slice fragment; while (true) { uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); - switch (ReadPhysicalRecord(&fragment)) { + const unsigned int record_type = ReadPhysicalRecord(&fragment); + switch (record_type) { case kFullType: if (in_fragmented_record) { // Handle bug in earlier versions of log::Writer where @@ -144,13 +146,16 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) { } break; - default: + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", record_type); ReportCorruption( (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), - "unknown record type"); + buf); in_fragmented_record = false; scratch->clear(); break; + } } } return false; @@ -212,16 +217,16 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { return kBadRecord; } + if (type == kZeroType && length == 0) { + // Skip zero length record without reporting any drops since + // such records are produced by the mmap based writing code in + // env_posix.cc that preallocates file regions. 
+ buffer_.clear(); + return kBadRecord; + } + // Check crc if (checksum_) { - if (type == kZeroType && length == 0) { - // Skip zero length record without reporting any drops since - // such records are produced by the mmap based writing code in - // env_posix.cc that preallocates file regions. - buffer_.clear(); - return kBadRecord; - } - uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); if (actual_crc != expected_crc) { diff --git a/db/version_set.cc b/db/version_set.cc index 54342e4..816f189 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -99,11 +99,14 @@ int FindFile(const InternalKeyComparator& icmp, bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, const std::vector& files, - const InternalKey& smallest, - const InternalKey& largest) { - const int index = FindFile(icmp, files, smallest.Encode()); + const Slice& smallest_user_key, + const Slice& largest_user_key) { + // Find the earliest possible internal key for smallest_user_key + InternalKey small(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); + const int index = FindFile(icmp, files, small.Encode()); return ((index < files.size()) && - icmp.Compare(largest, files[index]->smallest) >= 0); + icmp.user_comparator()->Compare( + largest_user_key, files[index]->smallest.user_key()) >= 0); } // An internal iterator. For a given version/level pair, yields @@ -353,9 +356,11 @@ void Version::Unref() { } bool Version::OverlapInLevel(int level, - const InternalKey& smallest, - const InternalKey& largest) { - return SomeFileOverlapsRange(vset_->icmp_, files_[level], smallest, largest); + const Slice& smallest_user_key, + const Slice& largest_user_key) { + return SomeFileOverlapsRange(vset_->icmp_, files_[level], + smallest_user_key, + largest_user_key); } std::string Version::DebugString() const { diff --git a/db/version_set.h b/db/version_set.h index f00c35a..693fc6f 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -42,13 +42,13 @@ extern int FindFile(const InternalKeyComparator& icmp, const std::vector& files, const Slice& key); -// Returns true iff some file in "files" overlaps some part of +// Returns true iff some file in "files" overlaps the user key range // [smallest,largest]. extern bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, const std::vector& files, - const InternalKey& smallest, - const InternalKey& largest); + const Slice& smallest_user_key, + const Slice& largest_user_key); class Version { public: @@ -78,10 +78,10 @@ class Version { void Unref(); // Returns true iff some file in the specified level overlaps - // some part of [smallest,largest]. + // some part of [smallest_user_key,largest_user_key]. 
bool OverlapInLevel(int level, - const InternalKey& smallest, - const InternalKey& largest); + const Slice& smallest_user_key, + const Slice& largest_user_key); int NumFiles(int level) const { return files_[level].size(); } diff --git a/db/version_set_test.cc b/db/version_set_test.cc index eae2a80..ecfd62b 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -19,11 +19,13 @@ class FindFileTest { } } - void Add(const char* smallest, const char* largest) { + void Add(const char* smallest, const char* largest, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100) { FileMetaData* f = new FileMetaData; f->number = files_.size() + 1; - f->smallest = InternalKey(smallest, 100, kTypeValue); - f->largest = InternalKey(largest, 100, kTypeValue); + f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); + f->largest = InternalKey(largest, largest_seq, kTypeValue); files_.push_back(f); } @@ -34,10 +36,8 @@ class FindFileTest { } bool Overlaps(const char* smallest, const char* largest) { - InternalKey s(smallest, 100, kTypeValue); - InternalKey l(largest, 100, kTypeValue); InternalKeyComparator cmp(BytewiseComparator()); - return SomeFileOverlapsRange(cmp, files_, s, l); + return SomeFileOverlapsRange(cmp, files_, smallest, largest); } }; @@ -108,6 +108,15 @@ TEST(FindFileTest, Multiple) { ASSERT_TRUE(Overlaps("450", "500")); } +TEST(FindFileTest, OverlapSequenceChecks) { + Add("200", "200", 5000, 3000); + ASSERT_TRUE(! Overlaps("199", "199")); + ASSERT_TRUE(! Overlaps("201", "300")); + ASSERT_TRUE(Overlaps("200", "200")); + ASSERT_TRUE(Overlaps("190", "200")); + ASSERT_TRUE(Overlaps("200", "210")); +} + } int main(int argc, char** argv) { diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index d975444..6d65eed 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -9,6 +9,8 @@ namespace leveldb { +class MemTable; + // WriteBatchInternal provides static methods for manipulating a // WriteBatch that we don't want in the public WriteBatch interface. class WriteBatchInternal { diff --git a/include/leveldb/options.h b/include/leveldb/options.h index a94651f..a0afbf2 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -75,6 +75,8 @@ struct Options { // Larger values increase performance, especially during bulk loads. // Up to two write buffers may be held in memory at the same time, // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. 
// // Default: 4MB size_t write_buffer_size; diff --git a/util/env_posix.cc b/util/env_posix.cc index fec1599..46723e2 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -28,6 +28,10 @@ namespace leveldb { namespace { +static Status IOError(const std::string& context, int err_number) { + return Status::IOError(context, strerror(err_number)); +} + class PosixSequentialFile: public SequentialFile { private: std::string filename_; @@ -47,7 +51,7 @@ class PosixSequentialFile: public SequentialFile { // We leave status as ok if we hit the end of the file } else { // A partial read with an error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); + s = IOError(filename_, errno); } } return s; @@ -55,7 +59,7 @@ class PosixSequentialFile: public SequentialFile { virtual Status Skip(uint64_t n) { if (fseek(file_, n, SEEK_CUR)) { - return Status::IOError(filename_, strerror(errno)); + return IOError(filename_, errno); } return Status::OK(); } @@ -78,7 +82,7 @@ class PosixRandomAccessFile: public RandomAccessFile { *result = Slice(scratch, (r < 0) ? 0 : r); if (r < 0) { // An error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); + s = IOError(filename_, errno); } return s; } @@ -114,13 +118,16 @@ class PosixMmapFile : public WritableFile { return s; } - void UnmapCurrentRegion() { + bool UnmapCurrentRegion() { + bool result = true; if (base_ != NULL) { if (last_sync_ < limit_) { // Defer syncing this data until next Sync() call, if any pending_sync_ = true; } - munmap(base_, limit_ - base_); + if (munmap(base_, limit_ - base_) != 0) { + result = false; + } file_offset_ += limit_ - base_; base_ = NULL; limit_ = NULL; @@ -132,6 +139,7 @@ class PosixMmapFile : public WritableFile { map_size_ *= 2; } } + return result; } bool MapNewRegion() { @@ -181,8 +189,10 @@ class PosixMmapFile : public WritableFile { assert(dst_ <= limit_); size_t avail = limit_ - dst_; if (avail == 0) { - UnmapCurrentRegion(); - MapNewRegion(); + if (!UnmapCurrentRegion() || + !MapNewRegion()) { + return IOError(filename_, errno); + } } size_t n = (left <= avail) ? 
left : avail; @@ -197,17 +207,18 @@ class PosixMmapFile : public WritableFile { virtual Status Close() { Status s; size_t unused = limit_ - dst_; - UnmapCurrentRegion(); - if (unused > 0) { + if (!UnmapCurrentRegion()) { + s = IOError(filename_, errno); + } else if (unused > 0) { // Trim the extra space at the end of the file if (ftruncate(fd_, file_offset_ - unused) < 0) { - s = Status::IOError(filename_, strerror(errno)); + s = IOError(filename_, errno); } } if (close(fd_) < 0) { if (s.ok()) { - s = Status::IOError(filename_, strerror(errno)); + s = IOError(filename_, errno); } } @@ -228,7 +239,7 @@ class PosixMmapFile : public WritableFile { // Some unmapped data was not synced pending_sync_ = false; if (fdatasync(fd_) < 0) { - s = Status::IOError(filename_, strerror(errno)); + s = IOError(filename_, errno); } } @@ -239,7 +250,7 @@ class PosixMmapFile : public WritableFile { size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); last_sync_ = dst_; if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { - s = Status::IOError(filename_, strerror(errno)); + s = IOError(filename_, errno); } } @@ -276,7 +287,7 @@ class PosixEnv : public Env { FILE* f = fopen(fname.c_str(), "r"); if (f == NULL) { *result = NULL; - return Status::IOError(fname, strerror(errno)); + return IOError(fname, errno); } else { *result = new PosixSequentialFile(fname, f); return Status::OK(); @@ -288,7 +299,7 @@ class PosixEnv : public Env { int fd = open(fname.c_str(), O_RDONLY); if (fd < 0) { *result = NULL; - return Status::IOError(fname, strerror(errno)); + return IOError(fname, errno); } *result = new PosixRandomAccessFile(fname, fd); return Status::OK(); @@ -300,7 +311,7 @@ class PosixEnv : public Env { const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); if (fd < 0) { *result = NULL; - s = Status::IOError(fname, strerror(errno)); + s = IOError(fname, errno); } else { *result = new PosixMmapFile(fname, fd, page_size_); } @@ -316,7 +327,7 @@ class PosixEnv : public Env { result->clear(); DIR* d = opendir(dir.c_str()); if (d == NULL) { - return Status::IOError(dir, strerror(errno)); + return IOError(dir, errno); } struct dirent* entry; while ((entry = readdir(d)) != NULL) { @@ -329,7 +340,7 @@ class PosixEnv : public Env { virtual Status DeleteFile(const std::string& fname) { Status result; if (unlink(fname.c_str()) != 0) { - result = Status::IOError(fname, strerror(errno)); + result = IOError(fname, errno); } return result; }; @@ -337,7 +348,7 @@ class PosixEnv : public Env { virtual Status CreateDir(const std::string& name) { Status result; if (mkdir(name.c_str(), 0755) != 0) { - result = Status::IOError(name, strerror(errno)); + result = IOError(name, errno); } return result; }; @@ -345,7 +356,7 @@ class PosixEnv : public Env { virtual Status DeleteDir(const std::string& name) { Status result; if (rmdir(name.c_str()) != 0) { - result = Status::IOError(name, strerror(errno)); + result = IOError(name, errno); } return result; }; @@ -355,7 +366,7 @@ class PosixEnv : public Env { struct stat sbuf; if (stat(fname.c_str(), &sbuf) != 0) { *size = 0; - s = Status::IOError(fname, strerror(errno)); + s = IOError(fname, errno); } else { *size = sbuf.st_size; } @@ -365,7 +376,7 @@ class PosixEnv : public Env { virtual Status RenameFile(const std::string& src, const std::string& target) { Status result; if (rename(src.c_str(), target.c_str()) != 0) { - result = Status::IOError(src, strerror(errno)); + result = IOError(src, errno); } return result; } @@ -375,9 +386,9 @@ class PosixEnv : public Env { Status 
result; int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); if (fd < 0) { - result = Status::IOError(fname, strerror(errno)); + result = IOError(fname, errno); } else if (LockOrUnlock(fd, true) == -1) { - result = Status::IOError("lock " + fname, strerror(errno)); + result = IOError("lock " + fname, errno); close(fd); } else { PosixFileLock* my_lock = new PosixFileLock; @@ -391,7 +402,7 @@ class PosixEnv : public Env { PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock); Status result; if (LockOrUnlock(my_lock->fd_, false) == -1) { - result = Status::IOError(strerror(errno)); + result = IOError("unlock", errno); } close(my_lock->fd_); delete my_lock; -- cgit v1.2.3 From 51f892d349df9ed408f5a0e6e012667e5eae5f8b Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Tue, 19 Jul 2011 23:36:47 +0000 Subject: Sun Studio support, and fixes for test related memory leaks. - LevelDB patch for Sun Studio Based on a patch submitted by Theo Schlossnagle - thanks! This fixes Issue 17. - Fix a couple of test related memory leaks. git-svn-id: http://leveldb.googlecode.com/svn/trunk@38 62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_test.cc | 1 + db/dbformat.h | 2 +- db/filename.h | 2 +- db/log_format.h | 2 +- db/repair.cc | 5 +++++ db/skiplist_test.cc | 4 ++-- db/version_edit.cc | 2 +- db/version_set.cc | 11 ++++++++--- include/leveldb/options.h | 2 +- include/leveldb/status.h | 2 +- port/atomic_pointer.h | 5 +++-- table/table_test.cc | 9 +++++++-- util/comparator.cc | 1 + 13 files changed, 33 insertions(+), 15 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 0ac29e6..22fa70c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -42,6 +42,7 @@ class SpecialEnv : public EnvWrapper { : env_(env), base_(base) { } + ~SSTableFile() { delete base_; } Status Append(const Slice& data) { return base_->Append(data); } Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } diff --git a/db/dbformat.h b/db/dbformat.h index ec1d193..7344cbf 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -46,7 +46,7 @@ class InternalKey; // data structures. enum ValueType { kTypeDeletion = 0x0, - kTypeValue = 0x1, + kTypeValue = 0x1 }; // kValueTypeForSeek defines the ValueType that should be passed when // constructing a ParsedInternalKey object for seeking to a particular diff --git a/db/filename.h b/db/filename.h index 6a99744..e9ec8a7 100644 --- a/db/filename.h +++ b/db/filename.h @@ -24,7 +24,7 @@ enum FileType { kDescriptorFile, kCurrentFile, kTempFile, - kInfoLogFile, // Either the current one, or an old one + kInfoLogFile // Either the current one, or an old one }; // Return the name of the log file with the specified number diff --git a/db/log_format.h b/db/log_format.h index 137cd4a..353eff8 100644 --- a/db/log_format.h +++ b/db/log_format.h @@ -20,7 +20,7 @@ enum RecordType { // For fragments kFirstType = 2, kMiddleType = 3, - kLastType = 4, + kLastType = 4 }; static const int kMaxRecordType = kLastType; diff --git a/db/repair.cc b/db/repair.cc index ae1b136..2e3f506 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -50,6 +50,7 @@ class Repairer { icmp_(options.comparator), options_(SanitizeOptions(dbname, &icmp_, options)), owns_info_log_(options_.info_log != options.info_log), + owns_cache_(options_.block_cache != options.block_cache), next_file_number_(1) { // TableCache can be small since we expect each table to be opened once.
table_cache_ = new TableCache(dbname_, &options_, 10); @@ -60,6 +61,9 @@ class Repairer { if (owns_info_log_) { delete options_.info_log; } + if (owns_cache_) { + delete options_.block_cache; + } } Status Run() { @@ -97,6 +101,7 @@ class Repairer { InternalKeyComparator const icmp_; Options const options_; bool owns_info_log_; + bool owns_cache_; TableCache* table_cache_; VersionEdit edit_; diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index 5f9ec0d..2bd8d22 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -238,14 +238,14 @@ class ConcurrentTest { current = MakeKey(K, 0); } else { current = iter.key(); - ASSERT_TRUE(IsValidKey(current)) << std::hex << current; + ASSERT_TRUE(IsValidKey(current)) << current; } ASSERT_LE(pos, current) << "should not go backwards"; // Verify that everything in [pos,current) was not present in // initial_state. while (pos < current) { - ASSERT_LT(key(pos), K) << std::hex << pos; + ASSERT_LT(key(pos), K) << pos; // Note that generation 0 is never inserted, so it is ok if // <*,0,*> is missing. diff --git a/db/version_edit.cc b/db/version_edit.cc index 3941271..f6b9e9c 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -20,7 +20,7 @@ enum Tag { kDeletedFile = 6, kNewFile = 7, // 8 was used for large value refs - kPrevLogNumber = 9, + kPrevLogNumber = 9 }; void VersionEdit::Clear() { diff --git a/db/version_set.cc b/db/version_set.cc index 816f189..62bd6dd 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -434,9 +434,14 @@ class VersionSet::Builder { ~Builder() { for (int level = 0; level < config::kNumLevels; level++) { - std::vector to_unref(levels_[level].added_files->begin(), - levels_[level].added_files->end()); - delete levels_[level].added_files; + const FileSet* added = levels_[level].added_files; + std::vector to_unref; + to_unref.reserve(added->size()); + for (FileSet::const_iterator it = added->begin(); + it != added->end(); ++it) { + to_unref.push_back(*it); + } + delete added; for (int i = 0; i < to_unref.size(); i++) { FileMetaData* f = to_unref[i]; f->refs--; diff --git a/include/leveldb/options.h b/include/leveldb/options.h index a0afbf2..0d4f6cd 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -23,7 +23,7 @@ enum CompressionType { // NOTE: do not change the values of existing entries, as these are // part of the persistent format on disk. kNoCompression = 0x0, - kSnappyCompression = 0x1, + kSnappyCompression = 0x1 }; // Options to control the behavior of a database (passed to DB::Open) diff --git a/include/leveldb/status.h b/include/leveldb/status.h index 6796fdd..8fe4442 100644 --- a/include/leveldb/status.h +++ b/include/leveldb/status.h @@ -72,7 +72,7 @@ class Status { kCorruption = 2, kNotSupported = 3, kInvalidArgument = 4, - kIOError = 5, + kIOError = 5 }; Code code() const { diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h index 7659840..c618778 100644 --- a/port/atomic_pointer.h +++ b/port/atomic_pointer.h @@ -48,8 +48,9 @@ namespace port { // http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx #define LEVELDB_HAVE_MEMORY_BARRIER -// Gcc on x86 -#elif defined(__GNUC__) && defined(ARCH_CPU_X86_FAMILY) +// Gcc and Sun Studio on x86 +#elif defined(ARCH_CPU_X86_FAMILY) && \ + (defined(__GNUC__) || defined(__SUNPRO_CC)) inline void MemoryBarrier() { // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. 
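The empty asm in the atomic_pointer.h hunk above compiles to zero instructions; its only job is the "memory" clobber, which stops the compiler from caching values in registers or reordering memory accesses across it. On x86 that is enough for acquire/release pairs, since the hardware's TSO model already keeps loads ordered with loads and stores with stores; the change here is that Sun Studio also accepts this gcc-style asm. A minimal sketch of the effect (illustrative names, not part of the patch):

  // Without the fence the compiler may sink the payload store below the
  // flag store; with it, both stores are emitted in program order, and
  // x86's strong ordering preserves that order across CPUs.
  static int payload = 0;
  static int flag = 0;

  void PublishWithCompilerFence() {
    payload = 123;
    __asm__ __volatile__("" : : : "memory");  // compiler-only fence
    flag = 1;
  }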
diff --git a/table/table_test.cc b/table/table_test.cc index 10d08fc..c69a386 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -5,6 +5,7 @@ #include "leveldb/table.h" #include <map> +#include <string> #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" @@ -25,7 +26,11 @@ namespace leveldb { // Used to test non-lexicographic comparators. static std::string Reverse(const Slice& key) { std::string str(key.ToString()); - std::string rev(str.rbegin(), str.rend()); + std::string rev(""); + for (std::string::reverse_iterator rit = str.rbegin(); + rit != str.rend(); ++rit) { + rev.push_back(*rit); + } return rev; } @@ -411,7 +416,7 @@ enum TestType { TABLE_TEST, BLOCK_TEST, MEMTABLE_TEST, - DB_TEST, + DB_TEST }; struct TestArgs { diff --git a/util/comparator.cc b/util/comparator.cc index cc2b263..2d7544d 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include <algorithm> #include "leveldb/comparator.h" #include "leveldb/slice.h" -- cgit v1.2.3 From f65a55c8d0744b95be29a65d06b59b22b012f37b Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Thu, 21 Jul 2011 02:40:18 +0000 Subject: Speed up Snappy uncompression, new Logger interface. - Removed one copy of the uncompressed block contents by changing the signature of Snappy_Uncompress() so it uncompresses into a flat array instead of a std::string. Speeds up readrandom ~10%. - Instead of a combination of Env/WritableFile, we now have a Logger interface that can be easily overridden by applications that want to supply their own logging. - Separated out the gcc and Sun Studio parts of atomic_pointer.h so we can use 'asm', 'volatile' keywords for Sun Studio.
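Since the new Logger interface is the extension point applications are expected to override, a minimal sketch of doing so (assuming the Logv(format, va_list) signature this commit introduces; StderrLogger is an illustrative name, not part of the patch):

  #include <stdarg.h>
  #include <stdio.h>
  #include "leveldb/env.h"

  class StderrLogger : public leveldb::Logger {
   public:
    // The DB formats its messages printf-style and hands them here.
    virtual void Logv(const char* format, va_list ap) {
      vfprintf(stderr, format, ap);
      fprintf(stderr, "\n");
    }
  };

A caller would set options.info_log to a StderrLogger before DB::Open and delete it after closing the DB, since a user-supplied info_log remains owned by the caller rather than the DB.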
git-svn-id: http://leveldb.googlecode.com/svn/trunk@39 62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_bench.cc | 7 ++-- db/db_impl.cc | 45 +++++++++------------- db/repair.cc | 20 +++++----- db/version_set.cc | 4 +- include/leveldb/env.h | 29 +++++++++++--- include/leveldb/options.h | 6 +-- port/atomic_pointer.h | 14 +++++-- port/port_android.h | 8 +++- port/port_chromium.cc | 18 +++++---- port/port_chromium.h | 4 +- port/port_example.h | 12 +++++- port/port_posix.h | 26 +++++++------ table/format.cc | 20 ++++++---- util/env.cc | 15 +++++--- util/env_chromium.cc | 74 +++++++----------------------------- util/env_posix.cc | 74 ++++++------------------------------ util/posix_logger.h | 97 +++++++++++++++++++++++++++++++++++++++++++++++ 17 files changed, 261 insertions(+), 212 deletions(-) create mode 100644 util/posix_logger.h diff --git a/db/db_bench.cc b/db/db_bench.cc index 53b8c53..7b4e41a 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -472,13 +472,14 @@ class Benchmark { std::string compressed; bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed); int64_t bytes = 0; - std::string uncompressed; + char* uncompressed = new char[input.size()]; while (ok && bytes < 1024 * 1048576) { // Compress 1G ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), - &uncompressed); - bytes += uncompressed.size(); + uncompressed); + bytes += input.size(); FinishedSingleOp(); } + delete[] uncompressed; if (!ok) { message_ = "(snappy failure)"; diff --git a/db/db_impl.cc b/db/db_impl.cc index 48056da..5a0648e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -68,16 +68,6 @@ struct DBImpl::CompactionState { } }; -namespace { -class NullWritableFile : public WritableFile { - public: - virtual Status Append(const Slice& data) { return Status::OK(); } - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } -}; -} - // Fix user-supplied options to be reasonable template static void ClipToRange(T* ptr, V minvalue, V maxvalue) { @@ -96,11 +86,10 @@ Options SanitizeOptions(const std::string& dbname, // Open a log file in the same directory as the db src.env->CreateDir(dbname); // In case it does not exist src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname)); - Status s = src.env->NewWritableFile(InfoLogFileName(dbname), - &result.info_log); + Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log); if (!s.ok()) { // No place suitable for logging - result.info_log = new NullWritableFile; + result.info_log = NULL; } } if (result.block_cache == NULL) { @@ -201,7 +190,7 @@ void DBImpl::MaybeIgnoreError(Status* s) const { if (s->ok() || options_.paranoid_checks) { // No change needed } else { - Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str()); + Log(options_.info_log, "Ignoring error %s", s->ToString().c_str()); *s = Status::OK(); } } @@ -247,7 +236,7 @@ void DBImpl::DeleteObsoleteFiles() { if (type == kTableFile) { table_cache_->Evict(number); } - Log(env_, options_.info_log, "Delete type=%d #%lld\n", + Log(options_.info_log, "Delete type=%d #%lld\n", int(type), static_cast(number)); env_->DeleteFile(dbname_ + "/" + filenames[i]); @@ -336,11 +325,11 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence) { struct LogReporter : public log::Reader::Reporter { Env* env; - WritableFile* info_log; + Logger* info_log; const char* fname; Status* status; // NULL if options_.paranoid_checks==false virtual void 
Corruption(size_t bytes, const Status& s) { - Log(env, info_log, "%s%s: dropping %d bytes; %s", + Log(info_log, "%s%s: dropping %d bytes; %s", (this->status == NULL ? "(ignoring error) " : ""), fname, static_cast(bytes), s.ToString().c_str()); if (this->status != NULL && this->status->ok()) *this->status = s; @@ -370,7 +359,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, // large sequence numbers). log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/); - Log(env_, options_.info_log, "Recovering log #%llu", + Log(options_.info_log, "Recovering log #%llu", (unsigned long long) log_number); // Read all the records and add to a memtable @@ -434,7 +423,7 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, meta.number = versions_->NewFileNumber(); pending_outputs_.insert(meta.number); Iterator* iter = mem->NewIterator(); - Log(env_, options_.info_log, "Level-0 table #%llu: started", + Log(options_.info_log, "Level-0 table #%llu: started", (unsigned long long) meta.number); Status s; @@ -444,7 +433,7 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, mutex_.Lock(); } - Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s", + Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s", (unsigned long long) meta.number, (unsigned long long) meta.file_size, s.ToString().c_str()); @@ -613,7 +602,7 @@ void DBImpl::BackgroundCompaction() { f->smallest, f->largest); status = versions_->LogAndApply(c->edit()); VersionSet::LevelSummaryStorage tmp; - Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", + Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", static_cast(f->number), c->level() + 1, static_cast(f->file_size), @@ -631,7 +620,7 @@ void DBImpl::BackgroundCompaction() { } else if (shutting_down_.Acquire_Load()) { // Ignore compaction errors found during shutting down } else { - Log(env_, options_.info_log, + Log(options_.info_log, "Compaction error: %s", status.ToString().c_str()); if (options_.paranoid_checks && bg_error_.ok()) { bg_error_ = status; @@ -727,7 +716,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, s = iter->status(); delete iter; if (s.ok()) { - Log(env_, options_.info_log, + Log(options_.info_log, "Generated table #%llu: %lld keys, %lld bytes", (unsigned long long) output_number, (unsigned long long) current_entries, @@ -740,7 +729,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, Status DBImpl::InstallCompactionResults(CompactionState* compact) { mutex_.AssertHeld(); - Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", + Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), @@ -776,7 +765,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { const uint64_t start_micros = env_->NowMicros(); int64_t imm_micros = 0; // Micros spent doing imm_ compactions - Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files", + Log(options_.info_log, "Compacting %d@%d + %d@%d files", compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), @@ -859,7 +848,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { last_sequence_for_key = ikey.sequence; } #if 0 - Log(env_, options_.info_log, + Log(options_.info_log, " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " "%d smallest_snapshot: %d", 
ikey.user_key.ToString().c_str(), @@ -925,7 +914,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { status = InstallCompactionResults(compact); } VersionSet::LevelSummaryStorage tmp; - Log(env_, options_.info_log, + Log(options_.info_log, "compacted to: %s", versions_->LevelSummary(&tmp)); return status; } @@ -1112,7 +1101,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { bg_cv_.Wait(); } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) { // There are too many level-0 files. - Log(env_, options_.info_log, "waiting...\n"); + Log(options_.info_log, "waiting...\n"); bg_cv_.Wait(); } else { // Attempt to switch to a new memtable and trigger compaction of old diff --git a/db/repair.cc b/db/repair.cc index 2e3f506..5bcdb56 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -78,7 +78,7 @@ class Repairer { for (size_t i = 0; i < tables_.size(); i++) { bytes += tables_[i].meta.file_size; } - Log(env_, options_.info_log, + Log(options_.info_log, "**** Repaired leveldb %s; " "recovered %d files; %llu bytes. " "Some data may have been lost. " @@ -149,7 +149,7 @@ class Repairer { std::string logname = LogFileName(dbname_, logs_[i]); Status status = ConvertLogToTable(logs_[i]); if (!status.ok()) { - Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s", + Log(options_.info_log, "Log #%llu: ignoring conversion error: %s", (unsigned long long) logs_[i], status.ToString().c_str()); } @@ -160,11 +160,11 @@ class Repairer { Status ConvertLogToTable(uint64_t log) { struct LogReporter : public log::Reader::Reporter { Env* env; - WritableFile* info_log; + Logger* info_log; uint64_t lognum; virtual void Corruption(size_t bytes, const Status& s) { // We print error messages for corruption, but continue repairing. - Log(env, info_log, "Log #%llu: dropping %d bytes; %s", + Log(info_log, "Log #%llu: dropping %d bytes; %s", (unsigned long long) lognum, static_cast(bytes), s.ToString().c_str()); @@ -209,7 +209,7 @@ class Repairer { if (status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { - Log(env_, options_.info_log, "Log #%llu: ignoring %s", + Log(options_.info_log, "Log #%llu: ignoring %s", (unsigned long long) log, status.ToString().c_str()); status = Status::OK(); // Keep going with rest of file @@ -231,7 +231,7 @@ class Repairer { table_numbers_.push_back(meta.number); } } - Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", + Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", (unsigned long long) log, counter, (unsigned long long) meta.number, @@ -247,7 +247,7 @@ class Repairer { Status status = ScanTable(&t); if (!status.ok()) { std::string fname = TableFileName(dbname_, table_numbers_[i]); - Log(env_, options_.info_log, "Table #%llu: ignoring %s", + Log(options_.info_log, "Table #%llu: ignoring %s", (unsigned long long) table_numbers_[i], status.ToString().c_str()); ArchiveFile(fname); @@ -270,7 +270,7 @@ class Repairer { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); if (!ParseInternalKey(key, &parsed)) { - Log(env_, options_.info_log, "Table #%llu: unparsable key %s", + Log(options_.info_log, "Table #%llu: unparsable key %s", (unsigned long long) t->meta.number, EscapeString(key).c_str()); continue; @@ -291,7 +291,7 @@ class Repairer { } delete iter; } - Log(env_, options_.info_log, "Table #%llu: %d entries %s", + Log(options_.info_log, "Table #%llu: %d entries %s", (unsigned long long) t->meta.number, counter, status.ToString().c_str()); @@ -373,7 +373,7 @@ class 
Repairer { new_file.append("/"); new_file.append((slash == NULL) ? fname.c_str() : slash + 1); Status s = env_->RenameFile(fname, new_file); - Log(env_, options_.info_log, "Archiving %s: %s\n", + Log(options_.info_log, "Archiving %s: %s\n", fname.c_str(), s.ToString().c_str()); } }; diff --git a/db/version_set.cc b/db/version_set.cc index 62bd6dd..5040b72 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1124,7 +1124,7 @@ void VersionSet::SetupOtherInputs(Compaction* c) { std::vector expanded1; GetOverlappingInputs(level+1, new_start, new_limit, &expanded1); if (expanded1.size() == c->inputs_[1].size()) { - Log(env_, options_->info_log, + Log(options_->info_log, "Expanding@%d %d+%d to %d+%d\n", level, int(c->inputs_[0].size()), @@ -1147,7 +1147,7 @@ void VersionSet::SetupOtherInputs(Compaction* c) { } if (false) { - Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'", + Log(options_->info_log, "Compacting %d '%s' .. '%s'", level, EscapeString(smallest.Encode()).c_str(), EscapeString(largest.Encode()).c_str()); diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 39f6a1a..bf51008 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -22,6 +22,7 @@ namespace leveldb { class FileLock; +class Logger; class RandomAccessFile; class SequentialFile; class Slice; @@ -134,8 +135,8 @@ class Env { // same directory. virtual Status GetTestDirectory(std::string* path) = 0; - // Write an entry to the log file with the specified format. - virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0; + // Create and return a log file for storing informational messages. + virtual Status NewLogger(const std::string& fname, Logger** result) = 0; // Returns the number of micro-seconds since some fixed point in time. Only // useful for computing deltas of time. @@ -210,6 +211,22 @@ class WritableFile { void operator=(const WritableFile&); }; +// An interface for writing log messages. +class Logger { + public: + Logger() { } + virtual ~Logger(); + + // Write an entry to the log file with the specified format. + virtual void Logv(const char* format, va_list ap) = 0; + + private: + // No copying allowed + Logger(const Logger&); + void operator=(const Logger&); +}; + + // Identifies a locked file. class FileLock { public: @@ -222,9 +239,9 @@ class FileLock { }; // Log the specified data to *info_log if info_log is non-NULL. -extern void Log(Env* env, WritableFile* info_log, const char* format, ...) +extern void Log(Logger* info_log, const char* format, ...) # if defined(__GNUC__) || defined(__clang__) - __attribute__((__format__ (__printf__, 3, 4))) + __attribute__((__format__ (__printf__, 2, 3))) # endif ; @@ -284,8 +301,8 @@ class EnvWrapper : public Env { virtual Status GetTestDirectory(std::string* path) { return target_->GetTestDirectory(path); } - virtual void Logv(WritableFile* log, const char* format, va_list ap) { - return target_->Logv(log, format, ap); + virtual Status NewLogger(const std::string& fname, Logger** result) { + return target_->NewLogger(fname, result); } uint64_t NowMicros() { return target_->NowMicros(); diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 0d4f6cd..381f228 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -12,8 +12,8 @@ namespace leveldb { class Cache; class Comparator; class Env; +class Logger; class Snapshot; -class WritableFile; // DB contents are stored in a set of blocks, each of which holds a // sequence of key,value pairs. 
Each block may be compressed before
@@ -61,10 +61,10 @@ struct Options {
   Env* env;
 
   // Any internal progress/error information generated by the db will
-  // be to written to info_log if it is non-NULL, or to a file stored
+  // be written to info_log if it is non-NULL, or to a file stored
   // in the same directory as the DB contents if info_log is NULL.
   // Default: NULL
-  WritableFile* info_log;
+  Logger* info_log;
 
   // -------------------
   // Parameters that affect performance
diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h
index c618778..c20b1bd 100644
--- a/port/atomic_pointer.h
+++ b/port/atomic_pointer.h
@@ -48,9 +48,8 @@ namespace port {
 // http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx
 #define LEVELDB_HAVE_MEMORY_BARRIER
 
-// Gcc and Sun Studio on x86
-#elif defined(ARCH_CPU_X86_FAMILY) && \
-    (defined(__GNUC__) || defined(__SUNPRO_CC))
+// Gcc on x86
+#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__)
 inline void MemoryBarrier() {
   // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
   // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
@@ -58,6 +57,15 @@ inline void MemoryBarrier() {
 }
 #define LEVELDB_HAVE_MEMORY_BARRIER
 
+// Sun Studio
+#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC)
+inline void MemoryBarrier() {
+  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
+  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
+  asm volatile("" : : : "memory");
+}
+#define LEVELDB_HAVE_MEMORY_BARRIER
+
 // Mac OS
 #elif defined(OS_MACOSX)
 inline void MemoryBarrier() {
diff --git a/port/port_android.h b/port/port_android.h
index 13df9c9..d68b6c0 100644
--- a/port/port_android.h
+++ b/port/port_android.h
@@ -125,11 +125,17 @@ inline bool Snappy_Compress(
   return false;
 }
 
+// TODO(gabor): Implement uncompress
+inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                         size_t* result) {
+  return false;
+}
+
 // TODO(gabor): Implement uncompress
 inline bool Snappy_Uncompress(
     const char* input_data,
     size_t input_length,
-    std::string* output) {
+    char* output) {
   return false;
 }
diff --git a/port/port_chromium.cc b/port/port_chromium.cc
index 2ab49b9..7f6de92 100644
--- a/port/port_chromium.cc
+++ b/port/port_chromium.cc
@@ -62,15 +62,19 @@ bool Snappy_Compress(const char* input, size_t input_length,
 #endif
 }
 
+bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                  size_t* result) {
+#if defined(USE_SNAPPY)
+  return snappy::GetUncompressedLength(input, length, result);
+#else
+  return false;
+#endif
+}
+
 bool Snappy_Uncompress(const char* input_data, size_t input_length,
-                       std::string* output) {
+                       char* output) {
 #if defined(USE_SNAPPY)
-  size_t ulength;
-  if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) {
-    return false;
-  }
-  output->resize(ulength);
-  return snappy::RawUncompress(input_data, input_length, &(*output)[0]);
+  return snappy::RawUncompress(input_data, input_length, output);
 #else
   return false;
 #endif
diff --git a/port/port_chromium.h b/port/port_chromium.h
index 1851e6e..feecd5b 100644
--- a/port/port_chromium.h
+++ b/port/port_chromium.h
@@ -84,8 +84,10 @@ class AtomicPointer {
 
 bool Snappy_Compress(const char* input, size_t input_length,
                      std::string* output);
+bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                  size_t* result);
 bool Snappy_Uncompress(const char* input_data, size_t input_length,
-                       std::string* output);
+                       char* output);
 
 inline bool GetHeapProfile(void (*func)(void*, const
char*, int), void* arg) { return false; diff --git a/port/port_example.h b/port/port_example.h index 8a624f3..6bd9b49 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -96,11 +96,21 @@ class AtomicPointer { extern bool Snappy_Compress(const char* input, size_t input_length, std::string* output); +// If input[0,input_length-1] looks like a valid snappy compressed +// buffer, store the size of the uncompressed data in *result and +// return true. Else return false. +extern bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result); + // Attempt to snappy uncompress input[0,input_length-1] into *output. // Returns true if successful, false if the input is invalid lightweight // compressed data. +// +// REQUIRES: at least the first "n" bytes of output[] must be writable +// where "n" is the result of a successful call to +// Snappy_GetUncompressedLength. extern bool Snappy_Uncompress(const char* input_data, size_t input_length, - std::string* output); + char* output); // ------------------ Miscellaneous ------------------- diff --git a/port/port_posix.h b/port/port_posix.h index 2995026..ef01de3 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -80,12 +80,12 @@ class CondVar { Mutex* mu_; }; -inline bool Snappy_Compress(const char* input, size_t input_length, +inline bool Snappy_Compress(const char* input, size_t length, ::std::string* output) { #ifdef SNAPPY - output->resize(snappy::MaxCompressedLength(input_length)); + output->resize(snappy::MaxCompressedLength(length)); size_t outlen; - snappy::RawCompress(input, input_length, &(*output)[0], &outlen); + snappy::RawCompress(input, length, &(*output)[0], &outlen); output->resize(outlen); return true; #endif @@ -93,18 +93,22 @@ inline bool Snappy_Compress(const char* input, size_t input_length, return false; } -inline bool Snappy_Uncompress(const char* input_data, size_t input_length, - ::std::string* output) { +inline bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result) { #ifdef SNAPPY - size_t ulength; - if (!snappy::GetUncompressedLength(input_data, input_length, &ulength)) { - return false; - } - output->resize(ulength); - return snappy::RawUncompress(input_data, input_length, &(*output)[0]); + return snappy::GetUncompressedLength(input, length, result); +#else + return false; #endif +} +inline bool Snappy_Uncompress(const char* input, size_t length, + char* output) { +#ifdef SNAPPY + return snappy::RawUncompress(input, length, output); +#else return false; +#endif } inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { diff --git a/table/format.cc b/table/format.cc index 63971db..ba7838c 100644 --- a/table/format.cc +++ b/table/format.cc @@ -107,16 +107,20 @@ Status ReadBlock(RandomAccessFile* file, // Ok break; case kSnappyCompression: { - std::string decompressed; - if (!port::Snappy_Uncompress(data, n, &decompressed)) { + size_t ulength = 0; + if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { delete[] buf; - s = Status::Corruption("corrupted compressed block contents"); - return s; + return Status::Corruption("corrupted compressed block contents"); } - delete[] buf; // Done with uncompressed data - buf = new char[decompressed.size()]; - memcpy(buf, decompressed.data(), decompressed.size()); - n = decompressed.size(); + char* ubuf = new char[ulength]; + if (!port::Snappy_Uncompress(data, n, ubuf)) { + delete[] buf; + delete[] ubuf; + return Status::Corruption("corrupted compressed block contents"); + } + delete[] buf; + 
buf = ubuf; + n = ulength; break; } default: diff --git a/util/env.cc b/util/env.cc index e5297e7..79e493e 100644 --- a/util/env.cc +++ b/util/env.cc @@ -18,14 +18,19 @@ RandomAccessFile::~RandomAccessFile() { WritableFile::~WritableFile() { } +Logger::~Logger() { +} + FileLock::~FileLock() { } -void Log(Env* env, WritableFile* info_log, const char* format, ...) { - va_list ap; - va_start(ap, format); - env->Logv(info_log, format, ap); - va_end(ap); +void Log(Logger* info_log, const char* format, ...) { + if (info_log != NULL) { + va_list ap; + va_start(ap, format); + info_log->Logv(format, ap); + va_end(ap); + } } Status WriteStringToFile(Env* env, const Slice& data, diff --git a/util/env_chromium.cc b/util/env_chromium.cc index 1af525a..975386b 100644 --- a/util/env_chromium.cc +++ b/util/env_chromium.cc @@ -23,6 +23,7 @@ #include "leveldb/slice.h" #include "port/port.h" #include "util/logging.h" +#include "util/posix_logger.h" #if defined(OS_WIN) #include @@ -406,9 +407,8 @@ class ChromiumEnv : public Env { return Status::OK(); } - virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { - // TODO(jorlow): We may want to just use Chromium's built in logging. - + // TODO(user,user): Use Chromium's built-in logging? + static uint64_t gettid() { uint64_t thread_id = 0; // Coppied from base/logging.cc. #if defined(OS_WIN) @@ -422,65 +422,17 @@ class ChromiumEnv : public Env { pthread_t tid = pthread_self(); memcpy(&thread_id, &tid, min(sizeof(r), sizeof(tid))); #endif + return thread_id; + } - // We try twice: the first time with a fixed-size stack allocated buffer, - // and the second time with a much larger dynamically allocated buffer. - char buffer[500]; - for (int iter = 0; iter < 2; iter++) { - char* base; - int bufsize; - if (iter == 0) { - bufsize = sizeof(buffer); - base = buffer; - } else { - bufsize = 30000; - base = new char[bufsize]; - } - char* p = base; - char* limit = base + bufsize; - - ::base::Time::Exploded t; - ::base::Time::Now().LocalExplode(&t); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - t.year, - t.month, - t.day_of_month, - t.hour, - t.minute, - t.second, - static_cast(t.millisecond) * 1000, - static_cast(thread_id)); - - // Print the message - if (p < limit) { - va_list backup_ap; - va_copy(backup_ap, ap); - p += vsnprintf(p, limit - p, format, backup_ap); - va_end(backup_ap); - } - - // Truncate to available space if necessary - if (p >= limit) { - if (iter == 0) { - continue; // Try again with larger buffer - } else { - p = limit - 1; - } - } - - // Add newline if necessary - if (p == base || p[-1] != '\n') { - *p++ = '\n'; - } - - assert(p <= limit); - info_log->Append(Slice(base, p - base)); - info_log->Flush(); - if (base != buffer) { - delete[] base; - } - break; + virtual Status NewLogger(const std::string& fname, Logger** result) { + FILE* f = fopen(fname.c_str(), "w"); + if (f == NULL) { + *result = NULL; + return Status::IOError(fname, strerror(errno)); + } else { + *result = new PosixLogger(f, &ChromiumEnv::gettid); + return Status::OK(); } } diff --git a/util/env_posix.cc b/util/env_posix.cc index 46723e2..5127c89 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -23,6 +23,7 @@ #include "leveldb/slice.h" #include "port/port.h" #include "util/logging.h" +#include "util/posix_logger.h" namespace leveldb { @@ -427,72 +428,21 @@ class PosixEnv : public Env { return Status::OK(); } - virtual void Logv(WritableFile* info_log, const char* format, va_list ap) { + static uint64_t gettid() { pthread_t 
tid = pthread_self(); uint64_t thread_id = 0; memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); + return thread_id; + } - // We try twice: the first time with a fixed-size stack allocated buffer, - // and the second time with a much larger dynamically allocated buffer. - char buffer[500]; - for (int iter = 0; iter < 2; iter++) { - char* base; - int bufsize; - if (iter == 0) { - bufsize = sizeof(buffer); - base = buffer; - } else { - bufsize = 30000; - base = new char[bufsize]; - } - char* p = base; - char* limit = base + bufsize; - - struct timeval now_tv; - gettimeofday(&now_tv, NULL); - const time_t seconds = now_tv.tv_sec; - struct tm t; - localtime_r(&seconds, &t); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec, - static_cast(now_tv.tv_usec), - static_cast(thread_id)); - - // Print the message - if (p < limit) { - va_list backup_ap; - va_copy(backup_ap, ap); - p += vsnprintf(p, limit - p, format, backup_ap); - va_end(backup_ap); - } - - // Truncate to available space if necessary - if (p >= limit) { - if (iter == 0) { - continue; // Try again with larger buffer - } else { - p = limit - 1; - } - } - - // Add newline if necessary - if (p == base || p[-1] != '\n') { - *p++ = '\n'; - } - - assert(p <= limit); - info_log->Append(Slice(base, p - base)); - info_log->Flush(); - if (base != buffer) { - delete[] base; - } - break; + virtual Status NewLogger(const std::string& fname, Logger** result) { + FILE* f = fopen(fname.c_str(), "w"); + if (f == NULL) { + *result = NULL; + return IOError(fname, errno); + } else { + *result = new PosixLogger(f, &PosixEnv::gettid); + return Status::OK(); } } diff --git a/util/posix_logger.h b/util/posix_logger.h new file mode 100644 index 0000000..0dbdeaa --- /dev/null +++ b/util/posix_logger.h @@ -0,0 +1,97 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// Author: sanjay@google.com (Sanjay Ghemawat) +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#ifndef STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ +#define STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ + +#include +#include +#include +#include +#include "leveldb/env.h" + +namespace leveldb { + +class PosixLogger : public Logger { + private: + FILE* file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + public: + PosixLogger(FILE* f, uint64_t (*gettid)()) : file_(f), gettid_(gettid) { } + virtual ~PosixLogger() { + fclose(file_); + } + virtual void Logv(const char* format, va_list ap) { + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+ char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, NULL); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + fwrite(base, 1, p - base, file_); + fflush(file_); + if (base != buffer) { + delete[] base; + } + break; + } + } +}; + +} + +#endif // STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ -- cgit v1.2.3 From fcd2d5698e5723d926ddb8451830ccaf55126dd5 Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Wed, 27 Jul 2011 01:46:25 +0000 Subject: Adding FreeBSD support, removing Chromium files, adding benchmark. - LevelDB patch for FreeBSD. This resolves Issue 22. Contributed by dforsythe (thanks!). - Removing Chromium-specific files. They are now going to live in the Chromium repository. - Adding a benchmark page comparing LevelDB performance to SQLite and Kyoto Cabinet's TreeDB, along with code to generate the benchmarks. Thanks to Kevin Tseng for compiling the benchmarks, and Scott Hess and Mikio Hirabayashi for their help and advice. 
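Both new benchmark drivers parse their command line with the same sscanf idiom seen in their main() functions below. As a condensed, self-contained sketch (a hypothetical one-flag example, not code from the patch):

#include <stdio.h>
#include <stdlib.h>

static int FLAGS_num = 1000000;

int main(int argc, char** argv) {
  for (int i = 1; i < argc; i++) {
    int n;
    char junk;
    // The trailing %c can only match if there is garbage after the
    // integer, so a return value of exactly 1 means a clean parse
    // ("--num=10x" is rejected, "--num=10" is accepted).
    if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
      FLAGS_num = n;
    } else {
      fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
      exit(1);
    }
  }
  printf("num=%d\n", FLAGS_num);
  return 0;
}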
git-svn-id: http://leveldb.googlecode.com/svn/trunk@40 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 9 +- build_detect_platform | 5 + doc/bench/db_bench_sqlite3.cc | 682 ++++++++++++++++++++++++++++++++++++++++++ doc/bench/db_bench_tree_db.cc | 506 +++++++++++++++++++++++++++++++ doc/benchmark.html | 466 +++++++++++++++++++++++++++++ leveldb.gyp | 325 -------------------- port/port_chromium.cc | 84 ------ port/port_chromium.h | 99 ------ port/port_posix.h | 6 +- util/env_chromium.cc | 564 ---------------------------------- 10 files changed, 1670 insertions(+), 1076 deletions(-) create mode 100644 doc/bench/db_bench_sqlite3.cc create mode 100644 doc/bench/db_bench_tree_db.cc create mode 100644 doc/benchmark.html delete mode 100644 leveldb.gyp delete mode 100644 port/port_chromium.cc delete mode 100644 port/port_chromium.h delete mode 100644 util/env_chromium.cc diff --git a/Makefile b/Makefile index ca0dabc..d198d9d 100644 --- a/Makefile +++ b/Makefile @@ -97,6 +97,7 @@ TESTS = \ write_batch_test PROGRAMS = db_bench $(TESTS) +BENCHMARKS = db_bench_sqlite3 db_bench_tree_db LIBRARY = libleveldb.a @@ -106,7 +107,7 @@ check: $(PROGRAMS) $(TESTS) for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done clean: - -rm -f $(PROGRAMS) $(LIBRARY) */*.o ios-x86/*/*.o ios-arm/*/*.o + -rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) */*.o */*/*.o ios-x86/*/*.o ios-arm/*/*.o -rm -rf ios-x86/* ios-arm/* -rm build_config.mk @@ -117,6 +118,12 @@ $(LIBRARY): $(LIBOBJECTS) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ +db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) + $(CC) $(LDFLAGS) -lsqlite3 doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + +db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) + $(CC) $(LDFLAGS) -lkyotocabinet doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ diff --git a/build_detect_platform b/build_detect_platform index bb108f7..d1804e0 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -30,6 +30,11 @@ case `uname -s` in echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_SOLARIS" >> build_config.mk echo "PLATFORM_LDFLAGS=-lpthread -lrt" >> build_config.mk ;; + FreeBSD) + PLATFORM=OS_FREEBSD + echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_FREEBSD" >> build_config.mk + echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk + ;; *) echo "Unknown platform!" exit 1 diff --git a/doc/bench/db_bench_sqlite3.cc b/doc/bench/db_bench_sqlite3.cc new file mode 100644 index 0000000..a6f9a75 --- /dev/null +++ b/doc/bench/db_bench_sqlite3.cc @@ -0,0 +1,682 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include +#include "util/histogram.h" +#include "util/random.h" +#include "util/testutil.h" + +// Comma-separated list of operations to run in the specified order +// Actual benchmarks: +// +// fillseq -- write N values in sequential key order in async mode +// fillseqsync -- write N/100 values in sequential key order in sync mode +// fillseqbatch -- batch write N values in sequential key order in async mode +// fillrandom -- write N values in random key order in async mode +// fillrandsync -- write N/100 values in random key order in sync mode +// fillrandbatch -- batch write N values in sequential key order in async mode +// overwrite -- overwrite N values in random key order in async mode +// fillrand100K -- write N/1000 100K values in random order in async mode +// fillseq100K -- write N/1000 100K values in sequential order in async mode +// readseq -- read N times sequentially +// readrandom -- read N times in random order +// readseq100K -- read N/1000 100K values in sequential order in async mode +// readrand100K -- read N/1000 100K values in sequential order in async mode +static const char* FLAGS_benchmarks = + "fillseq," + "fillseqsync," + "fillseqbatch," + "fillrandom," + "fillrandsync," + "fillrandbatch," + "overwrite," + "overwritebatch," + "readrandom," + "readseq," + "fillrand100K," + "fillseq100K," + "readseq100K," + "readrand100K," + ; + +// Number of key/values to place in database +static int FLAGS_num = 1000000; + +// Number of read operations to do. If negative, do FLAGS_num reads. +static int FLAGS_reads = -1; + +// Size of each value +static int FLAGS_value_size = 100; + +// Print histogram of operation timings +static bool FLAGS_histogram = false; + +// Arrange to generate values that shrink to this fraction of +// their original size after compression +static double FLAGS_compression_ratio = 0.5; + +// Page size. Default 1 KB. +static int FLAGS_page_size = 1024; + +// Number of pages. +// Default cache size = FLAGS_page_size * FLAGS_num_pages = 4 MB. +static int FLAGS_num_pages = 4096; + +// If true, do not destroy the existing database. If you set this +// flag and also specify a benchmark that wants a fresh database, that +// benchmark will fail. +static bool FLAGS_use_existing_db = false; + +// If true, we allow batch writes to occur +static bool FLAGS_transaction = true; + +// If true, we enable Write-Ahead Logging +static bool FLAGS_WAL_enabled = false; + +inline +static void ExecErrorCheck(int status, char *err_msg) { + if (status != SQLITE_OK) { + fprintf(stderr, "SQL error: %s\n", err_msg); + sqlite3_free(err_msg); + exit(1); + } +} + +inline +static void StepErrorCheck(int status) { + if (status != SQLITE_DONE) { + fprintf(stderr, "SQL step error: status = %d\n", status); + exit(1); + } +} + +inline +static void ErrorCheck(int status) { + if (status != SQLITE_OK) { + fprintf(stderr, "sqlite3 error: status = %d\n", status); + exit(1); + } +} + +inline +static void WalCheckpoint(sqlite3* db_) { + // Flush all writes to disk + if (FLAGS_WAL_enabled) { + sqlite3_wal_checkpoint_v2(db_, NULL, SQLITE_CHECKPOINT_FULL, NULL, NULL); + } +} + +namespace leveldb { + +// Helper for quickly generating random data. +namespace { +class RandomGenerator { + private: + std::string data_; + int pos_; + + public: + RandomGenerator() { + // We use a limited amount of data over and over again and ensure + // that it is larger than the compression window (32KB), and also + // large enough to serve all typical value sizes we want to write. 
+ Random rnd(301); + std::string piece; + while (data_.size() < 1048576) { + // Add a short fragment that is as compressible as specified + // by FLAGS_compression_ratio. + test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); + data_.append(piece); + } + pos_ = 0; + } + + Slice Generate(int len) { + if (pos_ + len > data_.size()) { + pos_ = 0; + assert(len < data_.size()); + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } +}; + +static Slice TrimSpace(Slice s) { + int start = 0; + while (start < s.size() && isspace(s[start])) { + start++; + } + int limit = s.size(); + while (limit > start && isspace(s[limit-1])) { + limit--; + } + return Slice(s.data() + start, limit - start); +} + +} + +class Benchmark { + private: + sqlite3* db_; + int db_num_; + int num_; + int reads_; + double start_; + double last_op_finish_; + int64_t bytes_; + std::string message_; + Histogram hist_; + RandomGenerator gen_; + Random rand_; + + // State kept for progress messages + int done_; + int next_report_; // When to report next + + void PrintHeader() { + const int kKeySize = 16; + PrintEnvironment(); + fprintf(stdout, "Keys: %d bytes each\n", kKeySize); + fprintf(stdout, "Values: %d bytes each\n", FLAGS_value_size); + fprintf(stdout, "Entries: %d\n", num_); + fprintf(stdout, "RawSize: %.1f MB (estimated)\n", + ((static_cast(kKeySize + FLAGS_value_size) * num_) + / 1048576.0)); + PrintWarnings(); + fprintf(stdout, "------------------------------------------------\n"); + } + + void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" + ); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + } + + void PrintEnvironment() { + fprintf(stderr, "SQLite: version %s\n", SQLITE_VERSION); + +#if defined(__linux) + time_t now = time(NULL); + fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline + + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != NULL) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != NULL) { + const char* sep = strchr(line, ':'); + if (sep == NULL) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#endif + } + + void Start() { + start_ = Env::Default()->NowMicros() * 1e-6; + bytes_ = 0; + message_.clear(); + last_op_finish_ = start_; + hist_.Clear(); + done_ = 0; + next_report_ = 100; + } + + void FinishedSingleOp() { + if (FLAGS_histogram) { + double now = Env::Default()->NowMicros() * 1e-6; + double micros = (now - last_op_finish_) * 1e6; + hist_.Add(micros); + if (micros > 20000) { + fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); + fflush(stderr); + } + last_op_finish_ = now; + } + + done_++; + if (done_ >= next_report_) { + if (next_report_ < 1000) next_report_ += 100; + else if (next_report_ < 5000) next_report_ += 500; + else if (next_report_ < 10000) next_report_ += 1000; + else if (next_report_ < 50000) next_report_ += 5000; + else if (next_report_ < 100000) next_report_ += 10000; + else if (next_report_ < 500000) 
next_report_ += 50000; + else next_report_ += 100000; + fprintf(stderr, "... finished %d ops%30s\r", done_, ""); + fflush(stderr); + } + } + + void Stop(const Slice& name) { + double finish = Env::Default()->NowMicros() * 1e-6; + + // Pretend at least one op was done in case we are running a benchmark + // that does not call FinishedSingleOp(). + if (done_ < 1) done_ = 1; + + if (bytes_ > 0) { + char rate[100]; + snprintf(rate, sizeof(rate), "%6.1f MB/s", + (bytes_ / 1048576.0) / (finish - start_)); + if (!message_.empty()) { + message_ = std::string(rate) + " " + message_; + } else { + message_ = rate; + } + } + + fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", + name.ToString().c_str(), + (finish - start_) * 1e6 / done_, + (message_.empty() ? "" : " "), + message_.c_str()); + if (FLAGS_histogram) { + fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); + } + fflush(stdout); + } + + public: + enum Order { + SEQUENTIAL, + RANDOM + }; + enum DBState { + FRESH, + EXISTING + }; + + Benchmark() + : db_(NULL), + db_num_(0), + num_(FLAGS_num), + reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads), + bytes_(0), + rand_(301) { + std::vector files; + Env::Default()->GetChildren("/tmp", &files); + if (!FLAGS_use_existing_db) { + for (int i = 0; i < files.size(); i++) { + if (Slice(files[i]).starts_with("dbbench_sqlite3")) { + Env::Default()->DeleteFile("/tmp/" + files[i]); + } + } + } + } + + ~Benchmark() { + int status = sqlite3_close(db_); + ErrorCheck(status); + } + + void Run() { + PrintHeader(); + Open(); + + const char* benchmarks = FLAGS_benchmarks; + while (benchmarks != NULL) { + const char* sep = strchr(benchmarks, ','); + Slice name; + if (sep == NULL) { + name = benchmarks; + benchmarks = NULL; + } else { + name = Slice(benchmarks, sep - benchmarks); + benchmarks = sep + 1; + } + + bytes_ = 0; + Start(); + + bool known = true; + bool write_sync = false; + if (name == Slice("fillseq")) { + Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1); + WalCheckpoint(db_); + } else if (name == Slice("fillseqbatch")) { + Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000); + WalCheckpoint(db_); + } else if (name == Slice("fillrandom")) { + Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1); + WalCheckpoint(db_); + } else if (name == Slice("fillrandbatch")) { + Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1000); + WalCheckpoint(db_); + } else if (name == Slice("overwrite")) { + Write(write_sync, RANDOM, EXISTING, num_, FLAGS_value_size, 1); + WalCheckpoint(db_); + } else if (name == Slice("overwritebatch")) { + Write(write_sync, RANDOM, EXISTING, num_, FLAGS_value_size, 1000); + WalCheckpoint(db_); + } else if (name == Slice("fillrandsync")) { + write_sync = true; + Write(write_sync, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1); + WalCheckpoint(db_); + } else if (name == Slice("fillseqsync")) { + write_sync = true; + Write(write_sync, SEQUENTIAL, FRESH, num_ / 100, FLAGS_value_size, 1); + WalCheckpoint(db_); + } else if (name == Slice("fillrand100K")) { + Write(write_sync, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1); + WalCheckpoint(db_); + } else if (name == Slice("fillseq100K")) { + Write(write_sync, SEQUENTIAL, FRESH, num_ / 1000, 100 * 1000, 1); + WalCheckpoint(db_); + } else if (name == Slice("readseq")) { + Read(SEQUENTIAL, 1); + } else if (name == Slice("readrandom")) { + Read(RANDOM, 1); + } else if (name == Slice("readrand100K")) { + int n = reads_; + reads_ /= 1000; + Read(RANDOM, 1); + reads_ = n; + } else if (name == 
Slice("readseq100K")) { + int n = reads_; + reads_ /= 1000; + Read(SEQUENTIAL, 1); + reads_ = n; + } else { + known = false; + if (name != Slice()) { // No error message for empty name + fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); + } + } + if (known) { + Stop(name); + } + } + } + + void Open() { + assert(db_ == NULL); + + int status; + char file_name[100]; + char* err_msg = NULL; + db_num_++; + + // Open database + snprintf(file_name, sizeof(file_name), "/tmp/dbbench_sqlite3-%d.db", + db_num_); + status = sqlite3_open(file_name, &db_); + if (status) { + fprintf(stderr, "open error: %s\n", sqlite3_errmsg(db_)); + exit(1); + } + + // Change SQLite cache size + char cache_size[100]; + snprintf(cache_size, sizeof(cache_size), "PRAGMA cache_size = %d", + FLAGS_num_pages); + status = sqlite3_exec(db_, cache_size, NULL, NULL, &err_msg); + ExecErrorCheck(status, err_msg); + + // FLAGS_page_size is defaulted to 1024 + if (FLAGS_page_size != 1024) { + char page_size[100]; + snprintf(page_size, sizeof(page_size), "PRAGMA page_size = %d", + FLAGS_page_size); + status = sqlite3_exec(db_, page_size, NULL, NULL, &err_msg); + ExecErrorCheck(status, err_msg); + } + + // Change journal mode to WAL if WAL enabled flag is on + if (FLAGS_WAL_enabled) { + std::string WAL_stmt = "PRAGMA journal_mode = WAL"; + status = sqlite3_exec(db_, WAL_stmt.c_str(), NULL, NULL, &err_msg); + ExecErrorCheck(status, err_msg); + } + + // Change locking mode to exclusive and create tables/index for database + std::string locking_stmt = "PRAGMA locking_mode = EXCLUSIVE"; + std::string create_stmt = + "CREATE TABLE test (key blob, value blob, PRIMARY KEY(key))"; + std::string index_stmt = "CREATE INDEX keyindex ON test (key)"; + std::string stmt_array[] = { locking_stmt, create_stmt, index_stmt }; + int stmt_array_length = sizeof(stmt_array) / sizeof(std::string); + for (int i = 0; i < stmt_array_length; i++) { + status = sqlite3_exec(db_, stmt_array[i].c_str(), NULL, NULL, &err_msg); + ExecErrorCheck(status, err_msg); + } + } + + void Write(bool write_sync, Order order, DBState state, + int num_entries, int value_size, int entries_per_batch) { + // Create new database if state == FRESH + if (state == FRESH) { + if (FLAGS_use_existing_db) { + message_ = "skipping (--use_existing_db is true)"; + return; + } + sqlite3_close(db_); + db_ = NULL; + Open(); + Start(); + } + + if (num_entries != num_) { + char msg[100]; + snprintf(msg, sizeof(msg), "(%d ops)", num_entries); + message_ = msg; + } + + char* err_msg = NULL; + int status; + + sqlite3_stmt *replace_stmt, *begin_trans_stmt, *end_trans_stmt; + std::string replace_str = "REPLACE INTO test (key, value) VALUES (?, ?)"; + std::string begin_trans_str = "BEGIN TRANSACTION;"; + std::string end_trans_str = "END TRANSACTION;"; + + // Check for synchronous flag in options + std::string sync_stmt = (write_sync) ? 
"PRAGMA synchronous = FULL" : + "PRAGMA synchronous = OFF"; + status = sqlite3_exec(db_, sync_stmt.c_str(), NULL, NULL, &err_msg); + ExecErrorCheck(status, err_msg); + + // Preparing sqlite3 statements + status = sqlite3_prepare_v2(db_, replace_str.c_str(), -1, + &replace_stmt, NULL); + ErrorCheck(status); + status = sqlite3_prepare_v2(db_, begin_trans_str.c_str(), -1, + &begin_trans_stmt, NULL); + ErrorCheck(status); + status = sqlite3_prepare_v2(db_, end_trans_str.c_str(), -1, + &end_trans_stmt, NULL); + ErrorCheck(status); + + bool transaction = (entries_per_batch > 1); + for (int i = 0; i < num_entries; i += entries_per_batch) { + // Begin write transaction + if (FLAGS_transaction && transaction) { + status = sqlite3_step(begin_trans_stmt); + StepErrorCheck(status); + status = sqlite3_reset(begin_trans_stmt); + ErrorCheck(status); + } + + // Create and execute SQL statements + for (int j = 0; j < entries_per_batch; j++) { + const char* value = gen_.Generate(value_size).data(); + + // Create values for key-value pair + const int k = (order == SEQUENTIAL) ? i + j : + (rand_.Next() % num_entries); + char key[100]; + snprintf(key, sizeof(key), "%016d", k); + + // Bind KV values into replace_stmt + status = sqlite3_bind_blob(replace_stmt, 1, key, 16, SQLITE_STATIC); + ErrorCheck(status); + status = sqlite3_bind_blob(replace_stmt, 2, value, + value_size, SQLITE_STATIC); + ErrorCheck(status); + + // Execute replace_stmt + bytes_ += value_size + strlen(key); + status = sqlite3_step(replace_stmt); + StepErrorCheck(status); + + // Reset SQLite statement for another use + status = sqlite3_clear_bindings(replace_stmt); + ErrorCheck(status); + status = sqlite3_reset(replace_stmt); + ErrorCheck(status); + + FinishedSingleOp(); + } + + // End write transaction + if (FLAGS_transaction && transaction) { + status = sqlite3_step(end_trans_stmt); + StepErrorCheck(status); + status = sqlite3_reset(end_trans_stmt); + ErrorCheck(status); + } + } + + status = sqlite3_finalize(replace_stmt); + ErrorCheck(status); + status = sqlite3_finalize(begin_trans_stmt); + ErrorCheck(status); + status = sqlite3_finalize(end_trans_stmt); + ErrorCheck(status); + } + + void Read(Order order, int entries_per_batch) { + int status; + sqlite3_stmt *read_stmt, *begin_trans_stmt, *end_trans_stmt; + + std::string read_str = "SELECT * FROM test WHERE key = ?"; + std::string begin_trans_str = "BEGIN TRANSACTION;"; + std::string end_trans_str = "END TRANSACTION;"; + + // Preparing sqlite3 statements + status = sqlite3_prepare_v2(db_, begin_trans_str.c_str(), -1, + &begin_trans_stmt, NULL); + ErrorCheck(status); + status = sqlite3_prepare_v2(db_, end_trans_str.c_str(), -1, + &end_trans_stmt, NULL); + ErrorCheck(status); + status = sqlite3_prepare_v2(db_, read_str.c_str(), -1, &read_stmt, NULL); + ErrorCheck(status); + + bool transaction = (entries_per_batch > 1); + for (int i = 0; i < reads_; i += entries_per_batch) { + // Begin read transaction + if (FLAGS_transaction && transaction) { + status = sqlite3_step(begin_trans_stmt); + StepErrorCheck(status); + status = sqlite3_reset(begin_trans_stmt); + ErrorCheck(status); + } + + // Create and execute SQL statements + for (int j = 0; j < entries_per_batch; j++) { + // Create key value + char key[100]; + int k = (order == SEQUENTIAL) ? 
i + j : (rand_.Next() % reads_); + snprintf(key, sizeof(key), "%016d", k); + + // Bind key value into read_stmt + status = sqlite3_bind_blob(read_stmt, 1, key, 16, SQLITE_STATIC); + ErrorCheck(status); + + // Execute read statement + while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW); + StepErrorCheck(status); + + // Reset SQLite statement for another use + status = sqlite3_clear_bindings(read_stmt); + ErrorCheck(status); + status = sqlite3_reset(read_stmt); + ErrorCheck(status); + FinishedSingleOp(); + } + + // End read transaction + if (FLAGS_transaction && transaction) { + status = sqlite3_step(end_trans_stmt); + StepErrorCheck(status); + status = sqlite3_reset(end_trans_stmt); + ErrorCheck(status); + } + } + + status = sqlite3_finalize(read_stmt); + ErrorCheck(status); + status = sqlite3_finalize(begin_trans_stmt); + ErrorCheck(status); + status = sqlite3_finalize(end_trans_stmt); + ErrorCheck(status); + } + +}; + +} + +int main(int argc, char** argv) { + for (int i = 1; i < argc; i++) { + double d; + int n; + char junk; + if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { + FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); + } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_histogram = n; + } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { + FLAGS_compression_ratio = d; + } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_use_existing_db = n; + } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { + FLAGS_num = n; + } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) { + FLAGS_reads = n; + } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { + FLAGS_value_size = n; + } else if (leveldb::Slice(argv[i]) == leveldb::Slice("--no_transaction")) { + FLAGS_transaction = false; + } else if (sscanf(argv[i], "--page_size=%d%c", &n, &junk) == 1) { + FLAGS_page_size = n; + } else if (sscanf(argv[i], "--num_pages=%d%c", &n, &junk) == 1) { + FLAGS_num_pages = n; + } else if (sscanf(argv[i], "--WAL_enabled=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_WAL_enabled = n; + } else { + fprintf(stderr, "Invalid flag '%s'\n", argv[i]); + exit(1); + } + } + + leveldb::Benchmark benchmark; + benchmark.Run(); + return 0; +} diff --git a/doc/bench/db_bench_tree_db.cc b/doc/bench/db_bench_tree_db.cc new file mode 100644 index 0000000..d42e306 --- /dev/null +++ b/doc/bench/db_bench_tree_db.cc @@ -0,0 +1,506 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include +#include "util/histogram.h" +#include "util/random.h" +#include "util/testutil.h" + +// Comma-separated list of operations to run in the specified order +// Actual benchmarks: +// +// fillseq -- write N values in sequential key order in async mode +// fillrandom -- write N values in random key order in async mode +// overwrite -- overwrite N values in random key order in async mode +// fillseqsync -- write N/100 values in sequential key order in sync mode +// fillrandsync -- write N/100 values in random key order in sync mode +// fillrand100K -- write N/1000 100K values in random order in async mode +// fillseq100K -- write N/1000 100K values in seq order in async mode +// readseq -- read N times sequentially +// readseq100K -- read N/1000 100K values in sequential order in async mode +// readrand100K -- read N/1000 100K values in sequential order in async mode +// readrandom -- read N times in random order +static const char* FLAGS_benchmarks = + "fillseq," + "fillseqsync," + "fillrandsync," + "fillrandom," + "overwrite," + "readrandom," + "readseq," + "fillrand100K," + "fillseq100K," + "readseq100K," + "readrand100K," + ; + +// Number of key/values to place in database +static int FLAGS_num = 1000000; + +// Number of read operations to do. If negative, do FLAGS_num reads. +static int FLAGS_reads = -1; + +// Size of each value +static int FLAGS_value_size = 100; + +// Arrange to generate values that shrink to this fraction of +// their original size after compression +static double FLAGS_compression_ratio = 0.5; + +// Print histogram of operation timings +static bool FLAGS_histogram = false; + +// Cache size. Default 4 MB +static int FLAGS_cache_size = 4194304; + +// Page size. Default 1 KB +static int FLAGS_page_size = 1024; + +// If true, do not destroy the existing database. If you set this +// flag and also specify a benchmark that wants a fresh database, that +// benchmark will fail. +static bool FLAGS_use_existing_db = false; + +// Compression flag. If true, compression is on. If false, compression +// is off. +static bool FLAGS_compression = true; + +inline +static void DBSynchronize(kyotocabinet::TreeDB* db_) +{ + // Synchronize will flush writes to disk + if (!db_->synchronize()) { + fprintf(stderr, "synchronize error: %s\n", db_->error().name()); + } +} + +namespace leveldb { + +// Helper for quickly generating random data. +namespace { +class RandomGenerator { + private: + std::string data_; + int pos_; + + public: + RandomGenerator() { + // We use a limited amount of data over and over again and ensure + // that it is larger than the compression window (32KB), and also + // large enough to serve all typical value sizes we want to write. + Random rnd(301); + std::string piece; + while (data_.size() < 1048576) { + // Add a short fragment that is as compressible as specified + // by FLAGS_compression_ratio. 
+ test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); + data_.append(piece); + } + pos_ = 0; + } + + Slice Generate(int len) { + if (pos_ + len > data_.size()) { + pos_ = 0; + assert(len < data_.size()); + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } +}; + +static Slice TrimSpace(Slice s) { + int start = 0; + while (start < s.size() && isspace(s[start])) { + start++; + } + int limit = s.size(); + while (limit > start && isspace(s[limit-1])) { + limit--; + } + return Slice(s.data() + start, limit - start); +} + +} + +class Benchmark { + private: + kyotocabinet::TreeDB* db_; + int db_num_; + int num_; + int reads_; + double start_; + double last_op_finish_; + int64_t bytes_; + std::string message_; + Histogram hist_; + RandomGenerator gen_; + Random rand_; + kyotocabinet::LZOCompressor comp_; + + // State kept for progress messages + int done_; + int next_report_; // When to report next + + void PrintHeader() { + const int kKeySize = 16; + PrintEnvironment(); + fprintf(stdout, "Keys: %d bytes each\n", kKeySize); + fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", + FLAGS_value_size, + static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); + fprintf(stdout, "Entries: %d\n", num_); + fprintf(stdout, "RawSize: %.1f MB (estimated)\n", + ((static_cast(kKeySize + FLAGS_value_size) * num_) + / 1048576.0)); + fprintf(stdout, "FileSize: %.1f MB (estimated)\n", + (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) + / 1048576.0)); + PrintWarnings(); + fprintf(stdout, "------------------------------------------------\n"); + } + + void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" + ); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + } + + void PrintEnvironment() { + fprintf(stderr, "Kyoto Cabinet: version %s, lib ver %d, lib rev %d\n", + kyotocabinet::VERSION, kyotocabinet::LIBVER, kyotocabinet::LIBREV); + +#if defined(__linux) + time_t now = time(NULL); + fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline + + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != NULL) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != NULL) { + const char* sep = strchr(line, ':'); + if (sep == NULL) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#endif + } + + void Start() { + start_ = Env::Default()->NowMicros() * 1e-6; + bytes_ = 0; + message_.clear(); + last_op_finish_ = start_; + hist_.Clear(); + done_ = 0; + next_report_ = 100; + } + + void FinishedSingleOp() { + if (FLAGS_histogram) { + double now = Env::Default()->NowMicros() * 1e-6; + double micros = (now - last_op_finish_) * 1e6; + hist_.Add(micros); + if (micros > 20000) { + fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); + fflush(stderr); + } + last_op_finish_ = now; + } + + done_++; + if (done_ >= next_report_) { + if (next_report_ < 1000) next_report_ += 100; + else if (next_report_ < 5000) next_report_ += 500; + else 
if (next_report_ < 10000) next_report_ += 1000; + else if (next_report_ < 50000) next_report_ += 5000; + else if (next_report_ < 100000) next_report_ += 10000; + else if (next_report_ < 500000) next_report_ += 50000; + else next_report_ += 100000; + fprintf(stderr, "... finished %d ops%30s\r", done_, ""); + fflush(stderr); + } + } + + void Stop(const Slice& name) { + double finish = Env::Default()->NowMicros() * 1e-6; + + // Pretend at least one op was done in case we are running a benchmark + // that does not call FinishedSingleOp(). + if (done_ < 1) done_ = 1; + + if (bytes_ > 0) { + char rate[100]; + snprintf(rate, sizeof(rate), "%6.1f MB/s", + (bytes_ / 1048576.0) / (finish - start_)); + if (!message_.empty()) { + message_ = std::string(rate) + " " + message_; + } else { + message_ = rate; + } + } + + fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", + name.ToString().c_str(), + (finish - start_) * 1e6 / done_, + (message_.empty() ? "" : " "), + message_.c_str()); + if (FLAGS_histogram) { + fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); + } + fflush(stdout); + } + + public: + enum Order { + SEQUENTIAL, + RANDOM + }; + enum DBState { + FRESH, + EXISTING + }; + + Benchmark() + : db_(NULL), + num_(FLAGS_num), + reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads), + bytes_(0), + rand_(301) { + std::vector files; + Env::Default()->GetChildren("/tmp", &files); + if (!FLAGS_use_existing_db) { + for (int i = 0; i < files.size(); i++) { + if (Slice(files[i]).starts_with("dbbench_polyDB")) { + Env::Default()->DeleteFile("/tmp/" + files[i]); + } + } + } + } + + ~Benchmark() { + if (!db_->close()) { + fprintf(stderr, "close error: %s\n", db_->error().name()); + } + } + + void Run() { + PrintHeader(); + Open(false); + + const char* benchmarks = FLAGS_benchmarks; + while (benchmarks != NULL) { + const char* sep = strchr(benchmarks, ','); + Slice name; + if (sep == NULL) { + name = benchmarks; + benchmarks = NULL; + } else { + name = Slice(benchmarks, sep - benchmarks); + benchmarks = sep + 1; + } + + Start(); + + bool known = true; + bool write_sync = false; + if (name == Slice("fillseq")) { + Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1); + + } else if (name == Slice("fillrandom")) { + Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1); + DBSynchronize(db_); + } else if (name == Slice("overwrite")) { + Write(write_sync, RANDOM, EXISTING, num_, FLAGS_value_size, 1); + DBSynchronize(db_); + } else if (name == Slice("fillrandsync")) { + write_sync = true; + Write(write_sync, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1); + DBSynchronize(db_); + } else if (name == Slice("fillseqsync")) { + write_sync = true; + Write(write_sync, SEQUENTIAL, FRESH, num_ / 100, FLAGS_value_size, 1); + DBSynchronize(db_); + } else if (name == Slice("fillrand100K")) { + Write(write_sync, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1); + DBSynchronize(db_); + } else if (name == Slice("fillseq100K")) { + Write(write_sync, SEQUENTIAL, FRESH, num_ / 1000, 100 * 1000, 1); + DBSynchronize(db_); + } else if (name == Slice("readseq")) { + ReadSequential(); + } else if (name == Slice("readrandom")) { + ReadRandom(); + } else if (name == Slice("readrand100K")) { + int n = reads_; + reads_ /= 1000; + ReadRandom(); + reads_ = n; + } else if (name == Slice("readseq100K")) { + int n = reads_; + reads_ /= 1000; + ReadSequential(); + reads_ = n; + } else { + known = false; + if (name != Slice()) { // No error message for empty name + fprintf(stderr, "unknown benchmark '%s'\n", 
name.ToString().c_str()); + } + } + if (known) { + Stop(name); + } + } + } + + private: + void Open(bool sync) { + assert(db_ == NULL); + + // Initialize db_ + db_ = new kyotocabinet::TreeDB(); + char file_name[100]; + db_num_++; + snprintf(file_name, sizeof(file_name), "/tmp/dbbench_polyDB-%d.kct", + db_num_); + + // Create tuning options and open the database + int open_options = kyotocabinet::PolyDB::OWRITER | + kyotocabinet::PolyDB::OCREATE; + int tune_options = kyotocabinet::TreeDB::TSMALL | + kyotocabinet::TreeDB::TLINEAR; + if (FLAGS_compression) { + tune_options |= kyotocabinet::TreeDB::TCOMPRESS; + db_->tune_compressor(&comp_); + } + db_->tune_options(tune_options); + db_->tune_page_cache(FLAGS_cache_size); + db_->tune_page(FLAGS_page_size); + db_->tune_map(256LL<<20); + if (sync) { + open_options |= kyotocabinet::PolyDB::OAUTOSYNC; + } + if (!db_->open(file_name, open_options)) { + fprintf(stderr, "open error: %s\n", db_->error().name()); + } + } + + void Write(bool sync, Order order, DBState state, + int num_entries, int value_size, int entries_per_batch) { + // Create new database if state == FRESH + if (state == FRESH) { + if (FLAGS_use_existing_db) { + message_ = "skipping (--use_existing_db is true)"; + return; + } + delete db_; + db_ = NULL; + Open(sync); + Start(); // Do not count time taken to destroy/open + } + + if (num_entries != num_) { + char msg[100]; + snprintf(msg, sizeof(msg), "(%d ops)", num_entries); + message_ = msg; + } + + // Write to database + for (int i = 0; i < num_entries; i++) + { + const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % num_entries); + char key[100]; + snprintf(key, sizeof(key), "%016d", k); + bytes_ += value_size + strlen(key); + std::string cpp_key = key; + if (!db_->set(cpp_key, gen_.Generate(value_size).ToString())) { + fprintf(stderr, "set error: %s\n", db_->error().name()); + } + FinishedSingleOp(); + } + } + + void ReadSequential() { + kyotocabinet::DB::Cursor* cur = db_->cursor(); + cur->jump(); + std::string ckey, cvalue; + while (cur->get(&ckey, &cvalue, true)) { + bytes_ += ckey.size() + cvalue.size(); + FinishedSingleOp(); + } + delete cur; + } + + void ReadRandom() { + std::string value; + for (int i = 0; i < reads_; i++) { + char key[100]; + const int k = rand_.Next() % reads_; + snprintf(key, sizeof(key), "%016d", k); + db_->get(key, &value); + FinishedSingleOp(); + } + } +}; + +} + +int main(int argc, char** argv) { + for (int i = 1; i < argc; i++) { + double d; + int n; + char junk; + if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { + FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); + } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { + FLAGS_compression_ratio = d; + } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_histogram = n; + } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { + FLAGS_num = n; + } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) { + FLAGS_reads = n; + } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { + FLAGS_value_size = n; + } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { + FLAGS_cache_size = n; + } else if (sscanf(argv[i], "--page_size=%d%c", &n, &junk) == 1) { + FLAGS_page_size = n; + } else if (sscanf(argv[i], "--compression=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_compression = (n == 1) ? 
true : false; + } else { + fprintf(stderr, "Invalid flag '%s'\n", argv[i]); + exit(1); + } + } + + leveldb::Benchmark benchmark; + benchmark.Run(); + return 0; +} diff --git a/doc/benchmark.html b/doc/benchmark.html new file mode 100644 index 0000000..f07c04b --- /dev/null +++ b/doc/benchmark.html @@ -0,0 +1,466 @@ + + +LevelDB Benchmarks + + + + +

LevelDB Benchmarks

+

Google, July 2011

+
+ +

In order to test LevelDB's performance, we benchmark it against other well-established database implementations. We compare LevelDB (revision 39) against SQLite3 (version 3.7.6.3) and Kyoto Cabinet's (version 1.2.67) TreeDB (a B+Tree based key-value store). We would like to acknowledge Scott Hess and Mikio Hirabayashi for their suggestions and contributions to the SQLite3 and Kyoto Cabinet benchmarks, respectively.

+ +

Benchmarks were all performed on a six-core Intel(R) Xeon(R) CPU X5650 @ 2.67GHz, with 12288 KB of total L3 cache and 12 GB of DDR3 RAM at 1333 MHz. (Note that LevelDB uses at most two CPUs since the benchmarks are single threaded: one to run the benchmark, and one for background compactions.) We ran the benchmarks on two machines (with identical processors), one with an Ext3 file system and one with an Ext4 file system. The machine with the Ext3 file system has a SATA Hitachi HDS721050CLA362 hard drive. The machine with the Ext4 file system has a SATA Samsung HD502HJ hard drive. Both hard drives spin at 7200 RPM. The numbers reported below are the median of three measurements.

+ +

Benchmark Source Code

+

We wrote benchmark tools for SQLite and Kyoto TreeDB based on LevelDB's db_bench. The code for each of the benchmarks resides here:

  • SQLite3: doc/bench/db_bench_sqlite3.cc
  • Kyoto TreeDB: doc/bench/db_bench_tree_db.cc
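(Both drivers build as stand-alone binaries against this tree; the Makefile shown in a later commit in this log has a db_bench_tree_db target, and a matching db_bench_sqlite3 target is assumed to exist alongside it, so something like `make db_bench_sqlite3 db_bench_tree_db` should produce them.)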

Custom Build Specifications

+
    +
  • LevelDB: LevelDB was compiled with the tcmalloc library and the Snappy compression library. Assertions were disabled.
  • +
  • TreeDB: TreeDB was compiled using the LZO compression library. Furthermore, we enabled the TSMALL and TLINEAR options when opening the database in order to reduce the footprint of each record.
  • +
  • SQLite: We tuned SQLite's performance by setting its locking mode to exclusive. We left SQLite's write-ahead logging disabled since that is the default configuration. (Enabling write-ahead logging improves SQLite's write performance by roughly 30%, but the character of the comparisons below does not change significantly. A sketch of these pragmas follows this list.)
  • +
+ +
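For concreteness, here is a minimal sketch of how those pragmas are issued. The helper names are illustrative; sqlite3_exec() and the pragma strings themselves are the ones used by the bundled db_bench_sqlite3.cc (its diff appears later in this log):

    #include <stdio.h>
    #include <sqlite3.h>

    // Apply one tuning pragma to an open SQLite3 handle.
    // Sketch only: a real caller would propagate the error instead.
    static void ApplyPragma(sqlite3* db, const char* pragma) {
      char* err_msg = NULL;
      if (sqlite3_exec(db, pragma, NULL, NULL, &err_msg) != SQLITE_OK) {
        fprintf(stderr, "pragma '%s' failed: %s\n", pragma, err_msg);
        sqlite3_free(err_msg);
      }
    }

    static void TuneSqlite3(sqlite3* db) {
      ApplyPragma(db, "PRAGMA locking_mode = EXCLUSIVE");
      // Write-ahead logging stays at its default (off) in these runs:
      //   ApplyPragma(db, "PRAGMA journal_mode = WAL");
    }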

1. Baseline Performance

+

This section gives the baseline performance of all of the +databases. Following sections show how performance changes as various +parameters are varied. For the baseline (the LevelDB side of this setup is sketched in code after the list):

+
    +
  • Each database is allowed 4 MB of cache memory.
  • +
  • Databases are opened in asynchronous write mode. + (LevelDB's sync option, TreeDB's OAUTOSYNC option, and + SQLite3's synchronous options are all turned off). I.e., + every write is pushed to the operating system, but the + benchmark does not wait for the write to reach the disk.
  • +
  • Keys are 16 bytes each.
  • +
  • Values are 100 bytes each (with enough redundancy so that + a simple compressor shrinks them to 50% of their original + size).
  • +
  • Sequential reads/writes traverse the key space in increasing order.
  • +
  • Random reads/writes traverse the key space in random order.
  • +
+ +
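As promised above, a sketch of the LevelDB side of this baseline (the function name and key/value arguments are illustrative; Options::block_cache, NewLRUCache(), and WriteOptions::sync are the actual knobs from this tree's public headers):

    #include <string>

    #include "leveldb/cache.h"
    #include "leveldb/db.h"

    // Open a database with the baseline 4 MB block cache and issue one
    // asynchronous write: the OS receives the data, but we do not wait
    // for it to reach the disk.
    leveldb::Status BaselineWrite(const std::string& dbname,
                                  const leveldb::Slice& key,
                                  const leveldb::Slice& value) {
      leveldb::Options options;
      options.create_if_missing = true;
      options.block_cache = leveldb::NewLRUCache(4 << 20);  // 4 MB cache

      leveldb::DB* db = NULL;
      leveldb::Status s = leveldb::DB::Open(options, dbname, &db);
      if (s.ok()) {
        leveldb::WriteOptions write_options;
        write_options.sync = false;  // asynchronous, as in the baseline
        s = db->Put(write_options, key, value);
        delete db;
      }
      delete options.block_cache;  // the caller owns the cache object
      return s;
    }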

A. Sequential Reads

LevelDB: 4,030,000 ops/sec
Kyoto TreeDB: 1,010,000 ops/sec
SQLite3: 186,000 ops/sec
+

B. Random Reads

LevelDB: 129,000 ops/sec
Kyoto TreeDB: 151,000 ops/sec
SQLite3: 146,000 ops/sec
+

C. Sequential Writes

LevelDB: 779,000 ops/sec
Kyoto TreeDB: 342,000 ops/sec
SQLite3: 26,900 ops/sec
+

D. Random Writes

LevelDB: 164,000 ops/sec
Kyoto TreeDB: 88,500 ops/sec
SQLite3: 420 ops/sec
+ +

LevelDB outperforms both SQLite3 and TreeDB in sequential and random write operations and sequential read operations. Kyoto Cabinet has the fastest random read operations.

+ +

2. Write Performance under Different Configurations

+

A. Large Values

+

For this benchmark, we start with an empty database, and write 100,000 byte values (~50% compressible). To keep the benchmark running time reasonable, we stop after writing 1000 values.

+
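In the bundled drivers this workload appears as the fillseq100K and fillrand100K benchmarks (see the dispatch code in db_bench_tree_db above, which writes num_ / 1000 values of 100 * 1000 bytes each); an equivalent ad-hoc run would pass flags along the lines of --num=1000 --value_size=100000, flag names as parsed in that file's main().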

Sequential Writes

LevelDB: 1,060 ops/sec (1.17x baseline)
Kyoto TreeDB: 1,020 ops/sec (2.57x baseline)
SQLite3: 2,910 ops/sec (93.3x baseline)
+

Random Writes

LevelDB: 480 ops/sec (2.52x baseline)
Kyoto TreeDB: 1,100 ops/sec (10.72x baseline)
SQLite3: 2,200 ops/sec (4,516x baseline)
+

LevelDB doesn't perform as well with large values of 100,000 bytes each. This is because LevelDB writes keys and values at least twice: first to the transaction log, and a second time (during a compaction) to a sorted file. +With larger values, LevelDB's per-operation efficiency is swamped by the +cost of extra copies of large values.

+
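To put a number on "at least twice": with 100,000-byte values, each operation moves at least 2 x 100,000 = 200,000 bytes of payload (the log write plus the first sorted file), before counting any further compaction passes. For the 100-byte baseline entries the same factor costs only ~200 bytes, so there the fixed per-operation overheads dominate instead.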

B. Batch Writes

+

A batch write is a set of writes that are applied atomically to the underlying database. A single batch of N writes may be significantly faster than N individual writes. The following benchmark writes one thousand batches where each batch contains one thousand 100-byte values. TreeDB does not support batch writes and is omitted from this benchmark.

+
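Concretely, on the LevelDB side one batch looks like the following sketch (WriteBatch and DB::Write() are from this tree's public headers; the helper and its arguments are illustrative):

    #include <string>
    #include <utility>
    #include <vector>

    #include "leveldb/db.h"
    #include "leveldb/write_batch.h"

    // Apply a set of puts atomically as a single batch instead of N
    // separate Put() calls. Sketch only: `db` must be an open DB*.
    leveldb::Status WriteOneBatch(
        leveldb::DB* db,
        const std::vector<std::pair<std::string, std::string> >& kvs) {
      leveldb::WriteBatch batch;
      for (size_t i = 0; i < kvs.size(); i++) {
        batch.Put(kvs[i].first, kvs[i].second);  // buffered in memory only
      }
      // A single Write() commits every buffered update atomically.
      return db->Write(leveldb::WriteOptions(), &batch);
    }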

Sequential Writes

LevelDB: 840,000 entries/sec (1.08x baseline)
SQLite3: 100,000 entries/sec (3.72x baseline)
+

Random Writes

LevelDB: 221,000 entries/sec (1.35x baseline)
SQLite3: 1,000 entries/sec (2.38x baseline)
+ +

Because of the way LevelDB persistent storage is organized, batches of +random writes are not much slower (only a factor of 4x) than batches +of sequential writes. However SQLite3 sees a significant slowdown +(factor of 100x) when switching from sequential to random batch +writes. This is because each random batch write in SQLite3 has to +update approximately as many pages as there are keys in the batch.

+ +
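The page arithmetic makes this concrete. Each batch carries 1000 entries of roughly 116 bytes (16-byte key + 100-byte value), about 113 KB of payload. With SQLite3's 1 KB pages, a sequential batch lands in on the order of a hundred adjacent B-tree leaf pages, while a batch of keys drawn at random from the whole key space can dirty close to 1000 widely scattered pages, each of which must be written back.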

C. Synchronous Writes

+

In the following benchmark, we enable the synchronous writing modes +of all of the databases. Since this change significantly slows down the +benchmark, we stop after 10,000 writes. (The corresponding settings are sketched in code after this list.)

+
    +
  • For LevelDB, we set WriteOptions.sync = true.
  • +
  • In TreeDB, we enabled TreeDB's OAUTOSYNC option.
  • +
  • For SQLite3, we set "PRAGMA synchronous = FULL".
  • +
+
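The three settings side by side, as a sketch (the handles and the Kyoto file path are placeholders and error checking is omitted; the option, flag, and pragma names are the ones quoted above and used in the bundled benchmark sources):

    #include <kcpolydb.h>
    #include <sqlite3.h>

    #include "leveldb/db.h"

    void EnableSynchronousWrites(leveldb::DB* ldb,
                                 kyotocabinet::PolyDB* kdb,
                                 sqlite3* sdb,
                                 const leveldb::Slice& key,
                                 const leveldb::Slice& value) {
      // LevelDB: request an fsync before each write is acknowledged.
      leveldb::WriteOptions write_options;
      write_options.sync = true;
      ldb->Put(write_options, key, value);

      // Kyoto Cabinet: OAUTOSYNC makes every update synchronous.
      kdb->open("/tmp/dbbench_polyDB.kct",
                kyotocabinet::PolyDB::OWRITER |
                    kyotocabinet::PolyDB::OCREATE |
                    kyotocabinet::PolyDB::OAUTOSYNC);

      // SQLite3: wait for each commit to reach stable storage.
      sqlite3_exec(sdb, "PRAGMA synchronous = FULL", NULL, NULL, NULL);
    }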

Sequential Writes

LevelDB: 2,400 ops/sec (0.003x baseline)
Kyoto TreeDB: 140 ops/sec (0.0004x baseline)
SQLite3: 430 ops/sec (0.016x baseline)
+

Random Writes

LevelDB: 2,400 ops/sec (0.015x baseline)
Kyoto TreeDB: 100 ops/sec (0.001x baseline)
SQLite3: 110 ops/sec (0.26x baseline)
+ +

Also see the ext4 performance numbers below +since synchronous writes behave significantly differently +on ext3 and ext4.

+ +

D. Turning Compression Off

+ +

In the baseline measurements, LevelDB and TreeDB were using +light-weight compression +(Snappy for LevelDB, +and LZO for +TreeDB). SQLite3, by default, does not use compression. The +experiments below show what happens when compression is disabled in +all of the databases (the SQLite3 numbers are just a copy of +its baseline measurements):

+ +

Sequential Writes

LevelDB: 594,000 ops/sec (0.76x baseline)
Kyoto TreeDB: 485,000 ops/sec (1.42x baseline)
SQLite3: 26,900 ops/sec (1.00x baseline)
+

Random Writes

LevelDB: 135,000 ops/sec (0.82x baseline)
Kyoto TreeDB: 159,000 ops/sec (1.80x baseline)
SQLite3: 420 ops/sec (1.00x baseline)
+ +

LevelDB's write performance is better with compression than without +since compression decreases the amount of data that has to be written +to disk. Therefore LevelDB users can leave compression enabled in +most scenarios without having to worry about a tradeoff between space +usage and performance. TreeDB's performance, on the other hand, is +better without compression than with compression. Presumably this is +because TreeDB's compression library (LZO) is more expensive than +LevelDB's compression library (Snappy).

+ +
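For reference, the compression switch on each side, as a sketch (the LevelDB enum values are from include/leveldb/options.h; the TreeDB calls mirror the Open() code in db_bench_tree_db above; the function names are illustrative):

    #include <kctreedb.h>

    #include "leveldb/options.h"

    // LevelDB: compression is chosen per database via Options.
    void SetLevelDBCompression(leveldb::Options* options, bool enabled) {
      options->compression = enabled ? leveldb::kSnappyCompression
                                     : leveldb::kNoCompression;
    }

    // TreeDB: a tuning flag plus a compressor object, as in the
    // benchmark's Open(); `compressor` (e.g. an LZO compressor, as used
    // there) must outlive the database.
    void SetTreeDBCompression(kyotocabinet::TreeDB* db,
                              kyotocabinet::Compressor* compressor) {
      db->tune_options(kyotocabinet::TreeDB::TCOMPRESS);
      db->tune_compressor(compressor);
    }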

E. Using More Memory

+

We increased the overall cache size for each database to 128 MB. For LevelDB, we partitioned 128 MB into a 120 MB write buffer and 8 MB of cache (up from 2 MB of write buffer and 2 MB of cache). For SQLite3, we kept the page size at 1024 bytes, but increased the number of pages to 131,072 (up from 4096). For TreeDB, we also kept the page size at 1024 bytes, but increased the cache size to 128 MB (up from 4 MB).

+
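The LevelDB half of this configuration, as a sketch (the field and function names are from this tree's public headers; the helper name is illustrative):

    #include "leveldb/cache.h"
    #include "leveldb/options.h"

    // 128 MB total: a 120 MB write buffer plus an 8 MB block cache.
    // Sketch only: the caller must eventually delete block_cache.
    leveldb::Options LargeMemoryOptions() {
      leveldb::Options options;
      options.write_buffer_size = 120 << 20;                // 120 MB
      options.block_cache = leveldb::NewLRUCache(8 << 20);  //   8 MB
      return options;
    }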

Sequential Writes

LevelDB: 812,000 ops/sec (1.04x baseline)
Kyoto TreeDB: 321,000 ops/sec (0.94x baseline)
SQLite3: 26,200 ops/sec (0.97x baseline)
+

Random Writes

LevelDB: 355,000 ops/sec (2.16x baseline)
Kyoto TreeDB: 284,000 ops/sec (3.21x baseline)
SQLite3: 450 ops/sec (1.07x baseline)
+ +

SQLite's performance does not change substantially when compared to +the baseline, but the random write performance for both LevelDB and +TreeDB increases significantly. LevelDB's performance improves +because a larger write buffer reduces the need to merge sorted files +(since it creates a smaller number of larger sorted files). TreeDB's +performance goes up because the entire database is available in memory +for fast in-place updates.

+ +

3. Read Performance under Different Configurations

+

A. Larger Caches

+

We increased the overall memory usage to 128 MB for each database. +For LevelDB, we allocated 8 MB to LevelDB's write buffer and 120 MB +to LevelDB's cache. The other databases don't differentiate between a +write buffer and a cache, so we simply set their cache size to 128 +MB.

+
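In terms of the knobs sketched in section 2.E, the LevelDB split here flips to write_buffer_size = 8 << 20 plus a 120 MB NewLRUCache(). For TreeDB the equivalent is tune_page_cache(128 << 20) (the method used in the benchmark source above); for SQLite3 this presumably corresponds to PRAGMA cache_size = 131072, i.e. 131,072 one-kilobyte pages = 128 MB (the exact pragma used is an assumption here).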

Sequential Reads

LevelDB: 5,210,000 ops/sec (1.29x baseline)
Kyoto TreeDB: 1,070,000 ops/sec (1.06x baseline)
SQLite3: 221,000 ops/sec (1.19x baseline)
+ +

Random Reads

LevelDB: 190,000 ops/sec (1.47x baseline)
Kyoto TreeDB: 463,000 ops/sec (3.07x baseline)
SQLite3: 197,000 ops/sec (1.35x baseline)
+ +

As expected, the read performance of all of the databases increases +when the caches are enlarged. In particular, TreeDB seems to make +very effective use of a cache that is large enough to hold the entire +database.

+ +

B. No Compression Reads

+

For this benchmark, we populated a database with 1 million entries consisting of 16-byte keys and 100-byte values. We compiled LevelDB and Kyoto Cabinet without compression support, so the values read back from the database are already uncompressed. We've listed the SQLite3 baseline read performance as a point of comparison.

+
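For LevelDB this is a build-time switch rather than a runtime option: when the tree is built without Snappy, the port layer's Snappy_Compress() simply returns false (see the USE_SNAPPY stubs in port_chromium.cc further down this log) and blocks are stored uncompressed.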

Sequential Reads

LevelDB: 4,880,000 ops/sec (1.21x baseline)
Kyoto TreeDB: 1,230,000 ops/sec (3.60x baseline)
SQLite3: 186,000 ops/sec (1.00x baseline)
+

Random Reads

LevelDB: 149,000 ops/sec (1.16x baseline)
Kyoto TreeDB: 175,000 ops/sec (1.16x baseline)
SQLite3: 146,000 ops/sec (1.00x baseline)
+ +

Performance of both LevelDB and TreeDB improves a small amount when +compression is disabled. Note however that under different workloads, +performance may very well be better with compression if it allows more +of the working set to fit in memory.

+ +

Note about Ext4 Filesystems

+

The preceding numbers are for an ext3 file system. Synchronous writes are much slower under ext4 (LevelDB drops to ~34 writes / second, TreeDB drops to ~5 writes / second; SQLite3 drops to ~24 writes / second) due to ext4's different handling of fsync / msync calls. Even LevelDB's asynchronous write performance drops somewhat since it spreads its storage across multiple files and issues fsync calls when switching to a new file.

+ +

Acknowledgements

+

Jeff Dean and Sanjay Ghemawat wrote LevelDB. Kevin Tseng wrote and compiled these benchmarks. Mikio Hirabayashi, Scott Hess, and Gabor Cselle provided help and advice.

+ + diff --git a/leveldb.gyp b/leveldb.gyp deleted file mode 100644 index ea634a2..0000000 --- a/leveldb.gyp +++ /dev/null @@ -1,325 +0,0 @@ -# Copyright (c) 2011 The LevelDB Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. See the AUTHORS file for names of contributors. - -{ - 'variables': { - 'use_snappy%': 0, - }, - 'target_defaults': { - 'defines': [ - 'LEVELDB_PLATFORM_CHROMIUM=1', - ], - 'include_dirs': [ - '.', - 'include/', - ], - 'conditions': [ - ['OS == "win"', { - 'include_dirs': [ - 'port/win', - ], - }], - ['use_snappy', { - 'defines': [ - 'USE_SNAPPY=1', - ], - }], - ], - }, - 'targets': [ - { - 'target_name': 'leveldb', - 'type': '<(library)', - 'dependencies': [ - # The base libary is a lightweight abstraction layer for things like - # threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/ - '../../base/base.gyp:base', - # base::LazyInstance is a template that pulls in dynamic_annotations so - # we need to explictly link in the code for dynamic_annotations. - '../../base/third_party/dynamic_annotations/dynamic_annotations.gyp:dynamic_annotations', - ], - 'conditions': [ - ['use_snappy', { - 'dependencies': [ - '../../third_party/snappy/snappy.gyp:snappy', - ], - }], - ], - 'direct_dependent_settings': { - 'include_dirs': [ - 'include/', - ], - 'conditions': [ - ['OS == "win"', { - 'include_dirs': [ - 'port/win', - ], - }], - ], - }, - 'sources': [ - # Include and then exclude so that all files show up in IDEs, even if - # they don't build. - 'db/builder.cc', - 'db/builder.h', - 'db/db_impl.cc', - 'db/db_impl.h', - 'db/db_iter.cc', - 'db/db_iter.h', - 'db/filename.cc', - 'db/filename.h', - 'db/dbformat.cc', - 'db/dbformat.h', - 'db/log_format.h', - 'db/log_reader.cc', - 'db/log_reader.h', - 'db/log_writer.cc', - 'db/log_writer.h', - 'db/memtable.cc', - 'db/memtable.h', - 'db/repair.cc', - 'db/skiplist.h', - 'db/snapshot.h', - 'db/table_cache.cc', - 'db/table_cache.h', - 'db/version_edit.cc', - 'db/version_edit.h', - 'db/version_set.cc', - 'db/version_set.h', - 'db/write_batch.cc', - 'db/write_batch_internal.h', - 'include/leveldb/cache.h', - 'include/leveldb/comparator.h', - 'include/leveldb/db.h', - 'include/leveldb/env.h', - 'include/leveldb/iterator.h', - 'include/leveldb/options.h', - 'include/leveldb/slice.h', - 'include/leveldb/status.h', - 'include/leveldb/table.h', - 'include/leveldb/table_builder.h', - 'include/leveldb/write_batch.h', - 'port/port.h', - 'port/port_chromium.cc', - 'port/port_chromium.h', - 'port/port_example.h', - 'port/port_posix.cc', - 'port/port_posix.h', - 'table/block.cc', - 'table/block.h', - 'table/block_builder.cc', - 'table/block_builder.h', - 'table/format.cc', - 'table/format.h', - 'table/iterator.cc', - 'table/iterator_wrapper.h', - 'table/merger.cc', - 'table/merger.h', - 'table/table.cc', - 'table/table_builder.cc', - 'table/two_level_iterator.cc', - 'table/two_level_iterator.h', - 'util/arena.cc', - 'util/arena.h', - 'util/cache.cc', - 'util/coding.cc', - 'util/coding.h', - 'util/comparator.cc', - 'util/crc32c.cc', - 'util/crc32c.h', - 'util/env.cc', - 'util/env_chromium.cc', - 'util/env_posix.cc', - 'util/hash.cc', - 'util/hash.h', - 'util/logging.cc', - 'util/logging.h', - 'util/mutexlock.h', - 'util/options.cc', - 'util/random.h', - 'util/status.cc', - ], - 'sources/': [ - ['exclude', '_(android|example|portable|posix)\\.cc$'], - ], - }, - { - 'target_name': 'leveldb_testutil', - 'type': '<(library)', - 'dependencies': [ - 
'../../base/base.gyp:base', - 'leveldb', - ], - 'export_dependent_settings': [ - # The tests use include directories from these projects. - '../../base/base.gyp:base', - 'leveldb', - ], - 'sources': [ - 'util/histogram.cc', - 'util/histogram.h', - 'util/testharness.cc', - 'util/testharness.h', - 'util/testutil.cc', - 'util/testutil.h', - ], - }, - { - 'target_name': 'leveldb_arena_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/arena_test.cc', - ], - }, - { - 'target_name': 'leveldb_cache_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/cache_test.cc', - ], - }, - { - 'target_name': 'leveldb_coding_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/coding_test.cc', - ], - }, - { - 'target_name': 'leveldb_corruption_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/corruption_test.cc', - ], - }, - { - 'target_name': 'leveldb_crc32c_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/crc32c_test.cc', - ], - }, - { - 'target_name': 'leveldb_db_bench', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/db_bench.cc', - ], - }, - { - 'target_name': 'leveldb_db_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/db_test.cc', - ], - }, - { - 'target_name': 'leveldb_dbformat_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/dbformat_test.cc', - ], - }, - { - 'target_name': 'leveldb_env_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'util/env_test.cc', - ], - }, - { - 'target_name': 'leveldb_filename_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/filename_test.cc', - ], - }, - { - 'target_name': 'leveldb_log_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/log_test.cc', - ], - }, - { - 'target_name': 'leveldb_skiplist_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/skiplist_test.cc', - ], - }, - { - 'target_name': 'leveldb_table_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'table/table_test.cc', - ], - }, - { - 'target_name': 'leveldb_version_edit_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/version_edit_test.cc', - ], - }, - { - 'target_name': 'leveldb_write_batch_test', - 'type': 'executable', - 'dependencies': [ - 'leveldb_testutil', - ], - 'sources': [ - 'db/write_batch_test.cc', - ], - }, - ], -} - -# Local Variables: -# tab-width:2 -# indent-tabs-mode:nil -# End: -# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/port/port_chromium.cc b/port/port_chromium.cc deleted file mode 100644 index 7f6de92..0000000 --- a/port/port_chromium.cc +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "port/port_chromium.h" - -#include "util/logging.h" - -#if defined(USE_SNAPPY) -# include "third_party/snappy/src/snappy.h" -#endif - -namespace leveldb { -namespace port { - -Mutex::Mutex() { -} - -Mutex::~Mutex() { -} - -void Mutex::Lock() { - mu_.Acquire(); -} - -void Mutex::Unlock() { - mu_.Release(); -} - -void Mutex::AssertHeld() { - mu_.AssertAcquired(); -} - -CondVar::CondVar(Mutex* mu) - : cv_(&mu->mu_) { -} - -CondVar::~CondVar() { } - -void CondVar::Wait() { - cv_.Wait(); -} - -void CondVar::Signal(){ - cv_.Signal(); -} - -void CondVar::SignalAll() { - cv_.Broadcast(); -} - -bool Snappy_Compress(const char* input, size_t input_length, - std::string* output) { -#if defined(USE_SNAPPY) - output->resize(snappy::MaxCompressedLength(input_length)); - size_t outlen; - snappy::RawCompress(input, input_length, &(*output)[0], &outlen); - output->resize(outlen); - return true; -#else - return false; -#endif -} - -bool Snappy_GetUncompressedLength(const char* input, size_t length, - size_t* result) { -#if defined(USE_SNAPPY) - return snappy::GetUncompressedLength(input_data, input_length, result); -#else - return false; -#endif -} - -bool Snappy_Uncompress(const char* input_data, size_t input_length, - char* output) { -#if defined(USE_SNAPPY) - return snappy::RawUncompress(input_data, input_length, output); -#else - return false; -#endif -} - -} -} diff --git a/port/port_chromium.h b/port/port_chromium.h deleted file mode 100644 index feecd5b..0000000 --- a/port/port_chromium.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ -#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ - -#include -#include -#include -#include "base/atomicops.h" -#include "base/basictypes.h" -#include "base/logging.h" -#include "base/synchronization/condition_variable.h" -#include "base/synchronization/lock.h" - -// Linux's ThreadIdentifier() needs this. -#if defined(OS_LINUX) -# include -#endif - -#if defined(OS_WIN) -#define snprintf _snprintf -#define va_copy(a, b) do { (a) = (b); } while (0) -#endif - -namespace leveldb { -namespace port { - -// Chromium only supports little endian. 
-static const bool kLittleEndian = true; - -class Mutex { - public: - Mutex(); - ~Mutex(); - void Lock(); - void Unlock(); - void AssertHeld(); - - private: - base::Lock mu_; - - friend class CondVar; - DISALLOW_COPY_AND_ASSIGN(Mutex); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - - private: - base::ConditionVariable cv_; - - DISALLOW_COPY_AND_ASSIGN(CondVar); -}; - -class AtomicPointer { - private: - typedef base::subtle::AtomicWord Rep; - Rep rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* p) : rep_(reinterpret_cast(p)) {} - inline void* Acquire_Load() const { - return reinterpret_cast(::base::subtle::Acquire_Load(&rep_)); - } - inline void Release_Store(void* v) { - ::base::subtle::Release_Store(&rep_, reinterpret_cast(v)); - } - inline void* NoBarrier_Load() const { - return reinterpret_cast(::base::subtle::NoBarrier_Load(&rep_)); - } - inline void NoBarrier_Store(void* v) { - ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast(v)); - } -}; - -bool Snappy_Compress(const char* input, size_t input_length, - std::string* output); -bool Snappy_GetUncompressedLength(const char* input, size_t length, - size_t* result); -bool Snappy_Uncompress(const char* input_data, size_t input_length, - char* output); - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_ diff --git a/port/port_posix.h b/port/port_posix.h index ef01de3..9666391 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -7,7 +7,7 @@ #ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ #define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ -#if defined(OS_MACOSX) +#if defined(OS_MACOSX) || defined(OS_FREEBSD) #include #elif defined(OS_SOLARIS) #include @@ -33,13 +33,13 @@ #define IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) #endif -#if defined(OS_MACOSX) || defined(OS_SOLARIS) +#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) #define fread_unlocked fread #define fwrite_unlocked fwrite #define fflush_unlocked fflush #endif -#if defined(OS_MACOSX) +#if defined(OS_MACOSX) || defined(OS_FREEBSD) #define fdatasync fsync #endif diff --git a/util/env_chromium.cc b/util/env_chromium.cc deleted file mode 100644 index 975386b..0000000 --- a/util/env_chromium.cc +++ /dev/null @@ -1,564 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include -#include -#include -#include "base/at_exit.h" -#include "base/file_path.h" -#include "base/file_util.h" -#include "base/lazy_instance.h" -#include "base/memory/ref_counted.h" -#include "base/message_loop.h" -#include "base/platform_file.h" -#include "base/process_util.h" -#include "base/synchronization/lock.h" -#include "base/sys_info.h" -#include "base/task.h" -#include "base/threading/platform_thread.h" -#include "base/threading/thread.h" -#include "base/utf_string_conversions.h" -#include "leveldb/env.h" -#include "leveldb/slice.h" -#include "port/port.h" -#include "util/logging.h" -#include "util/posix_logger.h" - -#if defined(OS_WIN) -#include -#include "base/win/win_util.h" -#endif - -#if defined(OS_MACOSX) || defined(OS_WIN) -// The following are glibc-specific -namespace { - -size_t fread_unlocked(void *ptr, size_t size, size_t n, FILE *file) { - return fread(ptr, size, n, file); -} - -size_t fwrite_unlocked(const void *ptr, size_t size, size_t n, FILE *file) { - return fwrite(ptr, size, n, file); -} - -int fflush_unlocked(FILE *file) { - return fflush(file); -} - -int fdatasync(int fildes) { -#if defined(OS_WIN) - return _commit(fildes); -#else - return fsync(fildes); -#endif -} - -} -#endif - -namespace leveldb { - -namespace { - -class Thread; - -static const ::FilePath::CharType kLevelDBTestDirectoryPrefix[] - = FILE_PATH_LITERAL("leveldb-test-"); - -::FilePath CreateFilePath(const std::string& file_path) { -#if defined(OS_WIN) - return FilePath(UTF8ToUTF16(file_path)); -#else - return FilePath(file_path); -#endif -} - -std::string FilePathToString(const ::FilePath& file_path) { -#if defined(OS_WIN) - return UTF16ToUTF8(file_path.value()); -#else - return file_path.value(); -#endif -} - -// TODO(jorlow): This should be moved into Chromium's base. 
-const char* PlatformFileErrorString(const ::base::PlatformFileError& error) { - switch (error) { - case ::base::PLATFORM_FILE_ERROR_FAILED: - return "Opening file failed."; - case ::base::PLATFORM_FILE_ERROR_IN_USE: - return "File currently in use."; - case ::base::PLATFORM_FILE_ERROR_EXISTS: - return "File already exists."; - case ::base::PLATFORM_FILE_ERROR_NOT_FOUND: - return "File not found."; - case ::base::PLATFORM_FILE_ERROR_ACCESS_DENIED: - return "Access denied."; - case ::base::PLATFORM_FILE_ERROR_TOO_MANY_OPENED: - return "Too many files open."; - case ::base::PLATFORM_FILE_ERROR_NO_MEMORY: - return "Out of memory."; - case ::base::PLATFORM_FILE_ERROR_NO_SPACE: - return "No space left on drive."; - case ::base::PLATFORM_FILE_ERROR_NOT_A_DIRECTORY: - return "Not a directory."; - case ::base::PLATFORM_FILE_ERROR_INVALID_OPERATION: - return "Invalid operation."; - case ::base::PLATFORM_FILE_ERROR_SECURITY: - return "Security error."; - case ::base::PLATFORM_FILE_ERROR_ABORT: - return "File operation aborted."; - case ::base::PLATFORM_FILE_ERROR_NOT_A_FILE: - return "The supplied path was not a file."; - case ::base::PLATFORM_FILE_ERROR_NOT_EMPTY: - return "The file was not empty."; - } - NOTIMPLEMENTED(); - return "Unknown error."; -} - -class ChromiumSequentialFile: public SequentialFile { - private: - std::string filename_; - FILE* file_; - - public: - ChromiumSequentialFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - virtual ~ChromiumSequentialFile() { fclose(file_); } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - Status s; - size_t r = fread_unlocked(scratch, 1, n, file_); - *result = Slice(scratch, r); - if (r < n) { - if (feof(file_)) { - // We leave status as ok if we hit the end of the file - } else { - // A partial read with an error: return a non-ok status - s = Status::IOError(filename_, strerror(errno)); - } - } - return s; - } - - virtual Status Skip(uint64_t n) { - if (fseek(file_, n, SEEK_CUR)) { - return Status::IOError(filename_, strerror(errno)); - } - return Status::OK(); - } -}; - -class ChromiumRandomAccessFile: public RandomAccessFile { - private: - std::string filename_; - ::base::PlatformFile file_; - - public: - ChromiumRandomAccessFile(const std::string& fname, ::base::PlatformFile file) - : filename_(fname), file_(file) { } - virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; - int r = ::base::ReadPlatformFile(file_, offset, scratch, n); - *result = Slice(scratch, (r < 0) ? 
0 : r); - if (r < 0) { - // An error: return a non-ok status - s = Status::IOError(filename_, "Could not preform read"); - } - return s; - } -}; - -class ChromiumWritableFile : public WritableFile { - private: - std::string filename_; - FILE* file_; - - public: - ChromiumWritableFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } - - ~ChromiumWritableFile() { - if (file_ != NULL) { - // Ignoring any potential errors - fclose(file_); - } - } - - virtual Status Append(const Slice& data) { - size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_); - Status result; - if (r != data.size()) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } - - virtual Status Close() { - Status result; - if (fclose(file_) != 0) { - result = Status::IOError(filename_, strerror(errno)); - } - file_ = NULL; - return result; - } - - virtual Status Flush() { - Status result; - if (fflush_unlocked(file_) != 0) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } - - virtual Status Sync() { - Status result; - if ((fflush_unlocked(file_) != 0) || - (fdatasync(fileno(file_)) != 0)) { - result = Status::IOError(filename_, strerror(errno)); - } - return result; - } -}; - -class ChromiumFileLock : public FileLock { - public: - ::base::PlatformFile file_; -}; - -class ChromiumEnv : public Env { - public: - ChromiumEnv(); - virtual ~ChromiumEnv() { - fprintf(stderr, "Destroying Env::Default()\n"); - exit(1); - } - - virtual Status NewSequentialFile(const std::string& fname, - SequentialFile** result) { - FILE* f = fopen(fname.c_str(), "rb"); - if (f == NULL) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } else { - *result = new ChromiumSequentialFile(fname, f); - return Status::OK(); - } - } - - virtual Status NewRandomAccessFile(const std::string& fname, - RandomAccessFile** result) { - int flags = ::base::PLATFORM_FILE_READ | ::base::PLATFORM_FILE_OPEN; - bool created; - ::base::PlatformFileError error_code; - ::base::PlatformFile file = ::base::CreatePlatformFile( - CreateFilePath(fname), flags, &created, &error_code); - if (error_code != ::base::PLATFORM_FILE_OK) { - *result = NULL; - return Status::IOError(fname, PlatformFileErrorString(error_code)); - } - *result = new ChromiumRandomAccessFile(fname, file); - return Status::OK(); - } - - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { - *result = NULL; - FILE* f = fopen(fname.c_str(), "wb"); - if (f == NULL) { - return Status::IOError(fname, strerror(errno)); - } else { - *result = new ChromiumWritableFile(fname, f); - return Status::OK(); - } - } - - virtual bool FileExists(const std::string& fname) { - return ::file_util::PathExists(CreateFilePath(fname)); - } - - virtual Status GetChildren(const std::string& dir, - std::vector* result) { - result->clear(); - ::file_util::FileEnumerator iter( - CreateFilePath(dir), false, ::file_util::FileEnumerator::FILES); - ::FilePath current = iter.Next(); - while (!current.empty()) { - result->push_back(FilePathToString(current.BaseName())); - current = iter.Next(); - } - // TODO(jorlow): Unfortunately, the FileEnumerator swallows errors, so - // we'll always return OK. Maybe manually check for error - // conditions like the file not existing? - return Status::OK(); - } - - virtual Status DeleteFile(const std::string& fname) { - Status result; - // TODO(jorlow): Should we assert this is a file? 
- if (!::file_util::Delete(CreateFilePath(fname), false)) { - result = Status::IOError(fname, "Could not delete file."); - } - return result; - }; - - virtual Status CreateDir(const std::string& name) { - Status result; - if (!::file_util::CreateDirectory(CreateFilePath(name))) { - result = Status::IOError(name, "Could not create directory."); - } - return result; - }; - - virtual Status DeleteDir(const std::string& name) { - Status result; - // TODO(jorlow): Should we assert this is a directory? - if (!::file_util::Delete(CreateFilePath(name), false)) { - result = Status::IOError(name, "Could not delete directory."); - } - return result; - }; - - virtual Status GetFileSize(const std::string& fname, uint64_t* size) { - Status s; - int64_t signed_size; - if (!::file_util::GetFileSize(CreateFilePath(fname), &signed_size)) { - *size = 0; - s = Status::IOError(fname, "Could not determine file size."); - } else { - *size = static_cast(signed_size); - } - return s; - } - - virtual Status RenameFile(const std::string& src, const std::string& dst) { - Status result; - if (!::file_util::ReplaceFile(CreateFilePath(src), CreateFilePath(dst))) { - result = Status::IOError(src, "Could not rename file."); - } - return result; - } - - virtual Status LockFile(const std::string& fname, FileLock** lock) { - *lock = NULL; - Status result; - int flags = ::base::PLATFORM_FILE_OPEN_ALWAYS | - ::base::PLATFORM_FILE_READ | - ::base::PLATFORM_FILE_WRITE | - ::base::PLATFORM_FILE_EXCLUSIVE_READ | - ::base::PLATFORM_FILE_EXCLUSIVE_WRITE; - bool created; - ::base::PlatformFileError error_code; - ::base::PlatformFile file = ::base::CreatePlatformFile( - CreateFilePath(fname), flags, &created, &error_code); - if (error_code != ::base::PLATFORM_FILE_OK) { - result = Status::IOError(fname, PlatformFileErrorString(error_code)); - } else { - ChromiumFileLock* my_lock = new ChromiumFileLock; - my_lock->file_ = file; - *lock = my_lock; - } - return result; - } - - virtual Status UnlockFile(FileLock* lock) { - ChromiumFileLock* my_lock = reinterpret_cast(lock); - Status result; - if (!::base::ClosePlatformFile(my_lock->file_)) { - result = Status::IOError("Could not close lock file."); - } - delete my_lock; - return result; - } - - virtual void Schedule(void (*function)(void*), void* arg); - - virtual void StartThread(void (*function)(void* arg), void* arg); - - virtual std::string UserIdentifier() { -#if defined(OS_WIN) - std::wstring user_sid; - bool ret = ::base::win::GetUserSidString(&user_sid); - DCHECK(ret); - return UTF16ToUTF8(user_sid); -#else - char buf[100]; - snprintf(buf, sizeof(buf), "%d", int(geteuid())); - return buf; -#endif - } - - virtual Status GetTestDirectory(std::string* path) { - mu_.Acquire(); - if (test_directory_.empty()) { - if (!::file_util::CreateNewTempDirectory(kLevelDBTestDirectoryPrefix, - &test_directory_)) { - mu_.Release(); - return Status::IOError("Could not create temp directory."); - } - } - *path = FilePathToString(test_directory_); - mu_.Release(); - return Status::OK(); - } - - // TODO(user,user): Use Chromium's built-in logging? - static uint64_t gettid() { - uint64_t thread_id = 0; - // Coppied from base/logging.cc. 
-#if defined(OS_WIN) - thread_id = GetCurrentThreadId(); -#elif defined(OS_MACOSX) - thread_id = mach_thread_self(); -#elif defined(OS_LINUX) - thread_id = syscall(__NR_gettid); -#elif defined(OS_FREEBSD) || defined(OS_NACL) - // TODO(BSD): find a better thread ID - pthread_t tid = pthread_self(); - memcpy(&thread_id, &tid, min(sizeof(r), sizeof(tid))); -#endif - return thread_id; - } - - virtual Status NewLogger(const std::string& fname, Logger** result) { - FILE* f = fopen(fname.c_str(), "w"); - if (f == NULL) { - *result = NULL; - return Status::IOError(fname, strerror(errno)); - } else { - *result = new PosixLogger(f, &ChromiumEnv::gettid); - return Status::OK(); - } - } - - virtual int AppendLocalTimeToBuffer(char* buffer, size_t size) { - ::base::Time::Exploded t; - ::base::Time::Now().LocalExplode(&t); - return snprintf(buffer, size, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d", - t.year, - t.month, - t.day_of_month, - t.hour, - t.minute, - t.second, - static_cast(t.millisecond) * 1000); - } - - virtual uint64_t NowMicros() { - return ::base::TimeTicks::HighResNow().ToInternalValue(); - } - - virtual void SleepForMicroseconds(int micros) { - // Round up to the next millisecond. - ::base::PlatformThread::Sleep((micros + 999) / 1000); - } - - private: - // BGThread() is the body of the background thread - void BGThread(); - static void BGThreadWrapper(void* arg) { - reinterpret_cast(arg)->BGThread(); - } - - FilePath test_directory_; - - size_t page_size_; - ::base::Lock mu_; - ::base::ConditionVariable bgsignal_; - bool started_bgthread_; - - // Entry per Schedule() call - struct BGItem { void* arg; void (*function)(void*); }; - typedef std::deque BGQueue; - BGQueue queue_; -}; - -ChromiumEnv::ChromiumEnv() - : page_size_(::base::SysInfo::VMAllocationGranularity()), - bgsignal_(&mu_), - started_bgthread_(false) { -#if defined(OS_MACOSX) - ::base::EnableTerminationOnHeapCorruption(); - ::base::EnableTerminationOnOutOfMemory(); -#endif // OS_MACOSX -} - -class Thread : public ::base::PlatformThread::Delegate { - public: - Thread(void (*function)(void* arg), void* arg) - : function_(function), arg_(arg) { - ::base::PlatformThreadHandle handle; - bool success = ::base::PlatformThread::Create(0, this, &handle); - DCHECK(success); - } - virtual ~Thread() {} - virtual void ThreadMain() { - (*function_)(arg_); - delete this; - } - - private: - void (*function_)(void* arg); - void* arg_; -}; - -void ChromiumEnv::Schedule(void (*function)(void*), void* arg) { - mu_.Acquire(); - - // Start background thread if necessary - if (!started_bgthread_) { - started_bgthread_ = true; - StartThread(&ChromiumEnv::BGThreadWrapper, this); - } - - // If the queue is currently empty, the background thread may currently be - // waiting. - if (queue_.empty()) { - bgsignal_.Signal(); - } - - // Add to priority queue - queue_.push_back(BGItem()); - queue_.back().function = function; - queue_.back().arg = arg; - - mu_.Release(); -} - -void ChromiumEnv::BGThread() { - while (true) { - // Wait until there is an item that is ready to run - mu_.Acquire(); - while (queue_.empty()) { - bgsignal_.Wait(); - } - - void (*function)(void*) = queue_.front().function; - void* arg = queue_.front().arg; - queue_.pop_front(); - - mu_.Release(); - (*function)(arg); - } -} - -void ChromiumEnv::StartThread(void (*function)(void* arg), void* arg) { - new Thread(function, arg); // Will self-delete. 
-} - -::base::LazyInstance > - default_env(::base::LINKER_INITIALIZED); - -} - -Env* Env::Default() { - return default_env.Pointer(); -} - -} -- cgit v1.2.3 From f185ab12bb3aa03e8e4e6caa1b9eae54d5ce272f Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Wed, 27 Jul 2011 01:49:08 +0000 Subject: Adding doctype to benchmark page so Google Code displays it as HTML. git-svn-id: http://leveldb.googlecode.com/svn/trunk@41 62dab493-f737-651d-591e-8d6aee1b9529 --- doc/benchmark.html | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/benchmark.html b/doc/benchmark.html index f07c04b..b84f171 100644 --- a/doc/benchmark.html +++ b/doc/benchmark.html @@ -1,3 +1,4 @@ + LevelDB Benchmarks -- cgit v1.2.3 -- cgit v1.2.3 From 9ac2fffbf9cb93afb3b1db9010c6e4f5fcc73727 Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Wed, 27 Jul 2011 04:39:46 +0000 Subject: Minor edit in benchmark page. (Baseline comparison does not make sense for large values.) git-svn-id: http://leveldb.googlecode.com/svn/trunk@43 62dab493-f737-651d-591e-8d6aee1b9529 --- doc/benchmark.html | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/doc/benchmark.html b/doc/benchmark.html index b84f171..6a79bc7 100644 --- a/doc/benchmark.html +++ b/doc/benchmark.html @@ -176,34 +176,28 @@ parameters are varied. For the baseline:

A. Large Values

For this benchmark, we start with an empty database, and write 100,000 byte values (~50% compressible). To keep the benchmark running time reasonable, we stop after writing 1000 values.

Sequential Writes

- +
- + - - + - - +
LevelDB 1,060 ops/sec
 
-
(1.17x baseline)
 
Kyoto TreeDB 1,020 ops/sec
 
(2.57x baseline)
 
SQLite3 2,910 ops/sec
 
(93.3x baseline)
 

Random Writes

- +
- - + - - + - - +
LevelDB 480 ops/sec
 
(2.52x baseline)
 
Kyoto TreeDB 1,100 ops/sec
 
(10.72x baseline)
 
SQLite3 2,200 ops/sec
 
(4,516x baseline)
 

LevelDB doesn't perform as well with large values of 100,000 bytes each. This is because LevelDB writes keys and values at least twice: first time to the transaction log, and second time (during a compaction) to a sorted file. With larger values, LevelDB's per-operation efficiency is swamped by the -- cgit v1.2.3 From 7d5b369d3e35aa61ac83a1d11425ab592a921ae7 Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Wed, 27 Jul 2011 14:29:59 +0000 Subject: Minor typos in benchmark page. git-svn-id: http://leveldb.googlecode.com/svn/trunk@44 62dab493-f737-651d-591e-8d6aee1b9529 --- doc/benchmark.html | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/benchmark.html b/doc/benchmark.html index 6a79bc7..f842118 100644 --- a/doc/benchmark.html +++ b/doc/benchmark.html @@ -103,7 +103,7 @@ div.bsql {

1. Baseline Performance

-

This section gives the baseline performance of a all of the +

This section gives the baseline performance of all the databases. Following sections show how performance changes as various parameters are varied. For the baseline:

    @@ -234,7 +234,7 @@ of sequential writes. However SQLite3 sees a significant slowdown writes. This is because each random batch write in SQLite3 has to update approximately as many pages as there are keys in the batch.

    -

    C. Synchronous writes

    +

    C. Synchronous Writes

    In the following benchmark, we enable the synchronous writing modes of all of the databases. Since this change significantly slows down the benchmark, we stop after 10,000 writes.

    @@ -329,7 +329,7 @@ better without compression than with compression. Presumably this is because TreeDB's compression library (LZO) is more expensive than LevelDB's compression library (Snappy).

    -

    E. Using more memory

    +

    E. Using More Memory

    We increased the overall cache size for each database to 128 MB. For LevelDB, we partitioned 128 MB into a 120 MB write buffer and 8 MB of cache (up from 2 MB of write buffer and 2 MB of cache). For SQLite3, we kept the page size at 1024 bytes, but increased the number of pages to 131,072 (up from 4096). For TreeDB, we also kept the page size at 1024 bytes, but increased the cache size to 128 MB (up from 4 MB).

    Sequential Writes

    @@ -370,8 +370,8 @@ because a larger write buffer reduces the need to merge sorted files performance goes up because the entire database is available in memory for fast in-place updates.

    -

    2. Read Performance under Different Configurations

    -

    A. Larger caches

    +

    3. Read Performance under Different Configurations

    +

    A. Larger Caches

    We increased the overall memory usage to 128 MB for each database. For LevelDB, we allocated 8 MB to LevelDB's write buffer and 120 MB to LevelDB's cache. The other databases don't differentiate between a @@ -414,7 +414,7 @@ when the caches are enlarged. In particular, TreeDB seems to make very effective use of a cache that is large enough to hold the entire database.

    -

    B. No compression reads

    +

    B. No Compression Reads

    For this benchmark, we populated a database with 1 million entries consisting of 16 byte keys and 100 byte values. We compiled LevelDB and Kyoto Cabinet without compression support, so results that are read out from the database are already uncompressed. We've listed the SQLite3 baseline read performance as a point of comparison.

    Sequential Reads

    -- cgit v1.2.3 From 3b3476afcc7c1cc788419a0fdcf13eb8d0e6b525 Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Fri, 29 Jul 2011 21:35:05 +0000 Subject: Improved benchmark, fixed bugs and SQLite parameters. - Based on suggestions on the sqlite-users mailing list, we removed the superfluous index on the primary key for SQLite's benchmarks, and turned write-ahead logging ("WAL") on. This led to performance improvements for SQLite. - Based on a suggestion by Florian Weimer on the leveldb mailing list, we disabled hard drive write-caching via hdparm when testing synchronous writes. This led to performance losses for LevelDB and Kyoto TreeDB. - Fixed a mistake in 2.A.->Random where the bar sizes were switched for Kyoto TreeDB and SQLite. git-svn-id: http://leveldb.googlecode.com/svn/trunk@45 62dab493-f737-651d-591e-8d6aee1b9529 --- doc/bench/db_bench_sqlite3.cc | 10 +++- doc/benchmark.html | 126 +++++++++++++++++++++--------------------- 2 files changed, 69 insertions(+), 67 deletions(-) diff --git a/doc/bench/db_bench_sqlite3.cc b/doc/bench/db_bench_sqlite3.cc index a6f9a75..a15510e 100644 --- a/doc/bench/db_bench_sqlite3.cc +++ b/doc/bench/db_bench_sqlite3.cc @@ -74,7 +74,7 @@ static bool FLAGS_use_existing_db = false; static bool FLAGS_transaction = true; // If true, we enable Write-Ahead Logging -static bool FLAGS_WAL_enabled = false; +static bool FLAGS_WAL_enabled = true; inline static void ExecErrorCheck(int status, char *err_msg) { @@ -448,16 +448,20 @@ class Benchmark { // Change journal mode to WAL if WAL enabled flag is on if (FLAGS_WAL_enabled) { std::string WAL_stmt = "PRAGMA journal_mode = WAL"; + + // LevelDB's default cache size is a combined 4 MB + std::string WAL_checkpoint = "PRAGMA wal_autocheckpoint = 4096"; status = sqlite3_exec(db_, WAL_stmt.c_str(), NULL, NULL, &err_msg); ExecErrorCheck(status, err_msg); + status = sqlite3_exec(db_, WAL_checkpoint.c_str(), NULL, NULL, &err_msg); + ExecErrorCheck(status, err_msg); } // Change locking mode to exclusive and create tables/index for database std::string locking_stmt = "PRAGMA locking_mode = EXCLUSIVE"; std::string create_stmt = "CREATE TABLE test (key blob, value blob, PRIMARY KEY(key))"; - std::string index_stmt = "CREATE INDEX keyindex ON test (key)"; - std::string stmt_array[] = { locking_stmt, create_stmt, index_stmt }; + std::string stmt_array[] = { locking_stmt, create_stmt }; int stmt_array_length = sizeof(stmt_array) / sizeof(std::string); for (int i = 0; i < stmt_array_length; i++) { status = sqlite3_exec(db_, stmt_array[i].c_str(), NULL, NULL, &err_msg); diff --git a/doc/benchmark.html b/doc/benchmark.html index f842118..a0d6b02 100644 --- a/doc/benchmark.html +++ b/doc/benchmark.html @@ -85,7 +85,7 @@ div.bsql {

    In order to test LevelDB's performance, we benchmark it against other well-established database implementations. We compare LevelDB (revision 39) against SQLite3 (version 3.7.6.3) and Kyoto Cabinet's (version 1.2.67) TreeDB (a B+Tree based key-value store). We would like to acknowledge Scott Hess and Mikio Hirabayashi for their suggestions and contributions to the SQLite3 and Kyoto Cabinet benchmarks, respectively.

    -

    Benchmarks were all performed on a six-core Intel(R) Xeon(R) CPU X5650 @ 2.67GHz, with 12288 KB of total L3 cache and 12 GB of DDR3 RAM at 1333 MHz. (Note that LevelDB uses at most two CPUs since the benchmarks are single threaded: one to run the benchmark, and one for background compactions.) We ran the benchmarks on two machines (with identical processors), one with an Ext3 file system and one with an Ext4 file system. The machine with the Ext3 file system has a SATA Hitachi HDS721050CLA362 hard drive. The machine with the Ext4 file system has a SATA Samsung HD502HJ hard drive. Both hard drives spin at 7200 RPM. The numbers reported below are the median of three measurements.

    +

    Benchmarks were all performed on a six-core Intel(R) Xeon(R) CPU X5650 @ 2.67GHz, with 12288 KB of total L3 cache and 12 GB of DDR3 RAM at 1333 MHz. (Note that LevelDB uses at most two CPUs since the benchmarks are single threaded: one to run the benchmark, and one for background compactions.) We ran the benchmarks on two machines (with identical processors), one with an Ext3 file system and one with an Ext4 file system. The machine with the Ext3 file system has a SATA Hitachi HDS721050CLA362 hard drive. The machine with the Ext4 file system has a SATA Samsung HD502HJ hard drive. Both hard drives spin at 7200 RPM and have hard drive write-caching enabled (using `hdparm -W 1 [device]`). The numbers reported below are the median of three measurements.

    Benchmark Source Code

    We wrote benchmark tools for SQLite and Kyoto TreeDB based on LevelDB's db_bench. The code for each of the benchmarks resides here:

    @@ -97,9 +97,9 @@ div.bsql {

    Custom Build Specifications

      -
    • LevelDB: LevelDB was compiled with the tcmalloc library and the Snappy compression library. Assertions were disabled.
    • -
    • TreeDB: TreeDB was compiled using the LZO compression library. Furthermore, we enabled the TSMALL and TLINEAR options when opening the database in order to reduce the footprint of each record.
    • -
    • SQLite: We tuned SQLite's performance, by setting its locking mode to exclusive. We left SQLite's write-ahead logging disabled since that is the default configuration. (Enabling write-ahead-logging improves SQLite's write performance by roughly 30%, but the character of the comparisons below does not change significantly.)
    • +
    • LevelDB: LevelDB was compiled with the tcmalloc library and the Snappy compression library (revision 33). Assertions were disabled.
    • +
    • TreeDB: TreeDB was compiled using the LZO compression library (version 2.03). Furthermore, we enabled the TSMALL and TLINEAR options when opening the database in order to reduce the footprint of each record.
    • +
    • SQLite: We tuned SQLite's performance, by setting its locking mode to exclusive. We also enabled SQLite's write-ahead logging.

    1. Baseline Performance

    @@ -130,8 +130,8 @@ parameters are varied. For the baseline:

    - - + +
    1,010,000 ops/sec
     
    SQLite3186,000 ops/sec
     
    174,000 ops/sec
     

    B. Random Reads

    @@ -142,8 +142,8 @@ parameters are varied. For the baseline:

    - - + +
    151,000 ops/sec
     
    SQLite3146,000 ops/sec
     
    134,000 ops/sec
     

    C. Sequential Writes

    @@ -154,8 +154,8 @@ parameters are varied. For the baseline:

    - - + +
    342,000 ops/sec
     
    SQLite326,900 ops/sec
     
    48,600 ops/sec
     

    D. Random Writes

    @@ -166,8 +166,8 @@ parameters are varied. For the baseline:

    - - + +
    88,500 ops/sec
     
    SQLite3420 ops/sec
     
    9,860 ops/sec
     

    LevelDB outperforms both SQLite3 and TreeDB in sequential and random write operations and sequential read operations. Kyoto Cabinet has the fastest random read operations.

    @@ -178,26 +178,26 @@ parameters are varied. For the baseline:

    Sequential Writes

    - - + + - - + + - +
    LevelDB1,060 ops/sec
     
    1,100 ops/sec
     
    Kyoto TreeDB1,020 ops/sec
     
    1,000 ops/sec
     
    SQLite32,910 ops/sec1,600 ops/sec
     

    Random Writes

    - + - + - - + +
    LevelDB 480 ops/sec
     
     
    Kyoto TreeDB 1,100 ops/sec
     
     
    SQLite32,200 ops/sec
     
    1,600 ops/sec
     

    LevelDB doesn't perform as well with large values of 100,000 bytes each. This is because LevelDB writes keys and values at least twice: first time to the transaction log, and second time (during a compaction) to a sorted file. With larger values, LevelDB's per-operation efficiency is swamped by the @@ -211,9 +211,9 @@ cost of extra copies of large values.

     
    (1.08x baseline) SQLite3 - 100,000 entries/sec -
     
    - (3.72x baseline) + 124,000 entries/sec +
     
    + (2.55x baseline)

    Random Writes

    @@ -222,22 +222,20 @@ cost of extra copies of large values.

    - - - + + +
     
    (1.35x baseline)
    SQLite31,000 entries/sec
     
    (2.38x baseline)
    22,000 entries/sec
     
    (2.23x baseline)

    Because of the way LevelDB persistent storage is organized, batches of random writes are not much slower (only a factor of 4x) than batches -of sequential writes. However SQLite3 sees a significant slowdown -(factor of 100x) when switching from sequential to random batch -writes. This is because each random batch write in SQLite3 has to -update approximately as many pages as there are keys in the batch.

    +of sequential writes.

    C. Synchronous Writes

    In the following benchmark, we enable the synchronous writing modes of all of the databases. Since this change significantly slows down the -benchmark, we stop after 10,000 writes.

    +benchmark, we stop after 10,000 writes. For synchronous write tests, we've +disabled hard drive write-caching (using `hdparm -W 0 [device]`).

    • For LevelDB, we set WriteOptions.sync = true.
    • In TreeDB, we enabled TreeDB's OAUTOSYNC option.
    • @@ -246,32 +244,32 @@ benchmark, we stop after 10,000 writes.

      Sequential Writes

      - + - - + + - - - + + +
      LevelDB2,400 ops/sec100 ops/sec
       
      (0.003x baseline)
      Kyoto TreeDB140 ops/sec
       
      7 ops/sec
       
      (0.0004x baseline)
      SQLite3430 ops/sec
       
      (0.016x baseline)
      88 ops/sec
       
      (0.002x baseline)

      Random Writes

      - + - - + + - - - + + +
      LevelDB2,400 ops/sec100 ops/sec
       
      (0.015x baseline)
      Kyoto TreeDB100 ops/sec
       
      8 ops/sec
       
      (0.001x baseline)
      SQLite3110 ops/sec
       
      (0.26x baseline)
      88 ops/sec
       
      (0.009x baseline)

      Also see the ext4 performance numbers below.

      @@ -300,8 +298,8 @@ its baseline measurements):

      (1.42x baseline)
-     SQLite3: 26,900 ops/sec
+     SQLite3: 48,600 ops/sec
      (1.00x baseline)

      Random Writes

      @@ -315,8 +313,8 @@ its baseline measurements):

       
      (1.80x baseline)
-     SQLite3: 420 ops/sec
+     SQLite3: 9,860 ops/sec
      (1.00x baseline)

      @@ -342,9 +340,9 @@ LevelDB's compression library (Snappy).

       
      (0.94x baseline)
-     SQLite3: 26,200 ops/sec
-     (0.97x baseline)
+     SQLite3: 48,500 ops/sec
+     (1.00x baseline)

      Random Writes

      @@ -357,9 +355,9 @@ LevelDB's compression library (Snappy).

      (3.21x baseline)
-     SQLite3: 450 ops/sec
-     (1.07x baseline)
+     SQLite3: 9,670 ops/sec
+     (0.98x baseline)

      SQLite's performance does not change substantially when compared to

      @@ -388,9 +386,9 @@ MB.

      (1.06x baseline)
-     SQLite3: 221,000 ops/sec
-     (1.19x baseline)
+     SQLite3: 210,000 ops/sec
+     (1.20x baseline)

      Random Reads

      @@ -404,9 +402,9 @@ MB.

      (3.07x baseline)
-     SQLite3: 197,000 ops/sec
-     (1.35x baseline)
+     SQLite3: 186,000 ops/sec
+     (1.39x baseline)
      As expected, the read performance of all of the databases increases

      @@ -427,7 +425,7 @@ database.

      (3.60x baseline)
-     SQLite3: 186,000 ops/sec
+     SQLite3: 174,000 ops/sec
      (1.00x baseline)

      @@ -442,8 +440,8 @@ database.

      (1.16x baseline)
-     SQLite3: 146,000 ops/sec
+     SQLite3: 134,000 ops/sec
      (1.00x baseline)

      @@ -453,7 +451,7 @@ performance may very well be better with compression
      if it allows more of the working set to fit in memory.

      Note about Ext4 Filesystems

-     The preceding numbers are for an ext3 file system. Synchronous writes are much slower under ext4 (LevelDB drops to ~34 writes / second, TreeDB drops to ~5 writes / second; SQLite3 drops to ~24 writes / second) due to ext4's different handling of fsync / msync calls. Even LevelDB's asynchronous write performance drops somewhat since it spreads its storage across multiple files and issues fsync calls when switching to a new file.

+     The preceding numbers are for an ext3 file system. Synchronous writes are much slower under ext4 (LevelDB drops to ~31 writes / second and TreeDB drops to ~5 writes / second; SQLite3's synchronous writes do not noticeably drop) due to ext4's different handling of fsync / msync calls. Even LevelDB's asynchronous write performance drops somewhat since it spreads its storage across multiple files and issues fsync calls when switching to a new file.

      Acknowledgements

      Jeff Dean and Sanjay Ghemawat wrote LevelDB. Kevin Tseng wrote and compiled these benchmarks. Mikio Hirabayashi, Scott Hess, and Gabor Cselle provided help and advice.

      -- cgit v1.2.3 From d2bd50ef02756a6a92bd6d7c65c045f3c7297090 Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Fri, 5 Aug 2011 20:40:49 +0000 Subject: C binding for leveldb, better readseq benchmark for SQLite. - Added a C binding for LevelDB. May be useful as a stable ABI that can be used by programs that keep leveldb in a shared library, or for JNI API. - Replaced SQLite's readseq benchmark to a more efficient version. SQLite readseq speeds increased by about a factor of 2x from the previous version. Also updated benchmark page to reflect readseq speed up. git-svn-id: http://leveldb.googlecode.com/svn/trunk@46 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 16 ++ db/c.cc | 453 ++++++++++++++++++++++++++++++++++++++++++ db/c_test.c | 295 +++++++++++++++++++++++++++ doc/bench/db_bench_sqlite3.cc | 26 ++- doc/benchmark.html | 14 +- include/leveldb/c.h | 246 +++++++++++++++++++++++ include/leveldb/db.h | 1 + 7 files changed, 1036 insertions(+), 15 deletions(-) create mode 100644 db/c.cc create mode 100644 db/c_test.c create mode 100644 include/leveldb/c.h diff --git a/Makefile b/Makefile index d198d9d..86e24e8 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,7 @@ LDFLAGS=$(PLATFORM_LDFLAGS) $(SNAPPY_LDFLAGS) $(GOOGLE_PERFTOOLS_LDFLAGS) LIBOBJECTS = \ ./db/builder.o \ + ./db/c.o \ ./db/db_impl.o \ ./db/db_iter.o \ ./db/filename.o \ @@ -81,6 +82,7 @@ TESTHARNESS = ./util/testharness.o $(TESTUTIL) TESTS = \ arena_test \ + c_test \ cache_test \ coding_test \ corruption_test \ @@ -127,6 +129,9 @@ db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ +c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CC) $(LDFLAGS) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ @@ -182,8 +187,19 @@ IOSVERSION=$(shell defaults read /Developer/Platforms/iPhoneOS.platform/version mkdir -p ios-arm/$(dir $@) $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ + +.c.o: + mkdir -p ios-x86/$(dir $@) + $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 $< -o ios-x86/$@ + mkdir -p ios-arm/$(dir $@) + $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ + lipo ios-x86/$@ ios-arm/$@ -create -output $@ + else .cc.o: $(CC) $(CFLAGS) $< -o $@ + +.c.o: + $(CC) $(CFLAGS) $< -o $@ endif diff --git a/db/c.cc b/db/c.cc new file mode 100644 index 0000000..ee8a472 --- /dev/null +++ b/db/c.cc @@ -0,0 +1,453 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "leveldb/c.h" + +#include +#include +#include "leveldb/cache.h" +#include "leveldb/comparator.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "leveldb/options.h" +#include "leveldb/status.h" +#include "leveldb/write_batch.h" + +namespace leveldb { + +extern "C" { + +struct leveldb_t { DB* rep; }; +struct leveldb_iterator_t { Iterator* rep; }; +struct leveldb_writebatch_t { WriteBatch rep; }; +struct leveldb_snapshot_t { const Snapshot* rep; }; +struct leveldb_readoptions_t { ReadOptions rep; }; +struct leveldb_writeoptions_t { WriteOptions rep; }; +struct leveldb_options_t { Options rep; }; +struct leveldb_cache_t { Cache* rep; }; +struct leveldb_seqfile_t { SequentialFile* rep; }; +struct leveldb_randomfile_t { RandomAccessFile* rep; }; +struct leveldb_writablefile_t { WritableFile* rep; }; +struct leveldb_logger_t { Logger* rep; }; +struct leveldb_filelock_t { FileLock* rep; }; + +struct leveldb_comparator_t : public Comparator { + void* state_; + void (*destructor_)(void*); + int (*compare_)( + void*, + const char* a, size_t alen, + const char* b, size_t blen); + const char* (*name_)(void*); + + virtual ~leveldb_comparator_t() { + (*destructor_)(state_); + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return (*compare_)(state_, a.data(), a.size(), b.data(), b.size()); + } + + virtual const char* Name() const { + return (*name_)(state_); + } + + // No-ops since the C binding does not support key shortening methods. + virtual void FindShortestSeparator(std::string*, const Slice&) const { } + virtual void FindShortSuccessor(std::string* key) const { } +}; + +struct leveldb_env_t { + Env* rep; + bool is_default; +}; + +static bool SaveError(char** errptr, const Status& s) { + assert(errptr != NULL); + if (s.ok()) { + return false; + } else if (*errptr == NULL) { + *errptr = strdup(s.ToString().c_str()); + } else { + // TODO(sanjay): Merge with existing error? 
+ free(*errptr); + *errptr = strdup(s.ToString().c_str()); + } + return true; +} + +static char* CopyString(const std::string& str) { + char* result = reinterpret_cast(malloc(sizeof(char) * str.size())); + memcpy(result, str.data(), sizeof(char) * str.size()); + return result; +} + +leveldb_t* leveldb_open( + const leveldb_options_t* options, + const char* name, + char** errptr) { + DB* db; + if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) { + return NULL; + } + leveldb_t* result = new leveldb_t; + result->rep = db; + return result; +} + +void leveldb_close(leveldb_t* db) { + delete db->rep; + delete db; +} + +void leveldb_put( + leveldb_t* db, + const leveldb_writeoptions_t* options, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, + db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))); +} + +void leveldb_delete( + leveldb_t* db, + const leveldb_writeoptions_t* options, + const char* key, size_t keylen, + char** errptr) { + SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen))); +} + + +void leveldb_write( + leveldb_t* db, + const leveldb_writeoptions_t* options, + leveldb_writebatch_t* batch, + char** errptr) { + SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); +} + +char* leveldb_get( + leveldb_t* db, + const leveldb_readoptions_t* options, + const char* key, size_t keylen, + size_t* vallen, + char** errptr) { + char* result = NULL; + std::string tmp; + Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +leveldb_iterator_t* leveldb_create_iterator( + leveldb_t* db, + const leveldb_readoptions_t* options) { + leveldb_iterator_t* result = new leveldb_iterator_t; + result->rep = db->rep->NewIterator(options->rep); + return result; +} + +const leveldb_snapshot_t* leveldb_create_snapshot( + leveldb_t* db) { + leveldb_snapshot_t* result = new leveldb_snapshot_t; + result->rep = db->rep->GetSnapshot(); + return result; +} + +void leveldb_release_snapshot( + leveldb_t* db, + const leveldb_snapshot_t* snapshot) { + db->rep->ReleaseSnapshot(snapshot->rep); + delete snapshot; +} + +const char* leveldb_property_value( + leveldb_t* db, + const char* propname) { + std::string tmp; + if (db->rep->GetProperty(Slice(propname), &tmp)) { + return CopyString(tmp); + } else { + return NULL; + } +} + +void leveldb_approximate_sizes( + leveldb_t* db, + int num_ranges, + const char* const* range_start_key, const size_t* range_start_key_len, + const char* const* range_limit_key, const size_t* range_limit_key_len, + uint64_t* sizes) { + Range* ranges = new Range[num_ranges]; + for (int i = 0; i < num_ranges; i++) { + ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); + ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); + } + db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + delete[] ranges; +} + +void leveldb_destroy_db( + const leveldb_options_t* options, + const char* name, + char** errptr) { + SaveError(errptr, DestroyDB(name, options->rep)); +} + +void leveldb_repair_db( + const leveldb_options_t* options, + const char* name, + char** errptr) { + SaveError(errptr, RepairDB(name, options->rep)); +} + +void leveldb_iter_destroy(leveldb_iterator_t* iter) { + delete iter->rep; + delete iter; +} + +unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) { 
+ return iter->rep->Valid(); +} + +void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) { + iter->rep->SeekToFirst(); +} + +void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) { + iter->rep->SeekToLast(); +} + +void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) { + iter->rep->Seek(Slice(k, klen)); +} + +void leveldb_iter_next(leveldb_iterator_t* iter) { + iter->rep->Next(); +} + +void leveldb_iter_prev(leveldb_iterator_t* iter) { + iter->rep->Prev(); +} + +const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) { + Slice s = iter->rep->key(); + *klen = s.size(); + return s.data(); +} + +const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) { + Slice s = iter->rep->value(); + *vlen = s.size(); + return s.data(); +} + +void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) { + SaveError(errptr, iter->rep->status()); +} + +leveldb_writebatch_t* leveldb_writebatch_create() { + return new leveldb_writebatch_t; +} + +void leveldb_writebatch_destroy(leveldb_writebatch_t* b) { + delete b; +} + +void leveldb_writebatch_clear(leveldb_writebatch_t* b) { + b->rep.Clear(); +} + +void leveldb_writebatch_put( + leveldb_writebatch_t* b, + const char* key, size_t klen, + const char* val, size_t vlen) { + b->rep.Put(Slice(key, klen), Slice(val, vlen)); +} + +void leveldb_writebatch_delete( + leveldb_writebatch_t* b, + const char* key, size_t klen) { + b->rep.Delete(Slice(key, klen)); +} + +void leveldb_writebatch_iterate( + leveldb_writebatch_t* b, + void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)) { + class H : public WriteBatch::Handler { + public: + void* state_; + void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen); + void (*deleted_)(void*, const char* k, size_t klen); + virtual void Put(const Slice& key, const Slice& value) { + (*put_)(state_, key.data(), key.size(), value.data(), value.size()); + } + virtual void Delete(const Slice& key) { + (*deleted_)(state_, key.data(), key.size()); + } + }; + H handler; + handler.state_ = state; + handler.put_ = put; + handler.deleted_ = deleted; + b->rep.Iterate(&handler); +} + +leveldb_options_t* leveldb_options_create() { + return new leveldb_options_t; +} + +void leveldb_options_destroy(leveldb_options_t* options) { + delete options; +} + +void leveldb_options_set_comparator( + leveldb_options_t* opt, + leveldb_comparator_t* cmp) { + opt->rep.comparator = cmp; +} + +void leveldb_options_set_create_if_missing( + leveldb_options_t* opt, unsigned char v) { + opt->rep.create_if_missing = v; +} + +void leveldb_options_set_error_if_exists( + leveldb_options_t* opt, unsigned char v) { + opt->rep.error_if_exists = v; +} + +void leveldb_options_set_paranoid_checks( + leveldb_options_t* opt, unsigned char v) { + opt->rep.paranoid_checks = v; +} + +void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) { + opt->rep.env = (env ? env->rep : NULL); +} + +void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) { + opt->rep.info_log = (l ? 
l->rep : NULL); +} + +void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) { + opt->rep.write_buffer_size = s; +} + +void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) { + opt->rep.max_open_files = n; +} + +void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) { + opt->rep.block_cache = c->rep; +} + +void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) { + opt->rep.block_size = s; +} + +void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) { + opt->rep.block_restart_interval = n; +} + +void leveldb_options_set_compression(leveldb_options_t* opt, int t) { + opt->rep.compression = static_cast(t); +} + +leveldb_comparator_t* leveldb_comparator_create( + void* state, + void (*destructor)(void*), + int (*compare)( + void*, + const char* a, size_t alen, + const char* b, size_t blen), + const char* (*name)(void*)) { + leveldb_comparator_t* result = new leveldb_comparator_t; + result->state_ = state; + result->destructor_ = destructor; + result->compare_ = compare; + result->name_ = name; + return result; +} + +void leveldb_comparator_destroy(leveldb_comparator_t* cmp) { + delete cmp; +} + +leveldb_readoptions_t* leveldb_readoptions_create() { + return new leveldb_readoptions_t; +} + +void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) { + delete opt; +} + +void leveldb_readoptions_set_verify_checksums( + leveldb_readoptions_t* opt, + unsigned char v) { + opt->rep.verify_checksums = v; +} + +void leveldb_readoptions_set_fill_cache( + leveldb_readoptions_t* opt, unsigned char v) { + opt->rep.fill_cache = v; +} + +void leveldb_readoptions_set_snapshot( + leveldb_readoptions_t* opt, + const leveldb_snapshot_t* snap) { + opt->rep.snapshot = (snap ? snap->rep : NULL); +} + +leveldb_writeoptions_t* leveldb_writeoptions_create() { + return new leveldb_writeoptions_t; +} + +void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) { + delete opt; +} + +void leveldb_writeoptions_set_sync( + leveldb_writeoptions_t* opt, unsigned char v) { + opt->rep.sync = v; +} + +leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) { + leveldb_cache_t* c = new leveldb_cache_t; + c->rep = NewLRUCache(capacity); + return c; +} + +void leveldb_cache_destroy(leveldb_cache_t* cache) { + delete cache->rep; + delete cache; +} + +leveldb_env_t* leveldb_create_default_env() { + leveldb_env_t* result = new leveldb_env_t; + result->rep = Env::Default(); + result->is_default = true; + return result; +} + +void leveldb_env_destroy(leveldb_env_t* env) { + if (!env->is_default) delete env->rep; + delete env; +} + +} // end extern "C" + +} diff --git a/db/c_test.c b/db/c_test.c new file mode 100644 index 0000000..9fef325 --- /dev/null +++ b/db/c_test.c @@ -0,0 +1,295 @@ +/* Copyright (c) 2011 The LevelDB Authors. All rights reserved. + Use of this source code is governed by a BSD-style license that can be + found in the LICENSE file. See the AUTHORS file for names of contributors. 
*/ + +#include "leveldb/c.h" + +#include +#include +#include +#include +#include +#include + +const char* phase = ""; +static char dbname[200]; + +static void StartPhase(const char* name) { + fprintf(stderr, "=== Test %s\n", name); + phase = name; +} + +#define CheckNoError(err) \ + if ((err) != NULL) { \ + fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \ + abort(); \ + } + +#define CheckCondition(cond) \ + if (!(cond)) { \ + fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \ + abort(); \ + } + +static void CheckEqual(const char* expected, const char* v, size_t n) { + if (expected == NULL && v == NULL) { + // ok + } else if (expected != NULL && v != NULL && n == strlen(expected) && + memcmp(expected, v, n) == 0) { + // ok + return; + } else { + fprintf(stderr, "%s: expected '%s', got '%s'\n", + phase, + (expected ? expected : "(null)"), + (v ? v : "(null")); + abort(); + } +} + +static void Free(char** ptr) { + if (*ptr) { + free(*ptr); + *ptr = NULL; + } +} + +static void CheckGet( + leveldb_t* db, + const leveldb_readoptions_t* options, + const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = leveldb_get(db, options, key, strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckIter(leveldb_iterator_t* iter, + const char* key, const char* val) { + size_t len; + const char* str; + str = leveldb_iter_key(iter, &len); + CheckEqual(key, str, len); + str = leveldb_iter_value(iter, &len); + CheckEqual(val, str, len); +} + +// Callback from leveldb_writebatch_iterate() +static void CheckPut(void* ptr, + const char* k, size_t klen, + const char* v, size_t vlen) { + int* state = (int*) ptr; + CheckCondition(*state < 2); + switch (*state) { + case 0: + CheckEqual("bar", k, klen); + CheckEqual("b", v, vlen); + break; + case 1: + CheckEqual("box", k, klen); + CheckEqual("c", v, vlen); + break; + } + (*state)++; +} + +// Callback from leveldb_writebatch_iterate() +static void CheckDel(void* ptr, const char* k, size_t klen) { + int* state = (int*) ptr; + CheckCondition(*state == 2); + CheckEqual("bar", k, klen); + (*state)++; +} + +static void CmpDestroy(void* arg) { } + +static int CmpCompare(void* arg, const char* a, size_t alen, + const char* b, size_t blen) { + int n = (alen < blen) ? 
alen : blen; + int r = memcmp(a, b, n); + if (r == 0) { + if (alen < blen) r = -1; + else if (alen > blen) r = +1; + } + return r; +} + +static const char* CmpName(void* arg) { + return "foo"; +} + +int main(int argc, char** argv) { + leveldb_t* db; + leveldb_comparator_t* cmp; + leveldb_cache_t* cache; + leveldb_env_t* env; + leveldb_options_t* options; + leveldb_readoptions_t* roptions; + leveldb_writeoptions_t* woptions; + char* err = NULL; + + snprintf(dbname, sizeof(dbname), "/tmp/leveldb_c_test-%d", + ((int) geteuid())); + + StartPhase("create_objects"); + cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); + env = leveldb_create_default_env(); + cache = leveldb_cache_create_lru(100000); + + options = leveldb_options_create(); + leveldb_options_set_comparator(options, cmp); + leveldb_options_set_error_if_exists(options, 1); + leveldb_options_set_cache(options, cache); + leveldb_options_set_env(options, env); + leveldb_options_set_info_log(options, NULL); + leveldb_options_set_write_buffer_size(options, 100000); + leveldb_options_set_paranoid_checks(options, 1); + leveldb_options_set_max_open_files(options, 10); + leveldb_options_set_block_size(options, 1024); + leveldb_options_set_block_restart_interval(options, 8); + leveldb_options_set_compression(options, leveldb_no_compression); + + roptions = leveldb_readoptions_create(); + leveldb_readoptions_set_verify_checksums(roptions, 1); + leveldb_readoptions_set_fill_cache(roptions, 0); + + woptions = leveldb_writeoptions_create(); + leveldb_writeoptions_set_sync(woptions, 1); + + StartPhase("destroy"); + leveldb_destroy_db(options, dbname, &err); + Free(&err); + + StartPhase("open_error"); + db = leveldb_open(options, dbname, &err); + CheckCondition(err != NULL); + Free(&err); + + StartPhase("open"); + leveldb_options_set_create_if_missing(options, 1); + db = leveldb_open(options, dbname, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", NULL); + + StartPhase("put"); + leveldb_put(db, woptions, "foo", 3, "hello", 5, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("writebatch"); + { + leveldb_writebatch_t* wb = leveldb_writebatch_create(); + leveldb_writebatch_put(wb, "foo", 3, "a", 1); + leveldb_writebatch_clear(wb); + leveldb_writebatch_put(wb, "bar", 3, "b", 1); + leveldb_writebatch_put(wb, "box", 3, "c", 1); + leveldb_writebatch_delete(wb, "bar", 3); + leveldb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + int pos = 0; + leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); + CheckCondition(pos == 3); + leveldb_writebatch_destroy(wb); + } + + StartPhase("iter"); + { + leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions); + CheckCondition(!leveldb_iter_valid(iter)); + leveldb_iter_seek_to_first(iter); + CheckCondition(leveldb_iter_valid(iter)); + CheckIter(iter, "box", "c"); + leveldb_iter_next(iter); + CheckIter(iter, "foo", "hello"); + leveldb_iter_prev(iter); + CheckIter(iter, "box", "c"); + leveldb_iter_prev(iter); + CheckCondition(!leveldb_iter_valid(iter)); + leveldb_iter_seek_to_last(iter); + CheckIter(iter, "foo", "hello"); + leveldb_iter_seek(iter, "b", 1); + CheckIter(iter, "box", "c"); + leveldb_iter_get_error(iter, &err); + CheckNoError(err); + leveldb_iter_destroy(iter); + } + + StartPhase("approximate_sizes"); + { + int i; + int n = 20000; + char keybuf[100]; + char valbuf[100]; + uint64_t sizes[2]; + const char* 
start[2] = { "a", "k00000000000000010000" }; + size_t start_len[2] = { 1, 21 }; + const char* limit[2] = { "k00000000000000010000", "z" }; + size_t limit_len[2] = { 21, 1 }; + leveldb_writeoptions_set_sync(woptions, 0); + for (i = 0; i < n; i++) { + snprintf(keybuf, sizeof(keybuf), "k%020d", i); + snprintf(valbuf, sizeof(valbuf), "v%020d", i); + leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf), + &err); + CheckNoError(err); + } + leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); + CheckCondition(sizes[0] > 0); + CheckCondition(sizes[1] > 0); + } + + StartPhase("property"); + { + char* prop = leveldb_property_value(db, "nosuchprop"); + CheckCondition(prop == NULL); + prop = leveldb_property_value(db, "leveldb.stats"); + CheckCondition(prop != NULL); + Free(&prop); + } + + StartPhase("snapshot"); + { + const leveldb_snapshot_t* snap; + snap = leveldb_create_snapshot(db); + leveldb_delete(db, woptions, "foo", 3, &err); + CheckNoError(err); + leveldb_readoptions_set_snapshot(roptions, snap); + CheckGet(db, roptions, "foo", "hello"); + leveldb_readoptions_set_snapshot(roptions, NULL); + CheckGet(db, roptions, "foo", NULL); + leveldb_release_snapshot(db, snap); + } + + StartPhase("repair"); + { + leveldb_close(db); + leveldb_options_set_create_if_missing(options, 0); + leveldb_options_set_error_if_exists(options, 0); + leveldb_repair_db(options, dbname, &err); + CheckNoError(err); + db = leveldb_open(options, dbname, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", NULL); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + } + + StartPhase("cleanup"); + leveldb_close(db); + leveldb_options_destroy(options); + leveldb_readoptions_destroy(roptions); + leveldb_writeoptions_destroy(woptions); + leveldb_cache_destroy(cache); + leveldb_comparator_destroy(cmp); + leveldb_env_destroy(env); + + fprintf(stderr, "PASS\n"); + return 0; +} diff --git a/doc/bench/db_bench_sqlite3.cc b/doc/bench/db_bench_sqlite3.cc index a15510e..e11db52 100644 --- a/doc/bench/db_bench_sqlite3.cc +++ b/doc/bench/db_bench_sqlite3.cc @@ -23,7 +23,6 @@ // fillseq100K -- write N/1000 100K values in sequential order in async mode // readseq -- read N times sequentially // readrandom -- read N times in random order -// readseq100K -- read N/1000 100K values in sequential order in async mode // readrand100K -- read N/1000 100K values in sequential order in async mode static const char* FLAGS_benchmarks = "fillseq," @@ -38,7 +37,7 @@ static const char* FLAGS_benchmarks = "readseq," "fillrand100K," "fillseq100K," - "readseq100K," + "readseq," "readrand100K," ; @@ -387,7 +386,7 @@ class Benchmark { Write(write_sync, SEQUENTIAL, FRESH, num_ / 1000, 100 * 1000, 1); WalCheckpoint(db_); } else if (name == Slice("readseq")) { - Read(SEQUENTIAL, 1); + ReadSequential(); } else if (name == Slice("readrandom")) { Read(RANDOM, 1); } else if (name == Slice("readrand100K")) { @@ -395,11 +394,6 @@ class Benchmark { reads_ /= 1000; Read(RANDOM, 1); reads_ = n; - } else if (name == Slice("readseq100K")) { - int n = reads_; - reads_ /= 1000; - Read(SEQUENTIAL, 1); - reads_ = n; } else { known = false; if (name != Slice()) { // No error message for empty name @@ -640,6 +634,22 @@ class Benchmark { ErrorCheck(status); } + void ReadSequential() { + int status; + sqlite3_stmt *pStmt; + std::string read_str = "SELECT * FROM test ORDER BY key"; + + status = sqlite3_prepare_v2(db_, read_str.c_str(), -1, &pStmt, NULL); + ErrorCheck(status); + for (int i = 0; i < reads_ && 
SQLITE_ROW == sqlite3_step(pStmt); i++) { + bytes_ += sqlite3_column_bytes(pStmt, 1) + sqlite3_column_bytes(pStmt, 2); + FinishedSingleOp(); + } + + status = sqlite3_finalize(pStmt); + ErrorCheck(status); + } + }; } diff --git a/doc/benchmark.html b/doc/benchmark.html index a0d6b02..c463977 100644 --- a/doc/benchmark.html +++ b/doc/benchmark.html @@ -130,8 +130,8 @@ parameters are varied. For the baseline:

      Kyoto TreeDB: 1,010,000 ops/sec
-     SQLite3: 174,000 ops/sec
+     SQLite3: 383,000 ops/sec

      B. Random Reads

      @@ -386,9 +386,9 @@ MB.

      (1.06x baseline)
-     SQLite3: 210,000 ops/sec
-     (1.20x baseline)
+     SQLite3: 609,000 ops/sec
+     (1.59x baseline)

      Random Reads

      @@ -425,8 +425,8 @@ database.

       
      (3.60x baseline)
-     SQLite3: 174,000 ops/sec
+     SQLite3: 383,000 ops/sec
      (1.00x baseline)

      Random Reads

      diff --git a/include/leveldb/c.h b/include/leveldb/c.h new file mode 100644 index 0000000..0be993d --- /dev/null +++ b/include/leveldb/c.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2011 The LevelDB Authors. All rights reserved. + Use of this source code is governed by a BSD-style license that can be + found in the LICENSE file. See the AUTHORS file for names of contributors. + + C bindings for leveldb. May be useful as a stable ABI that can be + used by programs that keep leveldb in a shared library, or for + a JNI api. + + Does not support: + . getters for the option types + . custom comparators that implement key shortening + . capturing post-write-snapshot + . custom iter, db, env, cache implementations using just the C bindings + + Some conventions: + + (1) We expose just opaque struct pointers and functions to clients. + This allows us to change internal representations without having to + recompile clients. + + (2) For simplicity, there is no equivalent to the Slice type. Instead, + the caller has to pass the pointer and length as separate + arguments. + + (3) Errors are represented by a null-terminated c string. NULL + means no error. All operations that can raise an error are passed + a "char** errptr" as the last argument. One of the following must + be true on entry: + *errptr == NULL + *errptr points to a malloc()ed null-terminated error message + On success, a leveldb routine leaves *errptr unchanged. + On failure, leveldb frees the old value of *errptr and + set *errptr to a malloc()ed error message. + + (4) Bools have the type unsigned char (0 == false; rest == true) + + (5) All of the pointer arguments must be non-NULL. +*/ + +#ifndef STORAGE_LEVELDB_INCLUDE_C_H_ +#define STORAGE_LEVELDB_INCLUDE_C_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +/* Exported types */ + +typedef struct leveldb_t leveldb_t; +typedef struct leveldb_cache_t leveldb_cache_t; +typedef struct leveldb_comparator_t leveldb_comparator_t; +typedef struct leveldb_env_t leveldb_env_t; +typedef struct leveldb_filelock_t leveldb_filelock_t; +typedef struct leveldb_iterator_t leveldb_iterator_t; +typedef struct leveldb_logger_t leveldb_logger_t; +typedef struct leveldb_options_t leveldb_options_t; +typedef struct leveldb_randomfile_t leveldb_randomfile_t; +typedef struct leveldb_readoptions_t leveldb_readoptions_t; +typedef struct leveldb_seqfile_t leveldb_seqfile_t; +typedef struct leveldb_snapshot_t leveldb_snapshot_t; +typedef struct leveldb_writablefile_t leveldb_writablefile_t; +typedef struct leveldb_writebatch_t leveldb_writebatch_t; +typedef struct leveldb_writeoptions_t leveldb_writeoptions_t; + +/* DB operations */ + +extern leveldb_t* leveldb_open( + const leveldb_options_t* options, + const char* name, + char** errptr); + +extern void leveldb_close(leveldb_t* db); + +extern void leveldb_put( + leveldb_t* db, + const leveldb_writeoptions_t* options, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr); + +extern void leveldb_delete( + leveldb_t* db, + const leveldb_writeoptions_t* options, + const char* key, size_t keylen, + char** errptr); + +extern void leveldb_write( + leveldb_t* db, + const leveldb_writeoptions_t* options, + leveldb_writebatch_t* batch, + char** errptr); + +/* Returns NULL if not found. A malloc()ed array otherwise. + Stores the length of the array in *vallen. 
*/ +extern char* leveldb_get( + leveldb_t* db, + const leveldb_readoptions_t* options, + const char* key, size_t keylen, + size_t* vallen, + char** errptr); + +extern leveldb_iterator_t* leveldb_create_iterator( + leveldb_t* db, + const leveldb_readoptions_t* options); + +extern const leveldb_snapshot_t* leveldb_create_snapshot( + leveldb_t* db); + +extern void leveldb_release_snapshot( + leveldb_t* db, + const leveldb_snapshot_t* snapshot); + +/* Returns NULL if property name is unknown. + Else returns a pointer to a malloc()-ed null-terminated value. */ +extern char* leveldb_property_value( + leveldb_t* db, + const char* propname); + +extern void leveldb_approximate_sizes( + leveldb_t* db, + int num_ranges, + const char* const* range_start_key, const size_t* range_start_key_len, + const char* const* range_limit_key, const size_t* range_limit_key_len, + uint64_t* sizes); + +/* Management operations */ + +extern void leveldb_destroy_db( + const leveldb_options_t* options, + const char* name, + char** errptr); + +extern void leveldb_repair_db( + const leveldb_options_t* options, + const char* name, + char** errptr); + +/* Iterator */ + +extern void leveldb_iter_destroy(leveldb_iterator_t*); +extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*); +extern void leveldb_iter_seek_to_first(leveldb_iterator_t*); +extern void leveldb_iter_seek_to_last(leveldb_iterator_t*); +extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen); +extern void leveldb_iter_next(leveldb_iterator_t*); +extern void leveldb_iter_prev(leveldb_iterator_t*); +extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen); +extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen); +extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr); + +/* Write batch */ + +extern leveldb_writebatch_t* leveldb_writebatch_create(); +extern void leveldb_writebatch_destroy(leveldb_writebatch_t*); +extern void leveldb_writebatch_clear(leveldb_writebatch_t*); +extern void leveldb_writebatch_put( + leveldb_writebatch_t*, + const char* key, size_t klen, + const char* val, size_t vlen); +extern void leveldb_writebatch_delete( + leveldb_writebatch_t*, + const char* key, size_t klen); +extern void leveldb_writebatch_iterate( + leveldb_writebatch_t*, + void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); + +/* Options */ + +extern leveldb_options_t* leveldb_options_create(); +extern void leveldb_options_destroy(leveldb_options_t*); +extern void leveldb_options_set_comparator( + leveldb_options_t*, + leveldb_comparator_t*); +extern void leveldb_options_set_create_if_missing( + leveldb_options_t*, unsigned char); +extern void leveldb_options_set_error_if_exists( + leveldb_options_t*, unsigned char); +extern void leveldb_options_set_paranoid_checks( + leveldb_options_t*, unsigned char); +extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*); +extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*); +extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t); +extern void leveldb_options_set_max_open_files(leveldb_options_t*, int); +extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*); +extern void leveldb_options_set_block_size(leveldb_options_t*, size_t); +extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int); + +enum { + leveldb_no_compression = 0, + 
leveldb_snappy_compression = 1 +}; +extern void leveldb_options_set_compression(leveldb_options_t*, int); + +/* Comparator */ + +extern leveldb_comparator_t* leveldb_comparator_create( + void* state, + void (*destructor)(void*), + int (*compare)( + void*, + const char* a, size_t alen, + const char* b, size_t blen), + const char* (*name)(void*)); +extern void leveldb_comparator_destroy(leveldb_comparator_t*); + +/* Read options */ + +extern leveldb_readoptions_t* leveldb_readoptions_create(); +extern void leveldb_readoptions_destroy(leveldb_readoptions_t*); +extern void leveldb_readoptions_set_verify_checksums( + leveldb_readoptions_t*, + unsigned char); +extern void leveldb_readoptions_set_fill_cache( + leveldb_readoptions_t*, unsigned char); +extern void leveldb_readoptions_set_snapshot( + leveldb_readoptions_t*, + const leveldb_snapshot_t*); + +/* Write options */ + +extern leveldb_writeoptions_t* leveldb_writeoptions_create(); +extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*); +extern void leveldb_writeoptions_set_sync( + leveldb_writeoptions_t*, unsigned char); + +/* Cache */ + +extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity); +extern void leveldb_cache_destroy(leveldb_cache_t* cache); + +/* Env */ + +extern leveldb_env_t* leveldb_create_default_env(); +extern void leveldb_env_destroy(leveldb_env_t*); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif /* STORAGE_LEVELDB_INCLUDE_C_H_ */ diff --git a/include/leveldb/db.h b/include/leveldb/db.h index 79bd283..f945dd7 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -33,6 +33,7 @@ struct Range { Slice start; // Included in the range Slice limit; // Not included in the range + Range() { } Range(const Slice& s, const Slice& l) : start(s), limit(l) { } }; -- cgit v1.2.3 From fbe4e3af3f4e368e0779b6d75cd6005d67469aa2 Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Sat, 6 Aug 2011 00:19:37 +0000 Subject: @23023120 git-svn-id: http://leveldb.googlecode.com/svn/trunk@47 62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_impl.cc | 2 +- db/version_set.cc | 10 +++++----- util/cache.cc | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 5a0648e..fff4eaf 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1141,7 +1141,7 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { in.remove_prefix(strlen("num-files-at-level")); uint64_t level; bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); - if (!ok || level < 0 || level >= config::kNumLevels) { + if (!ok || level >= config::kNumLevels) { return false; } else { char buf[100]; diff --git a/db/version_set.cc b/db/version_set.cc index 5040b72..aace624 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -103,7 +103,7 @@ bool SomeFileOverlapsRange( const Slice& largest_user_key) { // Find the earliest possible internal key for smallest_user_key InternalKey small(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); - const int index = FindFile(icmp, files, small.Encode()); + const uint32_t index = FindFile(icmp, files, small.Encode()); return ((index < files.size()) && icmp.user_comparator()->Compare( largest_user_key, files[index]->smallest.user_key()) >= 0); @@ -266,7 +266,7 @@ Status Version::Get(const ReadOptions& options, // Level-0 files may overlap each other. Find all files that // overlap user_key and process them in order from newest to oldest. 
tmp.reserve(num_files); - for (int i = 0; i < num_files; i++) { + for (uint32_t i = 0; i < num_files; i++) { FileMetaData* f = files[i]; if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 && ucmp->Compare(user_key, f->largest.user_key()) <= 0) { @@ -297,7 +297,7 @@ Status Version::Get(const ReadOptions& options, } } - for (int i = 0; i < num_files; ++i) { + for (uint32_t i = 0; i < num_files; ++i) { if (last_file_read != NULL && stats->seek_file == NULL) { // We have had more than one seek for this read. Charge the 1st file. stats->seek_file = last_file_read; @@ -442,7 +442,7 @@ class VersionSet::Builder { to_unref.push_back(*it); } delete added; - for (int i = 0; i < to_unref.size(); i++) { + for (uint32_t i = 0; i < to_unref.size(); i++) { FileMetaData* f = to_unref[i]; f->refs--; if (f->refs <= 0) { @@ -533,7 +533,7 @@ class VersionSet::Builder { #ifndef NDEBUG // Make sure there is no overlap in levels > 0 if (level > 0) { - for (int i = 1; i < v->files_[level].size(); i++) { + for (uint32_t i = 1; i < v->files_[level].size(); i++) { const InternalKey& prev_end = v->files_[level][i-1]->largest; const InternalKey& this_begin = v->files_[level][i]->smallest; if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) { diff --git a/util/cache.cc b/util/cache.cc index 5829b79..5cff3dd 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -112,7 +112,7 @@ class HandleTable { LRUHandle** new_list = new LRUHandle*[new_length]; memset(new_list, 0, sizeof(new_list[0]) * new_length); uint32_t count = 0; - for (int i = 0; i < length_; i++) { + for (uint32_t i = 0; i < length_; i++) { LRUHandle* h = list_[i]; while (h != NULL) { LRUHandle* next = h->next_hash; -- cgit v1.2.3 From 7e50a01f8a2820ed7a50c673b5affe0131560ff5 Mon Sep 17 00:00:00 2001 From: "gabor@google.com" Date: Tue, 16 Aug 2011 01:21:01 +0000 Subject: Bugfixes for iterator and documentation. - Fix bug in Iterator::Prev where it would return the wrong key. Fixes issues 29 and 30. - Added a tweak to testharness to allow running just some tests. - Fixing two minor documentation errors based on issues 28 and 25. - Cleanup; fix namespaces of export-to-C code. Also fix one "const char*" vs "char*" mismatch. 
git-svn-id: http://leveldb.googlecode.com/svn/trunk@48 62dab493-f737-651d-591e-8d6aee1b9529 --- db/c.cc | 25 +++++++++++++++++++++---- db/db_iter.cc | 3 ++- db/db_test.cc | 15 +++++++++++++++ doc/index.html | 10 +++++----- util/testharness.cc | 12 ++++++++++++ util/testharness.h | 11 ++++++++++- 6 files changed, 65 insertions(+), 11 deletions(-) diff --git a/db/c.cc b/db/c.cc index ee8a472..366dd2d 100644 --- a/db/c.cc +++ b/db/c.cc @@ -15,7 +15,26 @@ #include "leveldb/status.h" #include "leveldb/write_batch.h" -namespace leveldb { +using leveldb::Cache; +using leveldb::Comparator; +using leveldb::CompressionType; +using leveldb::DB; +using leveldb::Env; +using leveldb::FileLock; +using leveldb::Iterator; +using leveldb::Logger; +using leveldb::NewLRUCache; +using leveldb::Options; +using leveldb::RandomAccessFile; +using leveldb::Range; +using leveldb::ReadOptions; +using leveldb::SequentialFile; +using leveldb::Slice; +using leveldb::Snapshot; +using leveldb::Status; +using leveldb::WritableFile; +using leveldb::WriteBatch; +using leveldb::WriteOptions; extern "C" { @@ -172,7 +191,7 @@ void leveldb_release_snapshot( delete snapshot; } -const char* leveldb_property_value( +char* leveldb_property_value( leveldb_t* db, const char* propname) { std::string tmp; @@ -449,5 +468,3 @@ void leveldb_env_destroy(leveldb_env_t* env) { } } // end extern "C" - -} diff --git a/db/db_iter.cc b/db/db_iter.cc index 0be18ff..8849f92 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -216,7 +216,6 @@ void DBIter::FindPrevUserEntry() { ValueType value_type = kTypeDeletion; if (iter_->Valid()) { - SaveKey(ExtractUserKey(iter_->key()), &saved_key_); do { ParsedInternalKey ikey; if (ParseKey(&ikey) && ikey.sequence <= sequence_) { @@ -227,6 +226,7 @@ void DBIter::FindPrevUserEntry() { } value_type = ikey.type; if (value_type == kTypeDeletion) { + saved_key_.clear(); ClearSavedValue(); } else { Slice raw_value = iter_->value(); @@ -234,6 +234,7 @@ void DBIter::FindPrevUserEntry() { std::string empty; swap(empty, saved_value_); } + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); saved_value_.assign(raw_value.data(), raw_value.size()); } } diff --git a/db/db_test.cc b/db/db_test.cc index 22fa70c..14eb44d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -519,6 +519,21 @@ TEST(DBTest, IterSmallAndLargeMix) { delete iter; } +TEST(DBTest, IterMultiWithDelete) { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", "vb")); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Delete("b")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->Seek("c"); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + delete iter; +} + TEST(DBTest, Recover) { ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("baz", "v5")); diff --git a/doc/index.html b/doc/index.html index 58442e8..8d03c45 100644 --- a/doc/index.html +++ b/doc/index.html @@ -23,7 +23,7 @@ creating it if necessary:

         #include <assert>
      -  #include "leveldb/include/db.h"
      +  #include "leveldb/db.h"
       
         leveldb::DB* db;
         leveldb::Options options;
      @@ -78,7 +78,7 @@ Such problems can be avoided by using the WriteBatch class to
       atomically apply a set of updates:
       

      -  #include "leveldb/include/write_batch.h"
      +  #include "leveldb/write_batch.h"
         ...
         std::string value;
         leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
      @@ -296,7 +296,7 @@ subclass of leveldb::Comparator that expresses these rules:
           }
       
           // Ignore the following methods for now:
      -    const char* Name() { return "TwoPartComparator"; }
      +    const char* Name() const { return "TwoPartComparator"; }
           void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
           void FindShortSuccessor(std::string*) const { }
         };
      @@ -333,7 +333,7 @@ version numbers found in the keys to decide how to interpret them.
       

      Performance

       Performance can be tuned by changing the default values of the
      -types defined in leveldb/include/options.h.
      +types defined in include/leveldb/options.h.

      Block size

      @@ -371,7 +371,7 @@ filesystem and each file stores a sequence of compressed blocks. If
       uncompressed block contents.

      -  #include "leveldb/include/cache.h"
      +  #include "leveldb/cache.h"
       
         leveldb::Options options;
         options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
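
      For context, the TwoPartComparator whose Name() signature is corrected in
      the hunk above has to override all four leveldb::Comparator methods; a
      minimal sketch, where the ParseKey helper is hypothetical and left
      undefined:

          #include <string>
          #include "leveldb/comparator.h"
          #include "leveldb/slice.h"

          // Hypothetical helper: splits a key of the form "<a>:<b>" into two
          // integers. (Not part of the documentation excerpt above.)
          void ParseKey(const leveldb::Slice& key, int* a, int* b);

          class TwoPartComparator : public leveldb::Comparator {
           public:
            // Order first by the leading component, then by the trailing one.
            int Compare(const leveldb::Slice& x, const leveldb::Slice& y) const {
              int a1, b1, a2, b2;
              ParseKey(x, &a1, &b1);
              ParseKey(y, &a2, &b2);
              if (a1 < a2) return -1;
              if (a1 > a2) return +1;
              if (b1 < b2) return -1;
              if (b1 > b2) return +1;
              return 0;
            }
            // Note the const qualifier -- the detail the doc fix above adds.
            const char* Name() const { return "TwoPartComparator"; }
            void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
            void FindShortSuccessor(std::string*) const { }
          };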
      diff --git a/util/testharness.cc b/util/testharness.cc
      index b686ac3..6f42700 100644
      --- a/util/testharness.cc
      +++ b/util/testharness.cc
      @@ -4,6 +4,8 @@
       
       #include "util/testharness.h"
       
      +#include <stdlib.h>
      +#include <string.h>
       #include <string>
       #include <stdio.h>
       
      @@ -32,10 +34,20 @@ bool RegisterTest(const char* base, const char* name, void (*func)()) {
       }
       
       int RunAllTests() {
      +  const char* matcher = getenv("LEVELDB_TESTS");
      +
         int num = 0;
         if (tests != NULL) {
           for (int i = 0; i < tests->size(); i++) {
             const Test& t = (*tests)[i];
      +      if (matcher != NULL) {
      +        std::string name = t.base;
      +        name.push_back('.');
      +        name.append(t.name);
      +        if (strstr(name.c_str(), matcher) == NULL) {
      +          continue;
      +        }
      +      }
             fprintf(stderr, "==== Test %s.%s\n", t.base, t.name);
             (*t.func)();
             ++num;
      diff --git a/util/testharness.h b/util/testharness.h
      index 13ab914..6f1a9c3 100644
      --- a/util/testharness.h
      +++ b/util/testharness.h
      @@ -15,7 +15,16 @@
       namespace leveldb {
       namespace test {
       
      -// Run all tests registered by the TEST() macro.
      +// Run some of the tests registered by the TEST() macro.  If the
      +// environment variable "LEVELDB_TESTS" is not set, runs all tests.
      +// Otherwise, runs only the tests whose name contains the value of
      +// "LEVELDB_TESTS" as a substring.  E.g., suppose the tests are:
      +//    TEST(Foo, Hello) { ... }
      +//    TEST(Foo, World) { ... }
      +// LEVELDB_TESTS=Hello will run the first test
      +// LEVELDB_TESTS=o     will run both tests
      +// LEVELDB_TESTS=Junk  will run no tests
      +//
       // Returns 0 if all tests pass.
       // Dies or returns a non-zero value if some test fails.
       extern int RunAllTests();
      -- 
      cgit v1.2.3
      
      
      From d36ce84e66c7d3cee978fbeb52721c30dfb842a5 Mon Sep 17 00:00:00 2001
      From: "gabor@google.com"
       
      Date: Mon, 22 Aug 2011 21:08:51 +0000
      Subject: Bugfix for issue 33; reduce lock contention in Get(), parallel
       benchmarks.
      
      - Fix for issue 33 (non-null-terminated result from
        leveldb_property_value())
      
      - Support for running multiple instances of a benchmark in parallel.
      
      - Reduce lock contention on Get():
        (1) Do not hold the lock while searching memtables.
        (2) Shard block and table caches 16-ways.
      
        Benchmark for evaluating this change:
        $ db_bench --benchmarks=fillseq1,readrandom --threads=$n
        (fillseq1 is a small hack to make sure fillseq runs once regardless
        of number of threads specified on the command line).
      
      
      
      git-svn-id: http://leveldb.googlecode.com/svn/trunk@49 62dab493-f737-651d-591e-8d6aee1b9529
      ---
       db/c.cc            |   3 +-
       db/db_bench.cc     | 530 +++++++++++++++++++++++++++++++++++------------------
       db/db_impl.cc      |  36 ++--
       util/cache.cc      | 149 +++++++++------
       util/cache_test.cc |  39 ++--
       util/histogram.cc  |  11 ++
       util/histogram.h   |   1 +
       7 files changed, 510 insertions(+), 259 deletions(-)
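
The "16-way" cache sharding mentioned in the commit message can be pictured as
routing each key's hash to one of sixteen independently locked LRU caches, so
that concurrent lookups rarely contend on the same mutex; a sketch of the idea
(illustrative names, not necessarily those used in util/cache.cc):

    #include <stdint.h>

    static const int kNumShardBits = 4;            // 2^4 = 16 shards
    static const int kNumShards = 1 << kNumShardBits;

    // Pick a shard from the high-order bits of a precomputed key hash;
    // each shard holds its own mutex and its own LRU list.
    static uint32_t Shard(uint32_t hash) {
      return hash >> (32 - kNumShardBits);
    }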
      
      diff --git a/db/c.cc b/db/c.cc
      index 366dd2d..038e5c0 100644
      --- a/db/c.cc
      +++ b/db/c.cc
      @@ -196,7 +196,8 @@ char* leveldb_property_value(
           const char* propname) {
         std::string tmp;
         if (db->rep->GetProperty(Slice(propname), &tmp)) {
      -    return CopyString(tmp);
      +    // We use strdup() since we expect human readable output.
      +    return strdup(tmp.c_str());
         } else {
           return NULL;
         }
      diff --git a/db/db_bench.cc b/db/db_bench.cc
      index 7b4e41a..d3ec61b 100644
      --- a/db/db_bench.cc
      +++ b/db/db_bench.cc
      @@ -14,6 +14,7 @@
       #include "port/port.h"
       #include "util/crc32c.h"
       #include "util/histogram.h"
      +#include "util/mutexlock.h"
       #include "util/random.h"
       #include "util/testutil.h"
       
      @@ -60,6 +61,9 @@ static int FLAGS_num = 1000000;
       // Number of read operations to do.  If negative, do FLAGS_num reads.
       static int FLAGS_reads = -1;
       
      +// Number of concurrent threads to run.
      +static int FLAGS_threads = 1;
      +
       // Size of each value
       static int FLAGS_value_size = 100;
       
      @@ -91,8 +95,9 @@ static const char* FLAGS_db = "/tmp/dbbench";
       
       namespace leveldb {
       
      -// Helper for quickly generating random data.
       namespace {
      +
      +// Helper for quickly generating random data.
       class RandomGenerator {
        private:
         std::string data_;
      @@ -136,6 +141,152 @@ static Slice TrimSpace(Slice s) {
         return Slice(s.data() + start, limit - start);
       }
       
      +static void AppendWithSpace(std::string* str, Slice msg) {
      +  if (msg.empty()) return;
      +  if (!str->empty()) {
      +    str->push_back(' ');
      +  }
      +  str->append(msg.data(), msg.size());
      +}
      +
      +class Stats {
      + private:
      +  double start_;
      +  double finish_;
      +  double seconds_;
      +  int done_;
      +  int next_report_;
      +  int64_t bytes_;
      +  double last_op_finish_;
      +  Histogram hist_;
      +  std::string message_;
      +
      + public:
      +  Stats() { Start(); }
      +
      +  void Start() {
      +    next_report_ = 100;
      +    last_op_finish_ = start_;
      +    hist_.Clear();
      +    done_ = 0;
      +    bytes_ = 0;
      +    seconds_ = 0;
      +    start_ = Env::Default()->NowMicros();
      +    finish_ = start_;
      +    message_.clear();
      +  }
      +
      +  void Merge(const Stats& other) {
      +    hist_.Merge(other.hist_);
      +    done_ += other.done_;
      +    bytes_ += other.bytes_;
      +    seconds_ += other.seconds_;
      +    if (other.start_ < start_) start_ = other.start_;
      +    if (other.finish_ > finish_) finish_ = other.finish_;
      +
      +    // Just keep the messages from one thread
      +    if (message_.empty()) message_ = other.message_;
      +  }
      +
      +  void Stop() {
      +    finish_ = Env::Default()->NowMicros();
      +    seconds_ = (finish_ - start_) * 1e-6;
      +  }
      +
      +  void AddMessage(Slice msg) {
      +    AppendWithSpace(&message_, msg);
      +  }
      +
      +  void FinishedSingleOp() {
      +    if (FLAGS_histogram) {
      +      double now = Env::Default()->NowMicros();
      +      double micros = now - last_op_finish_;
      +      hist_.Add(micros);
      +      if (micros > 20000) {
      +        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
      +        fflush(stderr);
      +      }
      +      last_op_finish_ = now;
      +    }
      +
      +    done_++;
      +    if (done_ >= next_report_) {
      +      if      (next_report_ < 1000)   next_report_ += 100;
      +      else if (next_report_ < 5000)   next_report_ += 500;
      +      else if (next_report_ < 10000)  next_report_ += 1000;
      +      else if (next_report_ < 50000)  next_report_ += 5000;
      +      else if (next_report_ < 100000) next_report_ += 10000;
      +      else if (next_report_ < 500000) next_report_ += 50000;
      +      else                            next_report_ += 100000;
      +      fprintf(stderr, "... finished %d ops%30s\r", done_, "");
      +      fflush(stderr);
      +    }
      +  }
      +
      +  void AddBytes(int64_t n) {
      +    bytes_ += n;
      +  }
      +
      +  void Report(const Slice& name) {
      +    // Pretend at least one op was done in case we are running a benchmark
      +    // that does not call FinishedSingleOp().
      +    if (done_ < 1) done_ = 1;
      +
      +    std::string extra;
      +    if (bytes_ > 0) {
      +      // Rate is computed on actual elapsed time, not the sum of per-thread
      +      // elapsed times.
      +      double elapsed = (finish_ - start_) * 1e-6;
      +      char rate[100];
      +      snprintf(rate, sizeof(rate), "%6.1f MB/s",
      +               (bytes_ / 1048576.0) / elapsed);
      +      extra = rate;
      +    }
      +    AppendWithSpace(&extra, message_);
      +
      +    fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
      +            name.ToString().c_str(),
      +            seconds_ * 1e6 / done_,
      +            (extra.empty() ? "" : " "),
      +            extra.c_str());
      +    if (FLAGS_histogram) {
      +      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
      +    }
      +    fflush(stdout);
      +  }
      +};
      +
      +// State shared by all concurrent executions of the same benchmark.
      +struct SharedState {
      +  port::Mutex mu;
      +  port::CondVar cv;
      +  int total;
      +
      +  // Each thread goes through the following states:
      +  //    (1) initializing
      +  //    (2) waiting for others to be initialized
      +  //    (3) running
      +  //    (4) done
      +
      +  int num_initialized;
      +  int num_done;
      +  bool start;
      +
      +  SharedState() : cv(&mu) { }
      +};
      +
      +// Per-thread state for concurrent executions of the same benchmark.
      +struct ThreadState {
      +  int tid;             // 0..n-1 when running in n threads
      +  Random rand;         // Has different seeds for different threads
      +  Stats stats;
      +
      +  ThreadState(int index)
      +      : tid(index),
      +        rand(1000 + index) {
      +  }
      +};
      +
       }
       
       class Benchmark {
      @@ -143,20 +294,11 @@ class Benchmark {
         Cache* cache_;
         DB* db_;
         int num_;
      +  int value_size_;
      +  int entries_per_batch_;
      +  WriteOptions write_options_;
         int reads_;
         int heap_counter_;
      -  double start_;
      -  double last_op_finish_;
      -  int64_t bytes_;
      -  std::string message_;
      -  std::string post_message_;
      -  Histogram hist_;
      -  RandomGenerator gen_;
      -  Random rand_;
      -
      -  // State kept for progress messages
      -  int done_;
      -  int next_report_;     // When to report next
       
         void PrintHeader() {
           const int kKeySize = 16;
      @@ -232,94 +374,15 @@ class Benchmark {
       #endif
         }
       
      -  void Start() {
      -    start_ = Env::Default()->NowMicros() * 1e-6;
      -    bytes_ = 0;
      -    message_.clear();
      -    last_op_finish_ = start_;
      -    hist_.Clear();
      -    done_ = 0;
      -    next_report_ = 100;
      -  }
      -
      -  void FinishedSingleOp() {
      -    if (FLAGS_histogram) {
      -      double now = Env::Default()->NowMicros() * 1e-6;
      -      double micros = (now - last_op_finish_) * 1e6;
      -      hist_.Add(micros);
      -      if (micros > 20000) {
      -        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
      -        fflush(stderr);
      -      }
      -      last_op_finish_ = now;
      -    }
      -
      -    done_++;
      -    if (done_ >= next_report_) {
      -      if      (next_report_ < 1000)   next_report_ += 100;
      -      else if (next_report_ < 5000)   next_report_ += 500;
      -      else if (next_report_ < 10000)  next_report_ += 1000;
      -      else if (next_report_ < 50000)  next_report_ += 5000;
      -      else if (next_report_ < 100000) next_report_ += 10000;
      -      else if (next_report_ < 500000) next_report_ += 50000;
      -      else                            next_report_ += 100000;
      -      fprintf(stderr, "... finished %d ops%30s\r", done_, "");
      -      fflush(stderr);
      -    }
      -  }
      -
      -  void Stop(const Slice& name) {
      -    double finish = Env::Default()->NowMicros() * 1e-6;
      -
      -    // Pretend at least one op was done in case we are running a benchmark
-    // that does not call FinishedSingleOp().
      -    if (done_ < 1) done_ = 1;
      -
      -    if (bytes_ > 0) {
      -      char rate[100];
      -      snprintf(rate, sizeof(rate), "%6.1f MB/s",
      -               (bytes_ / 1048576.0) / (finish - start_));
      -      if (!message_.empty()) {
      -        message_  = std::string(rate) + " " + message_;
      -      } else {
      -        message_ = rate;
      -      }
      -    }
      -
      -    fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
      -            name.ToString().c_str(),
      -            (finish - start_) * 1e6 / done_,
      -            (message_.empty() ? "" : " "),
      -            message_.c_str());
      -    if (FLAGS_histogram) {
      -      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
      -    }
      -    fflush(stdout);
      -
      -    if (!post_message_.empty()) {
      -      fprintf(stdout, "\n%s\n", post_message_.c_str());
      -      post_message_.clear();
      -    }
      -  }
      -
        public:
      -  enum Order {
      -    SEQUENTIAL,
      -    RANDOM
      -  };
      -  enum DBState {
      -    FRESH,
      -    EXISTING
      -  };
      -
         Benchmark()
         : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
           db_(NULL),
           num_(FLAGS_num),
      +    value_size_(FLAGS_value_size),
      +    entries_per_batch_(1),
           reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
      -    heap_counter_(0),
      -    bytes_(0),
      -    rand_(301) {
      +    heap_counter_(0) {
     std::vector<std::string> files;
           Env::Default()->GetChildren(FLAGS_db, &files);
           for (int i = 0; i < files.size(); i++) {
      @@ -353,98 +416,203 @@ class Benchmark {
               benchmarks = sep + 1;
             }
       
      -      Start();
+      // Reset parameters that may be overridden below
      +      num_ = FLAGS_num;
      +      reads_ = num_;
      +      value_size_ = FLAGS_value_size;
      +      entries_per_batch_ = 1;
      +      write_options_ = WriteOptions();
      +
      +      void (Benchmark::*method)(ThreadState*) = NULL;
      +      bool fresh_db = false;
       
      -      WriteOptions write_options;
      -      bool known = true;
             if (name == Slice("fillseq")) {
      -        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
      +        fresh_db = true;
      +        method = &Benchmark::WriteSeq;
             } else if (name == Slice("fillbatch")) {
      -        Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000);
      +        fresh_db = true;
      +        entries_per_batch_ = 1000;
      +        method = &Benchmark::WriteSeq;
             } else if (name == Slice("fillrandom")) {
      -        Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size, 1);
      +        fresh_db = true;
      +        method = &Benchmark::WriteRandom;
             } else if (name == Slice("overwrite")) {
      -        Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1);
      +        fresh_db = false;
      +        method = &Benchmark::WriteRandom;
             } else if (name == Slice("fillsync")) {
      -        write_options.sync = true;
      -        Write(write_options, RANDOM, FRESH, num_ / 1000, FLAGS_value_size, 1);
      +        fresh_db = true;
      +        num_ /= 1000;
      +        write_options_.sync = true;
      +        method = &Benchmark::WriteRandom;
             } else if (name == Slice("fill100K")) {
      -        Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1);
      +        fresh_db = true;
      +        num_ /= 1000;
      +        value_size_ = 100 * 1000;
      +        method = &Benchmark::WriteRandom;
             } else if (name == Slice("readseq")) {
      -        ReadSequential();
      +        method = &Benchmark::ReadSequential;
             } else if (name == Slice("readreverse")) {
      -        ReadReverse();
      +        method = &Benchmark::ReadReverse;
             } else if (name == Slice("readrandom")) {
      -        ReadRandom();
      +        method = &Benchmark::ReadRandom;
             } else if (name == Slice("readhot")) {
      -        ReadHot();
      +        method = &Benchmark::ReadHot;
             } else if (name == Slice("readrandomsmall")) {
      -        int n = reads_;
               reads_ /= 1000;
      -        ReadRandom();
      -        reads_ = n;
      +        method = &Benchmark::ReadRandom;
             } else if (name == Slice("compact")) {
      -        Compact();
      +        method = &Benchmark::Compact;
             } else if (name == Slice("crc32c")) {
      -        Crc32c(4096, "(4K per op)");
      +        method = &Benchmark::Crc32c;
             } else if (name == Slice("acquireload")) {
      -        AcquireLoad();
      +        method = &Benchmark::AcquireLoad;
             } else if (name == Slice("snappycomp")) {
      -        SnappyCompress();
      +        method = &Benchmark::SnappyCompress;
             } else if (name == Slice("snappyuncomp")) {
      -        SnappyUncompress();
      +        method = &Benchmark::SnappyUncompress;
             } else if (name == Slice("heapprofile")) {
               HeapProfile();
             } else if (name == Slice("stats")) {
               PrintStats();
             } else {
      -        known = false;
               if (name != Slice()) {  // No error message for empty name
                 fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
               }
             }
      -      if (known) {
      -        Stop(name);
      +
      +      if (fresh_db) {
      +        if (FLAGS_use_existing_db) {
      +          fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
      +                  name.ToString().c_str());
      +          method = NULL;
      +        } else {
      +          delete db_;
      +          db_ = NULL;
      +          DestroyDB(FLAGS_db, Options());
      +          Open();
      +        }
      +      }
      +
      +      if (method != NULL) {
      +        RunBenchmark(name, method);
             }
           }
         }
       
        private:
      -  void Crc32c(int size, const char* label) {
      +  struct ThreadArg {
      +    Benchmark* bm;
      +    SharedState* shared;
      +    ThreadState* thread;
      +    void (Benchmark::*method)(ThreadState*);
      +  };
      +
      +  static void ThreadBody(void* v) {
+    ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
      +    SharedState* shared = arg->shared;
      +    ThreadState* thread = arg->thread;
      +    {
      +      MutexLock l(&shared->mu);
      +      shared->num_initialized++;
      +      if (shared->num_initialized >= shared->total) {
      +        shared->cv.SignalAll();
      +      }
      +      while (!shared->start) {
      +        shared->cv.Wait();
      +      }
      +    }
      +
      +    thread->stats.Start();
      +    (arg->bm->*(arg->method))(thread);
      +    thread->stats.Stop();
      +
      +    {
      +      MutexLock l(&shared->mu);
      +      shared->num_done++;
      +      if (shared->num_done >= shared->total) {
      +        shared->cv.SignalAll();
      +      }
      +    }
      +  }
      +
      +  void RunBenchmark(Slice name, void (Benchmark::*method)(ThreadState*)) {
      +    const int n = FLAGS_threads;
      +    SharedState shared;
      +    shared.total = n;
      +    shared.num_initialized = 0;
      +    shared.num_done = 0;
      +    shared.start = false;
      +
      +    ThreadArg* arg = new ThreadArg[n];
      +    for (int i = 0; i < n; i++) {
      +      arg[i].bm = this;
      +      arg[i].method = method;
      +      arg[i].shared = &shared;
      +      arg[i].thread = new ThreadState(i);
      +      Env::Default()->StartThread(ThreadBody, &arg[i]);
      +    }
      +
      +    shared.mu.Lock();
      +    while (shared.num_initialized < n) {
      +      shared.cv.Wait();
      +    }
      +
      +    shared.start = true;
      +    shared.cv.SignalAll();
      +    while (shared.num_done < n) {
      +      shared.cv.Wait();
      +    }
      +    shared.mu.Unlock();
      +
      +    for (int i = 1; i < n; i++) {
      +      arg[0].thread->stats.Merge(arg[i].thread->stats);
      +    }
      +    arg[0].thread->stats.Report(name);
      +
      +    for (int i = 0; i < n; i++) {
      +      delete arg[i].thread;
      +    }
      +    delete[] arg;
      +  }
      +
      +  void Crc32c(ThreadState* thread) {
           // Checksum about 500MB of data total
      +    const int size = 4096;
      +    const char* label = "(4K per op)";
           std::string data(size, 'x');
           int64_t bytes = 0;
           uint32_t crc = 0;
           while (bytes < 500 * 1048576) {
             crc = crc32c::Value(data.data(), size);
      -      FinishedSingleOp();
      +      thread->stats.FinishedSingleOp();
             bytes += size;
           }
           // Print so result is not dead
     fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));
       
      -    bytes_ = bytes;
      -    message_ = label;
      +    thread->stats.AddBytes(bytes);
      +    thread->stats.AddMessage(label);
         }
       
      -  void AcquireLoad() {
      +  void AcquireLoad(ThreadState* thread) {
           int dummy;
           port::AtomicPointer ap(&dummy);
           int count = 0;
           void *ptr = NULL;
      -    message_ = "(each op is 1000 loads)";
      +    thread->stats.AddMessage("(each op is 1000 loads)");
           while (count < 100000) {
             for (int i = 0; i < 1000; i++) {
               ptr = ap.Acquire_Load();
             }
             count++;
      -      FinishedSingleOp();
      +      thread->stats.FinishedSingleOp();
           }
           if (ptr == NULL) exit(1); // Disable unused variable warning.
         }
       
      -  void SnappyCompress() {
      -    Slice input = gen_.Generate(Options().block_size);
      +  void SnappyCompress(ThreadState* thread) {
      +    RandomGenerator gen;
      +    Slice input = gen.Generate(Options().block_size);
           int64_t bytes = 0;
           int64_t produced = 0;
           bool ok = true;
      @@ -453,22 +621,23 @@ class Benchmark {
             ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
             produced += compressed.size();
             bytes += input.size();
      -      FinishedSingleOp();
      +      thread->stats.FinishedSingleOp();
           }
       
           if (!ok) {
      -      message_ = "(snappy failure)";
      +      thread->stats.AddMessage("(snappy failure)");
           } else {
             char buf[100];
             snprintf(buf, sizeof(buf), "(output: %.1f%%)",
                      (produced * 100.0) / bytes);
      -      message_ = buf;
      -      bytes_ = bytes;
      +      thread->stats.AddMessage(buf);
      +      thread->stats.AddBytes(bytes);
           }
         }
       
      -  void SnappyUncompress() {
      -    Slice input = gen_.Generate(Options().block_size);
      +  void SnappyUncompress(ThreadState* thread) {
      +    RandomGenerator gen;
      +    Slice input = gen.Generate(Options().block_size);
           std::string compressed;
           bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
           int64_t bytes = 0;
      @@ -477,14 +646,14 @@ class Benchmark {
             ok =  port::Snappy_Uncompress(compressed.data(), compressed.size(),
                                           uncompressed);
             bytes += input.size();
      -      FinishedSingleOp();
      +      thread->stats.FinishedSingleOp();
           }
           delete[] uncompressed;
       
           if (!ok) {
      -      message_ = "(snappy failure)";
      +      thread->stats.AddMessage("(snappy failure)");
           } else {
      -      bytes_ = bytes;
      +      thread->stats.AddBytes(bytes);
           }
         }
       
      @@ -501,95 +670,97 @@ class Benchmark {
           }
         }
       
      -  void Write(const WriteOptions& options, Order order, DBState state,
      -             int num_entries, int value_size, int entries_per_batch) {
      -    if (state == FRESH) {
      -      if (FLAGS_use_existing_db) {
      -        message_ = "skipping (--use_existing_db is true)";
      -        return;
      -      }
      -      delete db_;
      -      db_ = NULL;
      -      DestroyDB(FLAGS_db, Options());
      -      Open();
      -      Start();  // Do not count time taken to destroy/open
      -    }
      +  void WriteSeq(ThreadState* thread) {
      +    DoWrite(thread, true);
      +  }
       
      -    if (num_entries != num_) {
      +  void WriteRandom(ThreadState* thread) {
      +    DoWrite(thread, false);
      +  }
      +
      +  void DoWrite(ThreadState* thread, bool seq) {
      +    if (num_ != FLAGS_num) {
             char msg[100];
      -      snprintf(msg, sizeof(msg), "(%d ops)", num_entries);
      -      message_ = msg;
      +      snprintf(msg, sizeof(msg), "(%d ops)", num_);
      +      thread->stats.AddMessage(msg);
           }
       
      +    RandomGenerator gen;
           WriteBatch batch;
           Status s;
           std::string val;
      -    for (int i = 0; i < num_entries; i += entries_per_batch) {
      +    int64_t bytes = 0;
      +    for (int i = 0; i < num_; i += entries_per_batch_) {
             batch.Clear();
      -      for (int j = 0; j < entries_per_batch; j++) {
      -        const int k = (order == SEQUENTIAL) ? i+j : (rand_.Next() % FLAGS_num);
      +      for (int j = 0; j < entries_per_batch_; j++) {
      +        const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
               char key[100];
               snprintf(key, sizeof(key), "%016d", k);
      -        batch.Put(key, gen_.Generate(value_size));
      -        bytes_ += value_size + strlen(key);
      -        FinishedSingleOp();
      +        batch.Put(key, gen.Generate(value_size_));
      +        bytes += value_size_ + strlen(key);
      +        thread->stats.FinishedSingleOp();
             }
      -      s = db_->Write(options, &batch);
      +      s = db_->Write(write_options_, &batch);
             if (!s.ok()) {
               fprintf(stderr, "put error: %s\n", s.ToString().c_str());
               exit(1);
             }
           }
      +    thread->stats.AddBytes(bytes);
         }
       
      -  void ReadSequential() {
      +  void ReadSequential(ThreadState* thread) {
           Iterator* iter = db_->NewIterator(ReadOptions());
           int i = 0;
      +    int64_t bytes = 0;
           for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
      -      bytes_ += iter->key().size() + iter->value().size();
      -      FinishedSingleOp();
      +      bytes += iter->key().size() + iter->value().size();
      +      thread->stats.FinishedSingleOp();
             ++i;
           }
           delete iter;
      +    thread->stats.AddBytes(bytes);
         }
       
      -  void ReadReverse() {
      +  void ReadReverse(ThreadState* thread) {
           Iterator* iter = db_->NewIterator(ReadOptions());
           int i = 0;
      +    int64_t bytes = 0;
           for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
      -      bytes_ += iter->key().size() + iter->value().size();
      -      FinishedSingleOp();
      +      bytes += iter->key().size() + iter->value().size();
      +      thread->stats.FinishedSingleOp();
             ++i;
           }
           delete iter;
      +    thread->stats.AddBytes(bytes);
         }
       
      -  void ReadRandom() {
      +  void ReadRandom(ThreadState* thread) {
           ReadOptions options;
           std::string value;
           for (int i = 0; i < reads_; i++) {
             char key[100];
      -      const int k = rand_.Next() % FLAGS_num;
      +      const int k = thread->rand.Next() % FLAGS_num;
             snprintf(key, sizeof(key), "%016d", k);
             db_->Get(options, key, &value);
      -      FinishedSingleOp();
      +      thread->stats.FinishedSingleOp();
           }
         }
       
      -  void ReadHot() {
      +  void ReadHot(ThreadState* thread) {
           ReadOptions options;
           std::string value;
           const int range = (FLAGS_num + 99) / 100;
           for (int i = 0; i < reads_; i++) {
             char key[100];
      -      const int k = rand_.Next() % range;
      +      const int k = thread->rand.Next() % range;
             snprintf(key, sizeof(key), "%016d", k);
             db_->Get(options, key, &value);
      -      FinishedSingleOp();
      +      thread->stats.FinishedSingleOp();
           }
         }
       
      -  void Compact() {
      +  void Compact(ThreadState* thread) {
     DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
           dbi->TEST_CompactMemTable();
           int max_level_with_files = 1;
      @@ -609,10 +780,9 @@ class Benchmark {
         void PrintStats() {
           std::string stats;
           if (!db_->GetProperty("leveldb.stats", &stats)) {
      -      message_ = "(failed)";
      -    } else {
      -      post_message_ = stats;
      +      stats = "(failed)";
           }
      +    fprintf(stdout, "\n%s\n", stats.c_str());
         }
       
         static void WriteToFile(void* arg, const char* buf, int n) {
      @@ -625,13 +795,13 @@ class Benchmark {
           WritableFile* file;
           Status s = Env::Default()->NewWritableFile(fname, &file);
           if (!s.ok()) {
      -      message_ = s.ToString();
      +      fprintf(stderr, "%s\n", s.ToString().c_str());
             return;
           }
           bool ok = port::GetHeapProfile(WriteToFile, file);
           delete file;
           if (!ok) {
      -      message_ = "not supported";
      +      fprintf(stderr, "heap profiling not supported\n");
             Env::Default()->DeleteFile(fname);
           }
         }
      @@ -661,6 +831,8 @@ int main(int argc, char** argv) {
             FLAGS_num = n;
           } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
             FLAGS_reads = n;
      +    } else if (sscanf(argv[i], "--threads=%d%c", &n, &junk) == 1) {
      +      FLAGS_threads = n;
           } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
             FLAGS_value_size = n;
           } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
      diff --git a/db/db_impl.cc b/db/db_impl.cc
      index fff4eaf..c4c6a61 100644
      --- a/db/db_impl.cc
      +++ b/db/db_impl.cc
      @@ -989,27 +989,37 @@ Status DBImpl::Get(const ReadOptions& options,
           snapshot = versions_->LastSequence();
         }
       
      -  // First look in the memtable, then in the immutable memtable (if any).
      -  LookupKey lkey(key, snapshot);
      -  if (mem_->Get(lkey, value, &s)) {
      -    return s;
      -  }
      -  if (imm_ != NULL && imm_->Get(lkey, value, &s)) {
      -    return s;
      -  }
      -
      -  // Not in memtable(s); try live files in level order
      +  MemTable* mem = mem_;
      +  MemTable* imm = imm_;
         Version* current = versions_->current();
      +  mem->Ref();
      +  if (imm != NULL) imm->Ref();
         current->Ref();
      +
      +  bool have_stat_update = false;
         Version::GetStats stats;
      -  { // Unlock while reading from files
      +
      +  // Unlock while reading from files and memtables
      +  {
           mutex_.Unlock();
      -    s = current->Get(options, lkey, value, &stats);
      +    // First look in the memtable, then in the immutable memtable (if any).
      +    LookupKey lkey(key, snapshot);
      +    if (mem_->Get(lkey, value, &s)) {
      +      // Done
      +    } else if (imm_ != NULL && imm_->Get(lkey, value, &s)) {
      +      // Done
      +    } else {
      +      s = current->Get(options, lkey, value, &stats);
      +      have_stat_update = true;
      +    }
           mutex_.Lock();
         }
      -  if (current->UpdateStats(stats)) {
      +
      +  if (have_stat_update && current->UpdateStats(stats)) {
           MaybeScheduleCompaction();
         }
      +  mem->Unref();
      +  if (imm != NULL) imm->Unref();
         current->Unref();
         return s;
       }
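Two details in the Get() rewrite above are worth spelling out.  First, the
mutex is dropped while memtables and files are read, and a concurrent
compaction may swap mem_/imm_/current out from under the reader, so the
objects captured under the lock are pinned with reference counts and stay
alive until the trailing Unref() calls.  Second, as committed here the
unlocked block still consults the member fields mem_ and imm_ rather than
the pinned locals; that is exactly the race the follow-up commit below
fixes ("It should have been using the pinned memtables, not the current
memtables").  The shape of the idiom, condensed (Ref/Unref are leveldb's
existing reference counts; error handling elided):

  MemTable* mem = mem_;                    // capture under mutex_
  MemTable* imm = imm_;
  Version* current = versions_->current();
  mem->Ref();
  if (imm != NULL) imm->Ref();
  current->Ref();

  mutex_.Unlock();
  // ... read only through mem/imm/current here, never mem_/imm_ ...
  mutex_.Lock();

  mem->Unref();
  if (imm != NULL) imm->Unref();
  current->Unref();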
      diff --git a/util/cache.cc b/util/cache.cc
      index 5cff3dd..ce99f08 100644
      --- a/util/cache.cc
      +++ b/util/cache.cc
      @@ -30,7 +30,8 @@ struct LRUHandle {
         LRUHandle* prev;
         size_t charge;      // TODO(opt): Only allow uint32_t?
         size_t key_length;
      -  size_t refs;        // TODO(opt): Pack with "key_length"?
      +  uint32_t refs;
      +  uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
         char key_data[1];   // Beginning of key
       
         Slice key() const {
      @@ -54,12 +55,12 @@ class HandleTable {
         HandleTable() : length_(0), elems_(0), list_(NULL) { Resize(); }
         ~HandleTable() { delete[] list_; }
       
      -  LRUHandle* Lookup(LRUHandle* h) {
      -    return *FindPointer(h);
      +  LRUHandle* Lookup(const Slice& key, uint32_t hash) {
      +    return *FindPointer(key, hash);
         }
       
         LRUHandle* Insert(LRUHandle* h) {
      -    LRUHandle** ptr = FindPointer(h);
      +    LRUHandle** ptr = FindPointer(h->key(), h->hash);
           LRUHandle* old = *ptr;
           h->next_hash = (old == NULL ? NULL : old->next_hash);
           *ptr = h;
      @@ -74,8 +75,8 @@ class HandleTable {
           return old;
         }
       
      -  LRUHandle* Remove(LRUHandle* h) {
      -    LRUHandle** ptr = FindPointer(h);
      +  LRUHandle* Remove(const Slice& key, uint32_t hash) {
      +    LRUHandle** ptr = FindPointer(key, hash);
           LRUHandle* result = *ptr;
           if (result != NULL) {
             *ptr = result->next_hash;
      @@ -92,13 +93,12 @@ class HandleTable {
         LRUHandle** list_;
       
         // Return a pointer to slot that points to a cache entry that
      -  // matches *h.  If there is no such cache entry, return a pointer to
      -  // the trailing slot in the corresponding linked list.
      -  LRUHandle** FindPointer(LRUHandle* h) {
      -    Slice key = h->key();
      -    uint32_t hash = Hash(key.data(), key.size(), 0);
      +  // matches key/hash.  If there is no such cache entry, return a
      +  // pointer to the trailing slot in the corresponding linked list.
      +  LRUHandle** FindPointer(const Slice& key, uint32_t hash) {
           LRUHandle** ptr = &list_[hash & (length_ - 1)];
      -    while (*ptr != NULL && key != (*ptr)->key()) {
      +    while (*ptr != NULL &&
      +           ((*ptr)->hash != hash || key != (*ptr)->key())) {
             ptr = &(*ptr)->next_hash;
           }
           return ptr;
      @@ -117,7 +117,7 @@ class HandleTable {
             while (h != NULL) {
               LRUHandle* next = h->next_hash;
               Slice key = h->key();
      -        uint32_t hash = Hash(key.data(), key.size(), 0);
      +        uint32_t hash = h->hash;
               LRUHandle** ptr = &new_list[hash & (new_length - 1)];
               h->next_hash = *ptr;
               *ptr = h;
      @@ -132,26 +132,30 @@ class HandleTable {
         }
       };
       
      -class LRUCache : public Cache {
      +// A single shard of sharded cache.
      +class LRUCache {
        public:
      -  explicit LRUCache(size_t capacity);
      -  virtual ~LRUCache();
      +  LRUCache();
      +  ~LRUCache();
       
      -  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
      -                         void (*deleter)(const Slice& key, void* value));
      -  virtual Handle* Lookup(const Slice& key);
      -  virtual void Release(Handle* handle);
      -  virtual void* Value(Handle* handle);
      -  virtual void Erase(const Slice& key);
      -  virtual uint64_t NewId();
      +  // Separate from constructor so caller can easily make an array of LRUCache
      +  void SetCapacity(size_t capacity) { capacity_ = capacity; }
      +
      +  // Like Cache methods, but with an extra "hash" parameter.
      +  Cache::Handle* Insert(const Slice& key, uint32_t hash,
      +                        void* value, size_t charge,
      +                        void (*deleter)(const Slice& key, void* value));
      +  Cache::Handle* Lookup(const Slice& key, uint32_t hash);
      +  void Release(Cache::Handle* handle);
      +  void Erase(const Slice& key, uint32_t hash);
       
        private:
         void LRU_Remove(LRUHandle* e);
         void LRU_Append(LRUHandle* e);
         void Unref(LRUHandle* e);
       
      -  // Constructor parameters
      -  const size_t capacity_;
      +  // Initialized before use.
      +  size_t capacity_;
       
         // mutex_ protects the following state.
         port::Mutex mutex_;
      @@ -165,9 +169,8 @@ class LRUCache : public Cache {
         HandleTable table_;
       };
       
      -LRUCache::LRUCache(size_t capacity)
      -    : capacity_(capacity),
      -      usage_(0),
      +LRUCache::LRUCache()
      +    : usage_(0),
             last_id_(0) {
         // Make empty circular linked list
         lru_.next = &lru_;
      @@ -206,32 +209,25 @@ void LRUCache::LRU_Append(LRUHandle* e) {
         e->next->prev = e;
       }
       
      -Cache::Handle* LRUCache::Lookup(const Slice& key) {
      +Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
         MutexLock l(&mutex_);
      -
      -  LRUHandle dummy;
      -  dummy.next = &dummy;
-  dummy.value = const_cast<Slice*>(&key);
      -  LRUHandle* e = table_.Lookup(&dummy);
      +  LRUHandle* e = table_.Lookup(key, hash);
         if (e != NULL) {
           e->refs++;
           LRU_Remove(e);
           LRU_Append(e);
         }
-  return reinterpret_cast<Handle*>(e);
+  return reinterpret_cast<Cache::Handle*>(e);
       }
       
      -void* LRUCache::Value(Handle* handle) {
-  return reinterpret_cast<LRUHandle*>(handle)->value;
      -}
      -
      -void LRUCache::Release(Handle* handle) {
      +void LRUCache::Release(Cache::Handle* handle) {
         MutexLock l(&mutex_);
   Unref(reinterpret_cast<LRUHandle*>(handle));
       }
       
      -Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
      -                             void (*deleter)(const Slice& key, void* value)) {
      +Cache::Handle* LRUCache::Insert(
      +    const Slice& key, uint32_t hash, void* value, size_t charge,
      +    void (*deleter)(const Slice& key, void* value)) {
         MutexLock l(&mutex_);
       
   LRUHandle* e = reinterpret_cast<LRUHandle*>(
      @@ -240,6 +236,7 @@ Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
         e->deleter = deleter;
         e->charge = charge;
         e->key_length = key.size();
      +  e->hash = hash;
         e->refs = 2;  // One from LRUCache, one for the returned handle
         memcpy(e->key_data, key.data(), key.size());
         LRU_Append(e);
      @@ -254,35 +251,77 @@ Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,
         while (usage_ > capacity_ && lru_.next != &lru_) {
           LRUHandle* old = lru_.next;
           LRU_Remove(old);
      -    table_.Remove(old);
      +    table_.Remove(old->key(), old->hash);
           Unref(old);
         }
       
-  return reinterpret_cast<Handle*>(e);
+  return reinterpret_cast<Cache::Handle*>(e);
       }
       
      -void LRUCache::Erase(const Slice& key) {
      +void LRUCache::Erase(const Slice& key, uint32_t hash) {
         MutexLock l(&mutex_);
      -
      -  LRUHandle dummy;
      -  dummy.next = &dummy;
-  dummy.value = const_cast<Slice*>(&key);
      -  LRUHandle* e = table_.Remove(&dummy);
      +  LRUHandle* e = table_.Remove(key, hash);
         if (e != NULL) {
           LRU_Remove(e);
           Unref(e);
         }
       }
       
      -uint64_t LRUCache::NewId() {
      -  MutexLock l(&mutex_);
      -  return ++(last_id_);
      -}
      +static const int kNumShardBits = 4;
      +static const int kNumShards = 1 << kNumShardBits;
      +
      +class ShardedLRUCache : public Cache {
      + private:
      +  LRUCache shard_[kNumShards];
      +  port::Mutex id_mutex_;
      +  uint64_t last_id_;
      +
      +  static inline uint32_t HashSlice(const Slice& s) {
      +    return Hash(s.data(), s.size(), 0);
      +  }
      +
      +  static uint32_t Shard(uint32_t hash) {
      +    return hash >> (32 - kNumShardBits);
      +  }
      +
      + public:
      +  explicit ShardedLRUCache(size_t capacity) {
      +    const size_t per_shard = (capacity + (kNumShards - 1)) / kNumShards;
      +    for (int s = 0; s < kNumShards; s++) {
      +      shard_[s].SetCapacity(per_shard);
      +    }
      +  }
      +  virtual ~ShardedLRUCache() { }
      +  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
      +                         void (*deleter)(const Slice& key, void* value)) {
      +    const uint32_t hash = HashSlice(key);
      +    return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter);
      +  }
      +  virtual Handle* Lookup(const Slice& key) {
      +    const uint32_t hash = HashSlice(key);
      +    return shard_[Shard(hash)].Lookup(key, hash);
      +  }
      +  virtual void Release(Handle* handle) {
+    LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
      +    shard_[Shard(h->hash)].Release(handle);
      +  }
      +  virtual void Erase(const Slice& key) {
      +    const uint32_t hash = HashSlice(key);
      +    shard_[Shard(hash)].Erase(key, hash);
      +  }
      +  virtual void* Value(Handle* handle) {
+    return reinterpret_cast<LRUHandle*>(handle)->value;
      +  }
      +  virtual uint64_t NewId() {
      +    MutexLock l(&id_mutex_);
      +    return ++(last_id_);
      +  }
      +};
       
       }  // end anonymous namespace
       
       Cache* NewLRUCache(size_t capacity) {
      -  return new LRUCache(capacity);
      +  return new ShardedLRUCache(capacity);
       }
       
       }
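The sharding scheme above deliberately splits one 32-bit hash two ways: the
top kNumShardBits bits pick the shard, while each shard's HandleTable
indexes buckets with the low bits (hash & (length_ - 1)), so the two
indices stay independent and no key is ever hashed twice.  Note also the
capacity rounding: with 16 shards, SetCapacity(ceil(capacity / 16)) gives a
1000-entry cache 16 * 63 = 1008 entries of total capacity, which is part of
why the relaxed bounds in the cache test below allow roughly 10% slack.  A
toy illustration of the routing (Hash() stands in for util/hash.h; the rest
is illustrative):

  #include <cstdint>
  #include <cstdio>

  static const int kNumShardBits = 4;
  static const int kNumShards = 1 << kNumShardBits;   // 16

  static uint32_t Shard(uint32_t hash) {
    return hash >> (32 - kNumShardBits);              // top 4 bits
  }

  int main() {
    const uint32_t h = 0xDEADBEEFu;   // stand-in for Hash(key.data(), ...)
    std::printf("shard %u, bucket %u (of 256)\n",
                (unsigned)Shard(h), (unsigned)(h & 255u));
    // prints: shard 13, bucket 239 (of 256)
    return 0;
  }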
      diff --git a/util/cache_test.cc b/util/cache_test.cc
      index dbab988..8a7f1c4 100644
      --- a/util/cache_test.cc
      +++ b/util/cache_test.cc
      @@ -32,7 +32,7 @@ class CacheTest {
           current_->deleted_values_.push_back(DecodeValue(v));
         }
       
      -  static const int kCacheSize = 100;
      +  static const int kCacheSize = 1000;
   std::vector<int> deleted_keys_;
   std::vector<int> deleted_values_;
         Cache* cache_;
      @@ -137,23 +137,40 @@ TEST(CacheTest, EvictionPolicy) {
         Insert(200, 201);
       
         // Frequently used entry must be kept around
      -  for (int i = 0; i < kCacheSize; i++) {
      +  for (int i = 0; i < kCacheSize + 100; i++) {
           Insert(1000+i, 2000+i);
           ASSERT_EQ(2000+i, Lookup(1000+i));
           ASSERT_EQ(101, Lookup(100));
         }
         ASSERT_EQ(101, Lookup(100));
      -  ASSERT_EQ(2, deleted_keys_.size());
      -  ASSERT_EQ(200, deleted_keys_[0]);
      -  ASSERT_EQ(201, deleted_values_[0]);
      +  ASSERT_EQ(-1, Lookup(200));
       }
       
      -TEST(CacheTest, HeavyEntry) {
      -  Insert(100, 101);
      -  Insert(200, 201, kCacheSize);
      -  ASSERT_EQ(1, deleted_keys_.size());
      -  ASSERT_EQ(100, deleted_keys_[0]);
      -  ASSERT_EQ(101, deleted_values_[0]);
      +TEST(CacheTest, HeavyEntries) {
      +  // Add a bunch of light and heavy entries and then count the combined
      +  // size of items still in the cache, which must be approximately the
      +  // same as the total capacity.
      +  const int kLight = 1;
      +  const int kHeavy = 10;
      +  int added = 0;
      +  int index = 0;
      +  while (added < 2*kCacheSize) {
      +    const int weight = (index & 1) ? kLight : kHeavy;
      +    Insert(index, 1000+index, weight);
      +    added += weight;
      +    index++;
      +  }
      +
      +  int cached_weight = 0;
      +  for (int i = 0; i < index; i++) {
      +    const int weight = (i & 1 ? kLight : kHeavy);
      +    int r = Lookup(i);
      +    if (r >= 0) {
      +      cached_weight += weight;
      +      ASSERT_EQ(1000+i, r);
      +    }
      +  }
      +  ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10);
       }
       
       TEST(CacheTest, NewId) {
      diff --git a/util/histogram.cc b/util/histogram.cc
      index c5178ef..12ec3cf 100644
      --- a/util/histogram.cc
      +++ b/util/histogram.cc
      @@ -55,6 +55,17 @@ void Histogram::Add(double value) {
         sum_squares_ += (value * value);
       }
       
      +void Histogram::Merge(const Histogram& other) {
      +  if (other.min_ < min_) min_ = other.min_;
      +  if (other.max_ > max_) max_ = other.max_;
      +  num_ += other.num_;
      +  sum_ += other.sum_;
      +  sum_squares_ += other.sum_squares_;
      +  for (int b = 0; b < kNumBuckets; b++) {
      +    buckets_[b] += other.buckets_[b];
      +  }
      +}
      +
       double Histogram::Median() const {
         return Percentile(50.0);
       }
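Bucket-wise addition is a lossless merge for everything the histogram
reports, because the summary statistics depend only on accumulators that
are themselves additive across threads:

  mean     = sum / num
  variance = sum_squares / num - (sum / num)^2

and the percentiles (median, P99, ...) are interpolated from the bucket
counts alone, which is why buckets_ must be summed element-wise.  min_ and
max_ are the only non-additive fields, and they merge with min/max as
above.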
      diff --git a/util/histogram.h b/util/histogram.h
      index f72f122..32484c0 100644
      --- a/util/histogram.h
      +++ b/util/histogram.h
      @@ -16,6 +16,7 @@ class Histogram {
       
         void Clear();
         void Add(double value);
      +  void Merge(const Histogram& other);
       
         std::string ToString() const;
       
      -- 
      cgit v1.2.3
      
      
      From 394a4b425a6a8aca3244fc26ec77c101a11a632c Mon Sep 17 00:00:00 2001
      From: "gabor@google.com"
       
      Date: Thu, 1 Sep 2011 19:08:02 +0000
      Subject: Bugfixes: for Get(), don't hold mutex while writing log.
      
      - Fix bug in Get: when it triggers a compaction, it could sometimes
        mark the compaction with the wrong level (if there was a gap
        in the set of levels examined for the Get).
      
      - Do not hold mutex while writing to the log file or to the
        MANIFEST file.
      
        Added a new benchmark that runs a writer thread concurrently with
        reader threads.
      
        Percentiles
        ------------------------------
        micros/op: avg  median 99   99.9  99.99  99.999 max
        ------------------------------------------------------
        before:    42   38     110  225   32000  42000  48000
        after:     24   20     55   65    130    1100   7000
      
      - Fixed race in optimized Get.  It should have been using the
        pinned memtables, not the current memtables.
      
      
      
      git-svn-id: http://leveldb.googlecode.com/svn/trunk@50 62dab493-f737-651d-591e-8d6aee1b9529
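The central trick in this change is a hand-rolled ticket: a writer becomes
"the logger" under mutex_, drops mutex_ for the slow log append and sync
(the logger_ token still excludes other writers from the log and from
mem_), and reacquires the mutex before publishing the new sequence number.
A condensed sketch of the protocol with standard-library primitives (the
real code below uses port::Mutex/port::CondVar and an opaque LoggerId
token; WriterGate and do_io are illustrative names):

  #include <condition_variable>
  #include <mutex>

  class WriterGate {
    std::mutex mu_;
    std::condition_variable cv_;
    bool logging_ = false;   // true while some thread is the logger

   public:
    // Serializes log writes without holding mu_ across the file I/O.
    template <typename WriteFn>
    void Write(WriteFn do_io) {
      std::unique_lock<std::mutex> l(mu_);
      cv_.wait(l, [this] { return !logging_; });  // AcquireLoggingResponsibility
      logging_ = true;
      l.unlock();     // slow part runs unlocked...
      do_io();        // ...but logging_ still keeps loggers serialized
      l.lock();
      logging_ = false;                           // ReleaseLoggingResponsibility
      cv_.notify_all();
      // publish state (e.g. the last sequence number) here, still locked
    }
  };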
      ---
       db/db_bench.cc    | 45 ++++++++++++++++++++++++++++++++----
       db/db_impl.cc     | 69 ++++++++++++++++++++++++++++++++++++++++++++-----------
       db/db_impl.h      |  7 ++++++
       db/db_test.cc     | 43 ++++++++++++++++++++++++++++++++--
       db/version_set.cc | 46 ++++++++++++++++++++++++++-----------
       db/version_set.h  | 15 ++++++++----
       6 files changed, 186 insertions(+), 39 deletions(-)
      
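One subtlety in the new readwhilewriting benchmark below: the writer is
thread 0 of num_threads + 1 threads, and it stops by polling

  thread->shared->num_done + 1 >= thread->shared->num_initialized

i.e. "everyone but me is done"; the + 1 accounts for the writer itself,
which is counted in num_initialized but will not increment num_done until
after this loop exits.  Calling thread->stats.Start() afterwards resets the
writer's own timing window and op count, so the merged report reflects the
readers' latency under write load rather than the background writes.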
      diff --git a/db/db_bench.cc b/db/db_bench.cc
      index d3ec61b..bb63e59 100644
      --- a/db/db_bench.cc
      +++ b/db/db_bench.cc
      @@ -280,6 +280,7 @@ struct ThreadState {
         int tid;             // 0..n-1 when running in n threads
         Random rand;         // Has different seeds for different threads
         Stats stats;
      +  SharedState* shared;
       
         ThreadState(int index)
             : tid(index),
      @@ -418,13 +419,14 @@ class Benchmark {
       
       // Reset parameters that may be overridden below
             num_ = FLAGS_num;
      -      reads_ = num_;
      +      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
             value_size_ = FLAGS_value_size;
             entries_per_batch_ = 1;
             write_options_ = WriteOptions();
       
             void (Benchmark::*method)(ThreadState*) = NULL;
             bool fresh_db = false;
      +      int num_threads = FLAGS_threads;
       
             if (name == Slice("fillseq")) {
               fresh_db = true;
      @@ -460,6 +462,9 @@ class Benchmark {
             } else if (name == Slice("readrandomsmall")) {
               reads_ /= 1000;
               method = &Benchmark::ReadRandom;
      +      } else if (name == Slice("readwhilewriting")) {
      +        num_threads++;  // Add extra thread for writing
      +        method = &Benchmark::ReadWhileWriting;
             } else if (name == Slice("compact")) {
               method = &Benchmark::Compact;
             } else if (name == Slice("crc32c")) {
      @@ -494,7 +499,7 @@ class Benchmark {
             }
       
             if (method != NULL) {
      -        RunBenchmark(name, method);
      +        RunBenchmark(num_threads, name, method);
             }
           }
         }
      @@ -535,8 +540,8 @@ class Benchmark {
           }
         }
       
      -  void RunBenchmark(Slice name, void (Benchmark::*method)(ThreadState*)) {
      -    const int n = FLAGS_threads;
      +  void RunBenchmark(int n, Slice name,
      +                    void (Benchmark::*method)(ThreadState*)) {
           SharedState shared;
           shared.total = n;
           shared.num_initialized = 0;
      @@ -549,6 +554,7 @@ class Benchmark {
             arg[i].method = method;
             arg[i].shared = &shared;
             arg[i].thread = new ThreadState(i);
      +      arg[i].thread->shared = &shared;
             Env::Default()->StartThread(ThreadBody, &arg[i]);
           }
       
      @@ -688,7 +694,6 @@ class Benchmark {
           RandomGenerator gen;
           WriteBatch batch;
           Status s;
      -    std::string val;
           int64_t bytes = 0;
           for (int i = 0; i < num_; i += entries_per_batch_) {
             batch.Clear();
      @@ -760,6 +765,36 @@ class Benchmark {
           }
         }
       
      +  void ReadWhileWriting(ThreadState* thread) {
      +    if (thread->tid > 0) {
      +      ReadRandom(thread);
      +    } else {
      +      // Special thread that keeps writing until other threads are done.
      +      RandomGenerator gen;
      +      while (true) {
      +        {
      +          MutexLock l(&thread->shared->mu);
      +          if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
      +            // Other threads have finished
      +            break;
      +          }
      +        }
      +
      +        const int k = thread->rand.Next() % FLAGS_num;
      +        char key[100];
      +        snprintf(key, sizeof(key), "%016d", k);
      +        Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
      +        if (!s.ok()) {
      +          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
      +          exit(1);
      +        }
      +      }
      +
      +      // Do not count any of the preceding work/delay in stats.
      +      thread->stats.Start();
      +    }
      +  }
      +
         void Compact(ThreadState* thread) {
     DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
           dbi->TEST_CompactMemTable();
      diff --git a/db/db_impl.cc b/db/db_impl.cc
      index c4c6a61..0ca6386 100644
      --- a/db/db_impl.cc
      +++ b/db/db_impl.cc
      @@ -113,6 +113,8 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
             logfile_(NULL),
             logfile_number_(0),
             log_(NULL),
      +      logger_(NULL),
      +      logger_cv_(&mutex_),
             bg_compaction_scheduled_(false),
             manual_compaction_(NULL) {
         mem_->Ref();
      @@ -308,6 +310,11 @@ Status DBImpl::Recover(VersionEdit* edit) {
           std::sort(logs.begin(), logs.end());
           for (size_t i = 0; i < logs.size(); i++) {
             s = RecoverLogFile(logs[i], edit, &max_sequence);
      +
      +      // The previous incarnation may not have written any MANIFEST
      +      // records after allocating this log number.  So we manually
      +      // update the file number allocation counter in VersionSet.
      +      versions_->MarkFileNumberUsed(logs[i]);
           }
       
           if (s.ok()) {
      @@ -485,7 +492,7 @@ Status DBImpl::CompactMemTable() {
         if (s.ok()) {
           edit.SetPrevLogNumber(0);
           edit.SetLogNumber(logfile_number_);  // Earlier logs no longer needed
      -    s = versions_->LogAndApply(&edit);
      +    s = versions_->LogAndApply(&edit, &mutex_);
         }
       
         if (s.ok()) {
      @@ -523,7 +530,10 @@ void DBImpl::TEST_CompactRange(
       
       Status DBImpl::TEST_CompactMemTable() {
         MutexLock l(&mutex_);
      +  LoggerId self;
      +  AcquireLoggingResponsibility(&self);
         Status s = MakeRoomForWrite(true /* force compaction */);
      +  ReleaseLoggingResponsibility(&self);
         if (s.ok()) {
           // Wait until the compaction completes
           while (imm_ != NULL && bg_error_.ok()) {
      @@ -600,7 +610,7 @@ void DBImpl::BackgroundCompaction() {
           c->edit()->DeleteFile(c->level(), f->number);
           c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
                              f->smallest, f->largest);
      -    status = versions_->LogAndApply(c->edit());
      +    status = versions_->LogAndApply(c->edit(), &mutex_);
           VersionSet::LevelSummaryStorage tmp;
           Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
         static_cast<unsigned long long>(f->number),
      @@ -748,7 +758,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
         }
         compact->outputs.clear();
       
      -  Status s = versions_->LogAndApply(compact->compaction->edit());
      +  Status s = versions_->LogAndApply(compact->compaction->edit(), &mutex_);
         if (s.ok()) {
           compact->compaction->ReleaseInputs();
           DeleteObsoleteFiles();
      @@ -1004,9 +1014,9 @@ Status DBImpl::Get(const ReadOptions& options,
           mutex_.Unlock();
           // First look in the memtable, then in the immutable memtable (if any).
           LookupKey lkey(key, snapshot);
      -    if (mem_->Get(lkey, value, &s)) {
      +    if (mem->Get(lkey, value, &s)) {
             // Done
      -    } else if (imm_ != NULL && imm_->Get(lkey, value, &s)) {
      +    } else if (imm != NULL && imm->Get(lkey, value, &s)) {
             // Done
           } else {
             s = current->Get(options, lkey, value, &stats);
      @@ -1053,34 +1063,65 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
         return DB::Delete(options, key);
       }
       
      +// There is at most one thread that is the current logger.  This call
      +// waits until preceding logger(s) have finished and becomes the
      +// current logger.
      +void DBImpl::AcquireLoggingResponsibility(LoggerId* self) {
      +  while (logger_ != NULL) {
      +    logger_cv_.Wait();
      +  }
      +  logger_ = self;
      +}
      +
      +void DBImpl::ReleaseLoggingResponsibility(LoggerId* self) {
      +  assert(logger_ == self);
      +  logger_ = NULL;
      +  logger_cv_.SignalAll();
      +}
      +
       Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
         Status status;
         MutexLock l(&mutex_);
      +  LoggerId self;
      +  AcquireLoggingResponsibility(&self);
         status = MakeRoomForWrite(false);  // May temporarily release lock and wait
         uint64_t last_sequence = versions_->LastSequence();
         if (status.ok()) {
           WriteBatchInternal::SetSequence(updates, last_sequence + 1);
           last_sequence += WriteBatchInternal::Count(updates);
      -    versions_->SetLastSequence(last_sequence);
       
      -    // Add to log and apply to memtable
      -    status = log_->AddRecord(WriteBatchInternal::Contents(updates));
      -    if (status.ok() && options.sync) {
      -      status = logfile_->Sync();
      -    }
      -    if (status.ok()) {
      -      status = WriteBatchInternal::InsertInto(updates, mem_);
      +    // Add to log and apply to memtable.  We can release the lock during
      +    // this phase since the "logger_" flag protects against concurrent
      +    // loggers and concurrent writes into mem_.
      +    {
      +      assert(logger_ == &self);
      +      mutex_.Unlock();
      +      status = log_->AddRecord(WriteBatchInternal::Contents(updates));
      +      if (status.ok() && options.sync) {
      +        status = logfile_->Sync();
      +      }
      +      if (status.ok()) {
      +        status = WriteBatchInternal::InsertInto(updates, mem_);
      +      }
      +      mutex_.Lock();
      +      assert(logger_ == &self);
           }
      +
      +    versions_->SetLastSequence(last_sequence);
         }
         if (options.post_write_snapshot != NULL) {
           *options.post_write_snapshot =
               status.ok() ? snapshots_.New(last_sequence) : NULL;
         }
      +  ReleaseLoggingResponsibility(&self);
         return status;
       }
       
      +// REQUIRES: mutex_ is held
      +// REQUIRES: this thread is the current logger
       Status DBImpl::MakeRoomForWrite(bool force) {
         mutex_.AssertHeld();
      +  assert(logger_ != NULL);
         bool allow_delay = !force;
         Status s;
         while (true) {
      @@ -1249,7 +1290,7 @@ Status DB::Open(const Options& options, const std::string& dbname,
             impl->logfile_ = lfile;
             impl->logfile_number_ = new_log_number;
             impl->log_ = new log::Writer(lfile);
      -      s = impl->versions_->LogAndApply(&edit);
      +      s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
           }
           if (s.ok()) {
             impl->DeleteObsoleteFiles();
      diff --git a/db/db_impl.h b/db/db_impl.h
      index f11ea55..5268137 100644
      --- a/db/db_impl.h
      +++ b/db/db_impl.h
      @@ -87,6 +87,11 @@ class DBImpl : public DB {
       
         Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base);
       
+  // Only one thread is allowed to log at a time.
      +  struct LoggerId { };          // Opaque identifier for logging thread
      +  void AcquireLoggingResponsibility(LoggerId* self);
      +  void ReleaseLoggingResponsibility(LoggerId* self);
      +
         Status MakeRoomForWrite(bool force /* compact even if there is room? */);
       
         struct CompactionState;
      @@ -126,6 +131,8 @@ class DBImpl : public DB {
         WritableFile* logfile_;
         uint64_t logfile_number_;
         log::Writer* log_;
      +  LoggerId* logger_;            // NULL, or the id of the current logging thread
      +  port::CondVar logger_cv_;     // For threads waiting to log
         SnapshotList snapshots_;
       
         // Set of table files to protect from deletion because they are
      diff --git a/db/db_test.cc b/db/db_test.cc
      index 14eb44d..daa9c03 100644
      --- a/db/db_test.cc
      +++ b/db/db_test.cc
      @@ -10,6 +10,7 @@
       #include "leveldb/env.h"
       #include "leveldb/table.h"
       #include "util/logging.h"
      +#include "util/mutexlock.h"
       #include "util/testharness.h"
       #include "util/testutil.h"
       
      @@ -345,6 +346,41 @@ TEST(DBTest, GetPicksCorrectFile) {
         ASSERT_EQ("vx", Get("x"));
       }
       
      +TEST(DBTest, GetEncountersEmptyLevel) {
      +  // Arrange for the following to happen:
      +  //   * sstable A in level 0
      +  //   * nothing in level 1
      +  //   * sstable B in level 2
      +  // Then do enough Get() calls to arrange for an automatic compaction
      +  // of sstable A.  A bug would cause the compaction to be marked as
+  // occurring at level 1 (instead of the correct level 0).
      +
      +  // Step 1: First place sstables in levels 0 and 2
      +  int compaction_count = 0;
      +  while (NumTableFilesAtLevel(0) == 0 ||
      +         NumTableFilesAtLevel(2) == 0) {
      +    ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2";
      +    compaction_count++;
      +    Put("a", "begin");
      +    Put("z", "end");
      +    dbfull()->TEST_CompactMemTable();
      +  }
      +
      +  // Step 2: clear level 1 if necessary.
      +  dbfull()->TEST_CompactRange(1, "a", "z");
      +  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
      +  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
      +  ASSERT_EQ(NumTableFilesAtLevel(2), 1);
      +
      +  // Step 3: read until level 0 compaction disappears.
      +  int read_count = 0;
      +  while (NumTableFilesAtLevel(0) > 0) {
      +    ASSERT_LE(read_count, 10000) << "did not trigger level 0 compaction";
      +    read_count++;
      +    ASSERT_EQ("NOT_FOUND", Get("missing"));
      +  }
      +}
      +
       TEST(DBTest, IterEmpty) {
         Iterator* iter = db_->NewIterator(ReadOptions());
       
      @@ -1355,6 +1391,9 @@ void BM_LogAndApply(int iters, int num_base_files) {
       
         Env* env = Env::Default();
       
      +  port::Mutex mu;
      +  MutexLock l(&mu);
      +
         InternalKeyComparator cmp(BytewiseComparator());
         Options options;
         VersionSet vset(dbname, &options, NULL, &cmp);
      @@ -1366,7 +1405,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
           InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
           vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
         }
      -  ASSERT_OK(vset.LogAndApply(&vbase));
      +  ASSERT_OK(vset.LogAndApply(&vbase, &mu));
       
         uint64_t start_micros = env->NowMicros();
       
      @@ -1376,7 +1415,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
           InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
           InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
           vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
      -    vset.LogAndApply(&vedit);
      +    vset.LogAndApply(&vedit, &mu);
         }
         uint64_t stop_micros = env->NowMicros();
         unsigned int us = stop_micros - start_micros;
      diff --git a/db/version_set.cc b/db/version_set.cc
      index aace624..d75b347 100644
      --- a/db/version_set.cc
      +++ b/db/version_set.cc
      @@ -250,6 +250,7 @@ Status Version::Get(const ReadOptions& options,
         stats->seek_file = NULL;
         stats->seek_file_level = -1;
         FileMetaData* last_file_read = NULL;
      +  int last_file_read_level = -1;
       
         // We can search level-by-level since entries never hop across
         // levels.  Therefore we are guaranteed that if we find data
      @@ -301,11 +302,12 @@ Status Version::Get(const ReadOptions& options,
             if (last_file_read != NULL && stats->seek_file == NULL) {
               // We have had more than one seek for this read.  Charge the 1st file.
               stats->seek_file = last_file_read;
      -        stats->seek_file_level = (i == 0 ? level - 1 : level);
      +        stats->seek_file_level = last_file_read_level;
             }
       
             FileMetaData* f = files[i];
             last_file_read = f;
      +      last_file_read_level = level;
       
             Iterator* iter = vset_->table_cache_->NewIterator(
                 options,
      @@ -609,7 +611,7 @@ void VersionSet::AppendVersion(Version* v) {
         v->next_->prev_ = v;
       }
       
      -Status VersionSet::LogAndApply(VersionEdit* edit) {
      +Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
         if (edit->has_log_number_) {
           assert(edit->log_number_ >= log_number_);
           assert(edit->log_number_ < next_file_number_);
      @@ -637,6 +639,8 @@ Status VersionSet::LogAndApply(VersionEdit* edit) {
         std::string new_manifest_file;
         Status s;
         if (descriptor_log_ == NULL) {
      +    // No reason to unlock *mu here since we only hit this path in the
      +    // first call to LogAndApply (when opening the database).
           assert(descriptor_file_ == NULL);
           new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
           edit->SetNextFile(next_file_number_);
      @@ -647,20 +651,27 @@ Status VersionSet::LogAndApply(VersionEdit* edit) {
           }
         }
       
      -  // Write new record to MANIFEST log
      -  if (s.ok()) {
      -    std::string record;
      -    edit->EncodeTo(&record);
      -    s = descriptor_log_->AddRecord(record);
      +  // Unlock during expensive MANIFEST log write
      +  {
      +    mu->Unlock();
      +
      +    // Write new record to MANIFEST log
           if (s.ok()) {
      -      s = descriptor_file_->Sync();
      +      std::string record;
      +      edit->EncodeTo(&record);
      +      s = descriptor_log_->AddRecord(record);
      +      if (s.ok()) {
      +        s = descriptor_file_->Sync();
      +      }
           }
      -  }
       
      -  // If we just created a new descriptor file, install it by writing a
      -  // new CURRENT file that points to it.
      -  if (s.ok() && !new_manifest_file.empty()) {
      -    s = SetCurrentFile(env_, dbname_, manifest_file_number_);
      +    // If we just created a new descriptor file, install it by writing a
      +    // new CURRENT file that points to it.
      +    if (s.ok() && !new_manifest_file.empty()) {
      +      s = SetCurrentFile(env_, dbname_, manifest_file_number_);
      +    }
      +
      +    mu->Lock();
         }
       
         // Install the new version
      @@ -776,6 +787,9 @@ Status VersionSet::Recover() {
           if (!have_prev_log_number) {
             prev_log_number = 0;
           }
      +
      +    MarkFileNumberUsed(prev_log_number);
      +    MarkFileNumberUsed(log_number);
         }
       
         if (s.ok()) {
      @@ -794,6 +808,12 @@ Status VersionSet::Recover() {
         return s;
       }
       
      +void VersionSet::MarkFileNumberUsed(uint64_t number) {
      +  if (next_file_number_ <= number) {
      +    next_file_number_ = number + 1;
      +  }
      +}
      +
 static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
         int64_t sum = 0;
         for (size_t i = 0; i < files.size(); i++) {
      diff --git a/db/version_set.h b/db/version_set.h
      index 693fc6f..2dbd948 100644
      --- a/db/version_set.h
      +++ b/db/version_set.h
      @@ -138,15 +138,14 @@ class VersionSet {
       
         // Apply *edit to the current version to form a new descriptor that
         // is both saved to persistent state and installed as the new
      -  // current version.
      -  Status LogAndApply(VersionEdit* edit);
      +  // current version.  Will release *mu while actually writing to the file.
      +  // REQUIRES: *mu is held on entry.
      +  // REQUIRES: no other thread concurrently calls LogAndApply()
      +  Status LogAndApply(VersionEdit* edit, port::Mutex* mu);
       
         // Recover the last saved descriptor from persistent storage.
         Status Recover();
       
      -  // Save current contents to *log
      -  Status WriteSnapshot(log::Writer* log);
      -
         // Return the current version.
         Version* current() const { return current_; }
       
      @@ -171,6 +170,9 @@ class VersionSet {
           last_sequence_ = s;
         }
       
      +  // Mark the specified file number as used.
      +  void MarkFileNumberUsed(uint64_t number);
      +
         // Return the current log file number.
         uint64_t LogNumber() const { return log_number_; }
       
      @@ -247,6 +249,9 @@ class VersionSet {
       
         void SetupOtherInputs(Compaction* c);
       
      +  // Save current contents to *log
      +  Status WriteSnapshot(log::Writer* log);
      +
         void AppendVersion(Version* v);
       
         Env* const env_;
      -- 
      cgit v1.2.3
      
      
      From a7d6c3178930c1ebb77a45a7378b9251d707912a Mon Sep 17 00:00:00 2001
      From: "gabor@google.com"
       
      Date: Thu, 15 Sep 2011 21:01:32 +0000
      Subject: Sync with upstream @23860137.
      
      Fix GCC -Wshadow warnings in LevelDB's public header files,
      reported by Dustin.
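
[Note: the shadowing comes from constructor parameters that share a name with
the accessor next to them, as the env.h and slice.h hunks below show. A
hypothetical reduction (Holder is an invented name):

    class Holder {
     public:
      // Older GCC -Wshadow: parameter 'data' shadows Holder::data().
      explicit Holder(const char* data) : data_(data) {}
      const char* data() const { return data_; }
     private:
      const char* data_;
    };
    // The fix mirrors the slice.h hunk: rename the parameter.
    //   explicit Holder(const char* d) : data_(d) {}
]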
      
      Add in-memory Env implementation (helpers/memenv/*).
      This enables users to create LevelDB databases in-memory.
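
[Note: a minimal use of the helper might look like the sketch below (error
handling omitted; the path "/in-mem/db" is arbitrary since nothing touches
disk). It follows the same shape as the DBTest case in memenv_test.cc:

    #include "helpers/memenv/memenv.h"
    #include "leveldb/db.h"

    int main() {
      leveldb::Env* env = leveldb::NewMemEnv(leveldb::Env::Default());
      leveldb::Options options;
      options.create_if_missing = true;
      options.env = env;  // route all file I/O for this DB into memory

      leveldb::DB* db;
      if (leveldb::DB::Open(options, "/in-mem/db", &db).ok()) {
        db->Put(leveldb::WriteOptions(), "key", "value");
        delete db;   // close the DB before deleting its Env
      }
      delete env;
      return 0;
    }
]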
      
      Initialize ShardedLRUCache::last_id_ to zero.
      This fixes a Valgrind warning.
      
      (Also delete port/sha1_* which were removed upstream some time ago.)
      
      
      
      git-svn-id: http://leveldb.googlecode.com/svn/trunk@51 62dab493-f737-651d-591e-8d6aee1b9529
      ---
       Makefile                      |  12 +-
       helpers/memenv/memenv.cc      | 374 ++++++++++++++++++++++++++++++++++++++++++
       helpers/memenv/memenv.h       |  20 +++
       helpers/memenv/memenv_test.cc | 232 ++++++++++++++++++++++++++
       include/leveldb/env.h         |   4 +-
       include/leveldb/slice.h       |   4 +-
       util/cache.cc                 |   3 +-
       7 files changed, 642 insertions(+), 7 deletions(-)
       create mode 100644 helpers/memenv/memenv.cc
       create mode 100644 helpers/memenv/memenv.h
       create mode 100644 helpers/memenv/memenv_test.cc
      
      diff --git a/Makefile b/Makefile
      index 86e24e8..18386f2 100644
      --- a/Makefile
      +++ b/Makefile
      @@ -92,6 +92,7 @@ TESTS = \
       	env_test \
       	filename_test \
       	log_test \
      +	memenv_test \
       	skiplist_test \
       	table_test \
       	version_edit_test \
      @@ -102,6 +103,7 @@ PROGRAMS = db_bench $(TESTS)
       BENCHMARKS = db_bench_sqlite3 db_bench_tree_db
       
       LIBRARY = libleveldb.a
      +MEMENVLIBRARY = libmemenv.a
       
       all: $(LIBRARY)
       
      @@ -109,7 +111,7 @@ check: $(PROGRAMS) $(TESTS)
       	for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
       
       clean:
      -	-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) */*.o */*/*.o ios-x86/*/*.o ios-arm/*/*.o
      +	-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(MEMENVLIBRARY) */*.o */*/*.o ios-x86/*/*.o ios-arm/*/*.o
       	-rm -rf ios-x86/* ios-arm/*
       	-rm build_config.mk
       
      @@ -174,6 +176,13 @@ version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
       write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
       	$(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
       
      +$(MEMENVLIBRARY) : helpers/memenv/memenv.o
      +	rm -f $@
      +	$(AR) -rs $@ helpers/memenv/memenv.o
      +
      +memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
      +	$(CC) $(LDFLAGS) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@
      +
       ifeq ($(PLATFORM), IOS)
       # For iOS, create universal object files to be used on both the simulator and
       # a device.
      @@ -202,4 +211,3 @@ else
       .c.o:
       	$(CC) $(CFLAGS) $< -o $@
       endif
      -
      diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc
      new file mode 100644
      index 0000000..dab80fe
      --- /dev/null
      +++ b/helpers/memenv/memenv.cc
      @@ -0,0 +1,374 @@
      +// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
      +// Use of this source code is governed by a BSD-style license that can be
      +// found in the LICENSE file. See the AUTHORS file for names of contributors.
      +
      +#include "helpers/memenv/memenv.h"
      +
      +#include "leveldb/env.h"
      +#include "leveldb/status.h"
      +#include "port/port.h"
      +#include "util/mutexlock.h"
+#include <map>
+#include <string.h>
+#include <string>
+#include <vector>
      +
      +namespace leveldb {
      +
      +namespace {
      +
      +class FileState {
      + public:
      +  // FileStates are reference counted. The initial reference count is zero
      +  // and the caller must call Ref() at least once.
      +  FileState() : refs_(0), size_(0) {}
      +
      +  // Increase the reference count.
      +  void Ref() {
      +    MutexLock lock(&refs_mutex_);
      +    ++refs_;
      +  }
      +
      +  // Decrease the reference count. Delete if this is the last reference.
      +  void Unref() {
      +    bool do_delete = false;
      +
      +    {
      +      MutexLock lock(&refs_mutex_);
      +      --refs_;
      +      assert(refs_ >= 0);
      +      if (refs_ <= 0) {
      +        do_delete = true;
      +      }
      +    }
      +
      +    if (do_delete) {
      +      delete this;
      +    }
      +  }
      +
      +  uint64_t Size() const { return size_; }
      +
      +  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
      +    if (offset > size_) {
      +      return Status::IOError("Offset greater than file size.");
      +    }
      +    const uint64_t available = size_ - offset;
      +    if (n > available) {
      +      n = available;
      +    }
      +    if (n == 0) {
      +      *result = Slice();
      +      return Status::OK();
      +    }
      +
      +    size_t block = offset / kBlockSize;
      +    size_t block_offset = offset % kBlockSize;
      +
      +    if (n <= kBlockSize - block_offset) {
      +      // The requested bytes are all in the first block.
      +      *result = Slice(blocks_[block] + block_offset, n);
      +      return Status::OK();
      +    }
      +
      +    size_t bytes_to_copy = n;
      +    char* dst = scratch;
      +
      +    while (bytes_to_copy > 0) {
      +      size_t avail = kBlockSize - block_offset;
      +      if (avail > bytes_to_copy) {
      +        avail = bytes_to_copy;
      +      }
      +      memcpy(dst, blocks_[block] + block_offset, avail);
      +
      +      bytes_to_copy -= avail;
      +      dst += avail;
      +      block++;
      +      block_offset = 0;
      +    }
      +
      +    *result = Slice(scratch, n);
      +    return Status::OK();
      +  }
      +
      +  Status Append(const Slice& data) {
      +    const char* src = data.data();
      +    size_t src_len = data.size();
      +
      +    while (src_len > 0) {
      +      size_t avail;
      +      size_t offset = size_ % kBlockSize;
      +
      +      if (offset != 0) {
      +        // There is some room in the last block.
      +        avail = kBlockSize - offset;
      +      } else {
      +        // No room in the last block; push new one.
      +        blocks_.push_back(new char[kBlockSize]);
      +        avail = kBlockSize;
      +      }
      +
      +      if (avail > src_len) {
      +        avail = src_len;
      +      }
      +      memcpy(blocks_.back() + offset, src, avail);
      +      src_len -= avail;
      +      src += avail;
      +      size_ += avail;
      +    }
      +
      +    return Status::OK();
      +  }
      +
      + private:
      +  // Private since only Unref() should be used to delete it.
      +  ~FileState() {
+  for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
      +         ++i) {
      +      delete [] *i;
      +    }
      +  }
      +
      +  // No copying allowed.
      +  FileState(const FileState&);
      +  void operator=(const FileState&);
      +
      +  port::Mutex refs_mutex_;
      +  int refs_;  // Protected by refs_mutex_;
      +
      +  // The following fields are not protected by any mutex. They are only mutable
      +  // while the file is being written, and concurrent access is not allowed
      +  // to writable files.
+  std::vector<char*> blocks_;
      +  uint64_t size_;
      +
      +  enum { kBlockSize = 8 * 1024 };
      +};
      +
      +class SequentialFileImpl : public SequentialFile {
      + public:
      +  explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
      +    file_->Ref();
      +  }
      +
      +  ~SequentialFileImpl() {
      +    file_->Unref();
      +  }
      +
      +  virtual Status Read(size_t n, Slice* result, char* scratch) {
      +    Status s = file_->Read(pos_, n, result, scratch);
      +    if (s.ok()) {
      +      pos_ += result->size();
      +    }
      +    return s;
      +  }
      +
      +  virtual Status Skip(uint64_t n) {
      +    if (pos_ > file_->Size()) {
      +      return Status::IOError("pos_ > file_->Size()");
      +    }
      +    const size_t available = file_->Size() - pos_;
      +    if (n > available) {
      +      n = available;
      +    }
      +    pos_ += n;
      +    return Status::OK();
      +  }
      +
      + private:
      +  FileState* file_;
      +  size_t pos_;
      +};
      +
      +class RandomAccessFileImpl : public RandomAccessFile {
      + public:
      +  explicit RandomAccessFileImpl(FileState* file) : file_(file) {
      +    file_->Ref();
      +  }
      +
      +  ~RandomAccessFileImpl() {
      +    file_->Unref();
      +  }
      +
      +  virtual Status Read(uint64_t offset, size_t n, Slice* result,
      +                      char* scratch) const {
      +    return file_->Read(offset, n, result, scratch);
      +  }
      +
      + private:
      +  FileState* file_;
      +};
      +
      +class WritableFileImpl : public WritableFile {
      + public:
      +  WritableFileImpl(FileState* file) : file_(file) {
      +    file_->Ref();
      +  }
      +
      +  ~WritableFileImpl() {
      +    file_->Unref();
      +  }
      +
      +  virtual Status Append(const Slice& data) {
      +    return file_->Append(data);
      +  }
      +
      +  virtual Status Close() { return Status::OK(); }
      +  virtual Status Flush() { return Status::OK(); }
      +  virtual Status Sync() { return Status::OK(); }
      +
      + private:
      +  FileState* file_;
      +};
      +
      +class InMemoryEnv : public EnvWrapper {
      + public:
      +  explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
      +
      +  virtual ~InMemoryEnv() {
      +    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
      +      i->second->Unref();
      +    }
      +  }
      +
      +  // Partial implementation of the Env interface.
      +  virtual Status NewSequentialFile(const std::string& fname,
      +                                   SequentialFile** result) {
      +    MutexLock lock(&mutex_);
      +    if (file_map_.find(fname) == file_map_.end()) {
      +      *result = NULL;
      +      return Status::IOError(fname, "File not found");
      +    }
      +
      +    *result = new SequentialFileImpl(file_map_[fname]);
      +    return Status::OK();
      +  }
      +
      +  virtual Status NewRandomAccessFile(const std::string& fname,
      +                                     RandomAccessFile** result) {
      +    MutexLock lock(&mutex_);
      +    if (file_map_.find(fname) == file_map_.end()) {
      +      *result = NULL;
      +      return Status::IOError(fname, "File not found");
      +    }
      +
      +    *result = new RandomAccessFileImpl(file_map_[fname]);
      +    return Status::OK();
      +  }
      +
      +  virtual Status NewWritableFile(const std::string& fname,
      +                                 WritableFile** result) {
      +    MutexLock lock(&mutex_);
      +    if (file_map_.find(fname) != file_map_.end()) {
      +      DeleteFileInternal(fname);
      +    }
      +
      +    FileState* file = new FileState();
      +    file->Ref();
      +    file_map_[fname] = file;
      +
      +    *result = new WritableFileImpl(file);
      +    return Status::OK();
      +  }
      +
      +  virtual bool FileExists(const std::string& fname) {
      +    MutexLock lock(&mutex_);
      +    return file_map_.find(fname) != file_map_.end();
      +  }
      +
      +  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
      +    MutexLock lock(&mutex_);
      +    result->clear();
      +
      +    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
      +      const std::string& filename = i->first;
      +
      +      if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
      +          Slice(filename).starts_with(Slice(dir))) {
      +        result->push_back(filename.substr(dir.size() + 1));
      +      }
      +    }
      +
      +    return Status::OK();
      +  }
      +
      +  void DeleteFileInternal(const std::string& fname) {
      +    if (file_map_.find(fname) == file_map_.end()) {
      +      return;
      +    }
      +
      +    file_map_[fname]->Unref();
      +    file_map_.erase(fname);
      +  }
      +
      +  virtual Status DeleteFile(const std::string& fname) {
      +    MutexLock lock(&mutex_);
      +    if (file_map_.find(fname) == file_map_.end()) {
      +      return Status::IOError(fname, "File not found");
      +    }
      +
      +    DeleteFileInternal(fname);
      +    return Status::OK();
      +  }
      +
      +  virtual Status CreateDir(const std::string& dirname) {
      +    return Status::OK();
      +  }
      +
      +  virtual Status DeleteDir(const std::string& dirname) {
      +    return Status::OK();
      +  }
      +
      +  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
      +    MutexLock lock(&mutex_);
      +    if (file_map_.find(fname) == file_map_.end()) {
      +      return Status::IOError(fname, "File not found");
      +    }
      +
      +    *file_size = file_map_[fname]->Size();
      +    return Status::OK();
      +  }
      +
      +  virtual Status RenameFile(const std::string& src,
      +                            const std::string& target) {
      +    MutexLock lock(&mutex_);
      +    if (file_map_.find(src) == file_map_.end()) {
      +      return Status::IOError(src, "File not found");
      +    }
      +
      +    DeleteFileInternal(target);
      +    file_map_[target] = file_map_[src];
      +    file_map_.erase(src);
      +    return Status::OK();
      +  }
      +
      +  virtual Status LockFile(const std::string& fname, FileLock** lock) {
      +    *lock = new FileLock;
      +    return Status::OK();
      +  }
      +
      +  virtual Status UnlockFile(FileLock* lock) {
      +    delete lock;
      +    return Status::OK();
      +  }
      +
      +  virtual Status GetTestDirectory(std::string* path) {
      +    *path = "/test";
      +    return Status::OK();
      +  }
      +
      + private:
      +  // Map from filenames to FileState objects, representing a simple file system.
+  typedef std::map<std::string, FileState*> FileSystem;
      +  port::Mutex mutex_;
      +  FileSystem file_map_;  // Protected by mutex_.
      +};
      +
      +}
      +
      +Env* NewMemEnv(Env* base_env) {
      +  return new InMemoryEnv(base_env);
      +}
      +
      +}
      diff --git a/helpers/memenv/memenv.h b/helpers/memenv/memenv.h
      new file mode 100644
      index 0000000..835b944
      --- /dev/null
      +++ b/helpers/memenv/memenv.h
      @@ -0,0 +1,20 @@
      +// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
      +// Use of this source code is governed by a BSD-style license that can be
      +// found in the LICENSE file. See the AUTHORS file for names of contributors.
      +
      +#ifndef STORAGE_LEVELDB_HELPERS_MEMENV_MEMENV_H_
      +#define STORAGE_LEVELDB_HELPERS_MEMENV_MEMENV_H_
      +
      +namespace leveldb {
      +
      +class Env;
      +
      +// Returns a new environment that stores its data in memory and delegates
      +// all non-file-storage tasks to base_env. The caller must delete the result
      +// when it is no longer needed.
      +// *base_env must remain live while the result is in use.
      +Env* NewMemEnv(Env* base_env);
      +
      +}
      +
      +#endif  // STORAGE_LEVELDB_HELPERS_MEMENV_MEMENV_H_
      diff --git a/helpers/memenv/memenv_test.cc b/helpers/memenv/memenv_test.cc
      new file mode 100644
      index 0000000..30b0bb0
      --- /dev/null
      +++ b/helpers/memenv/memenv_test.cc
      @@ -0,0 +1,232 @@
      +// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
      +// Use of this source code is governed by a BSD-style license that can be
      +// found in the LICENSE file. See the AUTHORS file for names of contributors.
      +
      +#include "helpers/memenv/memenv.h"
      +
      +#include "db/db_impl.h"
      +#include "leveldb/db.h"
      +#include "leveldb/env.h"
      +#include "util/testharness.h"
+#include <string>
+#include <vector>
      +
      +namespace leveldb {
      +
      +class MemEnvTest {
      + public:
      +  Env* env_;
      +
      +  MemEnvTest()
      +      : env_(NewMemEnv(Env::Default())) {
      +  }
      +  ~MemEnvTest() {
      +    delete env_;
      +  }
      +};
      +
      +TEST(MemEnvTest, Basics) {
      +  size_t file_size;
      +  WritableFile* writable_file;
+  std::vector<std::string> children;
      +
      +  ASSERT_OK(env_->CreateDir("/dir"));
      +
      +  // Check that the directory is empty.
      +  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
      +  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
      +  ASSERT_OK(env_->GetChildren("/dir", &children));
      +  ASSERT_EQ(0, children.size());
      +
      +  // Create a file.
      +  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
      +  delete writable_file;
      +
      +  // Check that the file exists.
      +  ASSERT_TRUE(env_->FileExists("/dir/f"));
      +  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
      +  ASSERT_EQ(0, file_size);
      +  ASSERT_OK(env_->GetChildren("/dir", &children));
      +  ASSERT_EQ(1, children.size());
      +  ASSERT_EQ("f", children[0]);
      +
      +  // Write to the file.
      +  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
      +  ASSERT_OK(writable_file->Append("abc"));
      +  delete writable_file;
      +
      +  // Check for expected size.
      +  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
      +  ASSERT_EQ(3, file_size);
      +
      +  // Check that renaming works.
      +  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
      +  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
      +  ASSERT_TRUE(!env_->FileExists("/dir/f"));
      +  ASSERT_TRUE(env_->FileExists("/dir/g"));
      +  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
      +  ASSERT_EQ(3, file_size);
      +
      +  // Check that opening non-existent file fails.
      +  SequentialFile* seq_file;
      +  RandomAccessFile* rand_file;
      +  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file).ok());
      +  ASSERT_TRUE(!seq_file);
      +  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file).ok());
      +  ASSERT_TRUE(!rand_file);
      +
      +  // Check that deleting works.
      +  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
      +  ASSERT_OK(env_->DeleteFile("/dir/g"));
      +  ASSERT_TRUE(!env_->FileExists("/dir/g"));
      +  ASSERT_OK(env_->GetChildren("/dir", &children));
      +  ASSERT_EQ(0, children.size());
      +  ASSERT_OK(env_->DeleteDir("/dir"));
      +}
      +
      +TEST(MemEnvTest, ReadWrite) {
      +  WritableFile* writable_file;
      +  SequentialFile* seq_file;
      +  RandomAccessFile* rand_file;
      +  Slice result;
      +  char scratch[100];
      +
      +  ASSERT_OK(env_->CreateDir("/dir"));
      +
      +  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
      +  ASSERT_OK(writable_file->Append("hello "));
      +  ASSERT_OK(writable_file->Append("world"));
      +  delete writable_file;
      +
      +  // Read sequentially.
      +  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
      +  ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
      +  ASSERT_EQ(0, result.compare("hello"));
      +  ASSERT_OK(seq_file->Skip(1));
      +  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world".
      +  ASSERT_EQ(0, result.compare("world"));
      +  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF.
      +  ASSERT_EQ(0, result.size());
      +  ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file.
      +  ASSERT_OK(seq_file->Read(1000, &result, scratch));
      +  ASSERT_EQ(0, result.size());
      +  delete seq_file;
      +
      +  // Random reads.
      +  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file));
      +  ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
      +  ASSERT_EQ(0, result.compare("world"));
      +  ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
      +  ASSERT_EQ(0, result.compare("hello"));
      +  ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d".
      +  ASSERT_EQ(0, result.compare("d"));
      +
      +  // Too high offset.
      +  ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
      +  delete rand_file;
      +}
      +
      +TEST(MemEnvTest, Locks) {
      +  FileLock* lock;
      +
      +  // These are no-ops, but we test they return success.
      +  ASSERT_OK(env_->LockFile("some file", &lock));
      +  ASSERT_OK(env_->UnlockFile(lock));
      +}
      +
      +TEST(MemEnvTest, Misc) {
      +  std::string test_dir;
      +  ASSERT_OK(env_->GetTestDirectory(&test_dir));
      +  ASSERT_TRUE(!test_dir.empty());
      +
      +  WritableFile* writable_file;
      +  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file));
      +
      +  // These are no-ops, but we test they return success.
      +  ASSERT_OK(writable_file->Sync());
      +  ASSERT_OK(writable_file->Flush());
      +  ASSERT_OK(writable_file->Close());
      +  delete writable_file;
      +}
      +
      +TEST(MemEnvTest, LargeWrite) {
      +  const size_t kWriteSize = 300 * 1024;
      +  char* scratch = new char[kWriteSize * 2];
      +
      +  std::string write_data;
      +  for (size_t i = 0; i < kWriteSize; ++i) {
+    write_data.append(1, static_cast<char>(i));
      +  }
      +
      +  WritableFile* writable_file;
      +  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
      +  ASSERT_OK(writable_file->Append("foo"));
      +  ASSERT_OK(writable_file->Append(write_data));
      +  delete writable_file;
      +
      +  SequentialFile* seq_file;
      +  Slice result;
      +  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
      +  ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
      +  ASSERT_EQ(0, result.compare("foo"));
      +
      +  size_t read = 0;
      +  std::string read_data;
      +  while (read < kWriteSize) {
      +    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
      +    read_data.append(result.data(), result.size());
      +    read += result.size();
      +  }
      +  ASSERT_TRUE(write_data == read_data);
      +  delete seq_file;
      +  delete [] scratch;
      +}
      +
      +TEST(MemEnvTest, DBTest) {
      +  Options options;
      +  options.create_if_missing = true;
      +  options.env = env_;
      +  DB* db;
      +
      +  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
      +  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
      +
      +  ASSERT_OK(DB::Open(options, "/dir/db", &db));
      +  for (size_t i = 0; i < 3; ++i) {
      +    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
      +  }
      +
      +  for (size_t i = 0; i < 3; ++i) {
      +    std::string res;
      +    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
      +    ASSERT_TRUE(res == vals[i]);
      +  }
      +
      +  Iterator* iterator = db->NewIterator(ReadOptions());
      +  iterator->SeekToFirst();
      +  for (size_t i = 0; i < 3; ++i) {
      +    ASSERT_TRUE(iterator->Valid());
      +    ASSERT_TRUE(keys[i] == iterator->key());
      +    ASSERT_TRUE(vals[i] == iterator->value());
      +    iterator->Next();
      +  }
      +  ASSERT_TRUE(!iterator->Valid());
      +  delete iterator;
      +
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
      +  ASSERT_OK(dbi->TEST_CompactMemTable());
      +
      +  for (size_t i = 0; i < 3; ++i) {
      +    std::string res;
      +    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
      +    ASSERT_TRUE(res == vals[i]);
      +  }
      +
      +  delete db;
      +}
      +
      +}
      +
      +int main(int argc, char** argv) {
      +  return leveldb::test::RunAllTests();
      +}
      diff --git a/include/leveldb/env.h b/include/leveldb/env.h
      index bf51008..1a8ff6b 100644
      --- a/include/leveldb/env.h
      +++ b/include/leveldb/env.h
      @@ -258,8 +258,8 @@ extern Status ReadFileToString(Env* env, const std::string& fname,
       // functionality of another Env.
       class EnvWrapper : public Env {
        public:
      -  // Initialize an EnvWrapper that delegates all calls to *target
      -  explicit EnvWrapper(Env* target) : target_(target) { }
      +  // Initialize an EnvWrapper that delegates all calls to *t
      +  explicit EnvWrapper(Env* t) : target_(t) { }
         virtual ~EnvWrapper();
       
         // Return the target to which this Env forwards all calls
      diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h
      index 3c000b8..7c66d1b 100644
      --- a/include/leveldb/slice.h
      +++ b/include/leveldb/slice.h
      @@ -27,8 +27,8 @@ class Slice {
         // Create an empty slice.
         Slice() : data_(""), size_(0) { }
       
      -  // Create a slice that refers to data[0,n-1].
      -  Slice(const char* data, size_t n) : data_(data), size_(n) { }
      +  // Create a slice that refers to d[0,n-1].
      +  Slice(const char* d, size_t n) : data_(d), size_(n) { }
       
         // Create a slice that refers to the contents of "s"
         Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
      diff --git a/util/cache.cc b/util/cache.cc
      index ce99f08..7d1ebc1 100644
      --- a/util/cache.cc
      +++ b/util/cache.cc
      @@ -285,7 +285,8 @@ class ShardedLRUCache : public Cache {
         }
       
        public:
      -  explicit ShardedLRUCache(size_t capacity) {
      +  explicit ShardedLRUCache(size_t capacity)
      +      : last_id_(0) {
           const size_t per_shard = (capacity + (kNumShards - 1)) / kNumShards;
           for (int s = 0; s < kNumShards; s++) {
             shard_[s].SetCapacity(per_shard);
      -- 
      cgit v1.2.3
      
      
      From 72c3721b1551324c540447079dfc3ac65c6cb236 Mon Sep 17 00:00:00 2001
      From: "hans@chromium.org"
       
      Date: Mon, 26 Sep 2011 16:46:40 +0000
      Subject: Sync with upstream @24213649.
      
      Adding GNU/kFreeBSD support. As requested here:
      http://code.google.com/p/leveldb/issues/detail?id=38
      
      Use uint64_t instead of size_t in MemEnvTest. As pointed out at
      http://code.google.com/p/leveldb/issues/detail?id=41
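
[Note: the MemEnvTest fix matters because Env::GetFileSize() reports through a
uint64_t*, so a size_t local only compiles where the two types happen to
coincide. A small sketch of the mismatch:

    #include <stdint.h>
    #include "leveldb/env.h"

    void ReadSize(leveldb::Env* env) {
      // size_t narrow;
      // env->GetFileSize("/dir/f", &narrow);  // error on 32-bit platforms:
      //                                       // size_t* is not uint64_t*
      uint64_t size;                           // matches the Env interface
      env->GetFileSize("/dir/f", &size);
    }
]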
      
      
      git-svn-id: http://leveldb.googlecode.com/svn/trunk@52 62dab493-f737-651d-591e-8d6aee1b9529
      ---
       build_detect_platform         |   5 +
       helpers/memenv/memenv_test.cc |   2 +-
       port/sha1_portable.cc         | 298 ------------------------------------------
       port/sha1_portable.h          |  25 ----
       port/sha1_test.cc             |  39 ------
       5 files changed, 6 insertions(+), 363 deletions(-)
       delete mode 100644 port/sha1_portable.cc
       delete mode 100644 port/sha1_portable.h
       delete mode 100644 port/sha1_test.cc
      
      diff --git a/build_detect_platform b/build_detect_platform
      index d1804e0..7f0df31 100644
      --- a/build_detect_platform
      +++ b/build_detect_platform
      @@ -35,6 +35,11 @@ case `uname -s` in
               echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_FREEBSD"  >> build_config.mk
               echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk
               ;;
      +    GNU/kFreeBSD)
      +        PLATFORM=OS_FREEBSD
      +        echo "PLATFORM_CFLAGS=-pthread -DOS_FREEBSD"  >> build_config.mk
      +        echo "PLATFORM_LDFLAGS=-lpthread -lrt" >> build_config.mk
      +        ;;
           *)
               echo "Unknown platform!"
               exit 1
      diff --git a/helpers/memenv/memenv_test.cc b/helpers/memenv/memenv_test.cc
      index 30b0bb0..3791dc3 100644
      --- a/helpers/memenv/memenv_test.cc
      +++ b/helpers/memenv/memenv_test.cc
      @@ -26,7 +26,7 @@ class MemEnvTest {
       };
       
       TEST(MemEnvTest, Basics) {
      -  size_t file_size;
      +  uint64_t file_size;
         WritableFile* writable_file;
   std::vector<std::string> children;
       
      diff --git a/port/sha1_portable.cc b/port/sha1_portable.cc
      deleted file mode 100644
      index 8fa7277..0000000
      --- a/port/sha1_portable.cc
      +++ /dev/null
      @@ -1,298 +0,0 @@
      -// Portions copyright (c) 2011 The LevelDB Authors. All rights reserved.
      -// Use of this source code is governed by a BSD-style license that can be
      -// found in the LICENSE file. See the AUTHORS file for names of contributors.
      -//
      -// This module provides a slow but portable implementation of
      -// the SHA1 hash function.
      -//
      -// It is adapted from free code written by Paul E. Jones
-// <paulej@packetizer.com>.  See http://www.packetizer.com/security/sha1/
      -//
      -// The license for the original code is:
      -/*
      -  Copyright (C) 1998, 2009
-  Paul E. Jones <paulej@packetizer.com>
      -
      -  Freeware Public License (FPL)
      -
      -  This software is licensed as "freeware."  Permission to distribute
      -  this software in source and binary forms, including incorporation
      -  into other products, is hereby granted without a fee.  THIS SOFTWARE
      -  IS PROVIDED 'AS IS' AND WITHOUT ANY EXPRESSED OR IMPLIED WARRANTIES,
      -  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      -  AND FITNESS FOR A PARTICULAR PURPOSE.  THE AUTHOR SHALL NOT BE HELD
      -  LIABLE FOR ANY DAMAGES RESULTING FROM THE USE OF THIS SOFTWARE, EITHER
      -  DIRECTLY OR INDIRECTLY, INCLUDING, BUT NOT LIMITED TO, LOSS OF DATA
      -  OR DATA BEING RENDERED INACCURATE.
      -*/
      -
      -#include "port/sha1_portable.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
      -
      -namespace leveldb {
      -namespace port {
      -
      -/*
      - *  Description:
      - *      This class implements the Secure Hashing Standard as defined
      - *      in FIPS PUB 180-1 published April 17, 1995.
      - */
      -
      -/*
      - *  This structure will hold context information for the hashing
      - *  operation
      - */
      -typedef struct SHA1Context {
      -  unsigned Message_Digest[5]; /* Message Digest (output)          */
      -
      -  unsigned Length_Low;        /* Message length in bits           */
      -  unsigned Length_High;       /* Message length in bits           */
      -
      -  unsigned char Message_Block[64]; /* 512-bit message blocks      */
      -  int Message_Block_Index;    /* Index into message block array   */
      -
      -  bool Computed;               /* Is the digest computed?          */
      -  bool Corrupted;              /* Is the message digest corruped?  */
      -} SHA1Context;
      -
      -/*
      - *  Portability Issues:
      - *      SHA-1 is defined in terms of 32-bit "words".  This code was
      - *      written with the expectation that the processor has at least
      - *      a 32-bit machine word size.  If the machine word size is larger,
      - *      the code should still function properly.  One caveat to that
      - *      is that the input functions taking characters and character
      - *      arrays assume that only 8 bits of information are stored in each
      - *      character.
      - */
      -
      -/*
      - *  Define the circular shift macro
      - */
      -#define SHA1CircularShift(bits,word) \
      -                ((((word) << (bits)) & 0xFFFFFFFF) | \
      -                ((word) >> (32-(bits))))
      -
      -/* Function prototypes */
      -static void SHA1ProcessMessageBlock(SHA1Context *);
      -static void SHA1PadMessage(SHA1Context *);
      -
      -// Initialize the SHA1Context in preparation for computing a new
      -// message digest.
      -static void SHA1Reset(SHA1Context* context) {
      -  context->Length_Low             = 0;
      -  context->Length_High            = 0;
      -  context->Message_Block_Index    = 0;
      -
      -  context->Message_Digest[0]      = 0x67452301;
      -  context->Message_Digest[1]      = 0xEFCDAB89;
      -  context->Message_Digest[2]      = 0x98BADCFE;
      -  context->Message_Digest[3]      = 0x10325476;
      -  context->Message_Digest[4]      = 0xC3D2E1F0;
      -
      -  context->Computed   = false;
      -  context->Corrupted  = false;
      -}
      -
      -// This function will return the 160-bit message digest into the
      -// Message_Digest array within the SHA1Context provided
      -static bool SHA1Result(SHA1Context *context) {
      -  if (context->Corrupted) {
      -    return false;
      -  }
      -
      -  if (!context->Computed) {
      -    SHA1PadMessage(context);
      -    context->Computed = true;
      -  }
      -  return true;
      -}
      -
      -// This function accepts an array of bytes as the next portion of
      -// the message.
      -static void SHA1Input(SHA1Context         *context,
      -                      const unsigned char *message_array,
      -                      unsigned            length) {
      -  if (!length) return;
      -
      -  if (context->Computed || context->Corrupted) {
      -    context->Corrupted = true;
      -    return;
      -  }
      -
      -  while(length-- && !context->Corrupted) {
      -    context->Message_Block[context->Message_Block_Index++] =
      -        (*message_array & 0xFF);
      -
      -    context->Length_Low += 8;
      -    /* Force it to 32 bits */
      -    context->Length_Low &= 0xFFFFFFFF;
      -    if (context->Length_Low == 0) {
      -      context->Length_High++;
      -      /* Force it to 32 bits */
      -      context->Length_High &= 0xFFFFFFFF;
      -      if (context->Length_High == 0)
      -      {
      -        /* Message is too long */
      -        context->Corrupted = true;
      -      }
      -    }
      -
      -    if (context->Message_Block_Index == 64)
      -    {
      -      SHA1ProcessMessageBlock(context);
      -    }
      -
      -    message_array++;
      -  }
      -}
      -
      -// This function will process the next 512 bits of the message stored
      -// in the Message_Block array.
      -static void SHA1ProcessMessageBlock(SHA1Context *context) {
      -  const unsigned K[] =            // Constants defined in SHA-1
      -      {
      -        0x5A827999,
      -        0x6ED9EBA1,
      -        0x8F1BBCDC,
      -        0xCA62C1D6
      -      };
      -  int         t;                  // Loop counter
      -  unsigned    temp;               // Temporary word value
      -  unsigned    W[80];            // Word sequence
      -  unsigned    A, B, C, D, E;    // Word buffers
      -
      -  // Initialize the first 16 words in the array W
      -  for(t = 0; t < 16; t++) {
      -    W[t] = ((unsigned) context->Message_Block[t * 4]) << 24;
      -    W[t] |= ((unsigned) context->Message_Block[t * 4 + 1]) << 16;
      -    W[t] |= ((unsigned) context->Message_Block[t * 4 + 2]) << 8;
      -    W[t] |= ((unsigned) context->Message_Block[t * 4 + 3]);
      -  }
      -
      -  for(t = 16; t < 80; t++) {
      -    W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]);
      -  }
      -
      -  A = context->Message_Digest[0];
      -  B = context->Message_Digest[1];
      -  C = context->Message_Digest[2];
      -  D = context->Message_Digest[3];
      -  E = context->Message_Digest[4];
      -
      -  for(t = 0; t < 20; t++) {
      -    temp =  SHA1CircularShift(5,A) +
      -        ((B & C) | ((~B) & D)) + E + W[t] + K[0];
      -    temp &= 0xFFFFFFFF;
      -    E = D;
      -    D = C;
      -    C = SHA1CircularShift(30,B);
      -    B = A;
      -    A = temp;
      -  }
      -
      -  for(t = 20; t < 40; t++) {
      -    temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1];
      -    temp &= 0xFFFFFFFF;
      -    E = D;
      -    D = C;
      -    C = SHA1CircularShift(30,B);
      -    B = A;
      -    A = temp;
      -  }
      -
      -  for(t = 40; t < 60; t++) {
      -    temp = SHA1CircularShift(5,A) +
      -        ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2];
      -    temp &= 0xFFFFFFFF;
      -    E = D;
      -    D = C;
      -    C = SHA1CircularShift(30,B);
      -    B = A;
      -    A = temp;
      -  }
      -
      -  for(t = 60; t < 80; t++) {
      -    temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3];
      -    temp &= 0xFFFFFFFF;
      -    E = D;
      -    D = C;
      -    C = SHA1CircularShift(30,B);
      -    B = A;
      -    A = temp;
      -  }
      -
      -  context->Message_Digest[0] = (context->Message_Digest[0] + A) & 0xFFFFFFFF;
      -  context->Message_Digest[1] = (context->Message_Digest[1] + B) & 0xFFFFFFFF;
      -  context->Message_Digest[2] = (context->Message_Digest[2] + C) & 0xFFFFFFFF;
      -  context->Message_Digest[3] = (context->Message_Digest[3] + D) & 0xFFFFFFFF;
      -  context->Message_Digest[4] = (context->Message_Digest[4] + E) & 0xFFFFFFFF;
      -
      -  context->Message_Block_Index = 0;
      -}
      -
      -// According to the standard, the message must be padded to an even
      -// 512 bits.  The first padding bit must be a '1'.  The last 64 bits
      -// represent the length of the original message.  All bits in between
      -// should be 0.  This function will pad the message according to those
      -// rules by filling the Message_Block array accordingly.  It will also
      -// call SHA1ProcessMessageBlock() appropriately.  When it returns, it
      -// can be assumed that the message digest has been computed.
      -static void SHA1PadMessage(SHA1Context *context) {
      -  // Check to see if the current message block is too small to hold
      -  // the initial padding bits and length.  If so, we will pad the
      -  // block, process it, and then continue padding into a second block.
      -  if (context->Message_Block_Index > 55) {
      -    context->Message_Block[context->Message_Block_Index++] = 0x80;
      -    while(context->Message_Block_Index < 64) {
      -      context->Message_Block[context->Message_Block_Index++] = 0;
      -    }
      -
      -    SHA1ProcessMessageBlock(context);
      -
      -    while(context->Message_Block_Index < 56) {
      -      context->Message_Block[context->Message_Block_Index++] = 0;
      -    }
      -  } else {
      -    context->Message_Block[context->Message_Block_Index++] = 0x80;
      -    while(context->Message_Block_Index < 56) {
      -      context->Message_Block[context->Message_Block_Index++] = 0;
      -    }
      -  }
      -
      -  // Store the message length as the last 8 octets
      -  context->Message_Block[56] = (context->Length_High >> 24) & 0xFF;
      -  context->Message_Block[57] = (context->Length_High >> 16) & 0xFF;
      -  context->Message_Block[58] = (context->Length_High >> 8) & 0xFF;
      -  context->Message_Block[59] = (context->Length_High) & 0xFF;
      -  context->Message_Block[60] = (context->Length_Low >> 24) & 0xFF;
      -  context->Message_Block[61] = (context->Length_Low >> 16) & 0xFF;
      -  context->Message_Block[62] = (context->Length_Low >> 8) & 0xFF;
      -  context->Message_Block[63] = (context->Length_Low) & 0xFF;
      -
      -  SHA1ProcessMessageBlock(context);
      -}
      -
      -
      -void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array) {
      -  SHA1Context context;
      -  SHA1Reset(&context);
-  SHA1Input(&context, reinterpret_cast<const unsigned char*>(data), len);
      -  bool ok = SHA1Result(&context);
      -  if (!ok) {
      -    fprintf(stderr, "Unexpected error in SHA1_Hash_Portable code\n");
      -    exit(1);
      -  }
      -  for (int i = 0; i < 5; i++) {
      -    uint32_t value = context.Message_Digest[i];
      -    hash_array[i*4 + 0] = (value >> 24) & 0xff;
      -    hash_array[i*4 + 1] = (value >> 16) & 0xff;
      -    hash_array[i*4 + 2] = (value >> 8) & 0xff;
      -    hash_array[i*4 + 3] = value & 0xff;
      -  }
      -}
      -
      -}
      -}
      diff --git a/port/sha1_portable.h b/port/sha1_portable.h
      deleted file mode 100644
      index 31db305..0000000
      --- a/port/sha1_portable.h
      +++ /dev/null
      @@ -1,25 +0,0 @@
      -// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
      -// Use of this source code is governed by a BSD-style license that can be
      -// found in the LICENSE file. See the AUTHORS file for names of contributors.
      -
      -#ifndef STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
      -#define STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
      -
-#include <stddef.h>
      -
      -namespace leveldb {
      -namespace port {
      -
      -// Compute the SHA1 hash value of "data[0..len-1]" and store it in
      -// "hash_array[0..19]".  hash_array must have 20 bytes of space available.
      -//
      -// This function is portable but may not be as fast as a version
      -// optimized for your platform.  It is provided as a default method
      -// that can be used when porting leveldb to a new platform if no
      -// better SHA1 hash implementation is available.
      -void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array);
      -
      -}
      -}
      -
      -#endif  // STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
      diff --git a/port/sha1_test.cc b/port/sha1_test.cc
      deleted file mode 100644
      index b182e67..0000000
      --- a/port/sha1_test.cc
      +++ /dev/null
      @@ -1,39 +0,0 @@
      -// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
      -// Use of this source code is governed by a BSD-style license that can be
      -// found in the LICENSE file. See the AUTHORS file for names of contributors.
      -
      -#include "port/port.h"
      -#include "util/testharness.h"
      -
      -namespace leveldb {
      -namespace port {
      -
      -class SHA1 { };
      -
      -static std::string TestSHA1(const char* data, size_t len) {
      -  char hash_val[20];
      -  SHA1_Hash(data, len, hash_val);
      -  char buf[41];
      -  for (int i = 0; i < 20; i++) {
      -    snprintf(buf + i * 2, 41 - i * 2,
      -             "%02x",
-             static_cast<unsigned int>(static_cast<unsigned char>(
      -                 hash_val[i])));
      -  }
      -  return std::string(buf, 40);
      -}
      -
      -TEST(SHA1, Simple) {
      -  ASSERT_EQ("da39a3ee5e6b4b0d3255bfef95601890afd80709", TestSHA1("", 0));
      -  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", TestSHA1("hello", 5));
      -  std::string x(10000, 'x');
      -  ASSERT_EQ("f8c5cde791c5056cf515881e701c8a9ecb439a75",
      -            TestSHA1(x.data(), x.size()));
      -}
      -
      -}
      -}
      -
      -int main(int argc, char** argv) {
      -  return leveldb::test::RunAllTests();
      -}
      -- 
      cgit v1.2.3
      
      
      From 5fb21ed7ac9e91010d473ac77e132ae68f348d6a Mon Sep 17 00:00:00 2001
      From: "gabor@google.com"
       
      Date: Wed, 5 Oct 2011 23:34:24 +0000
      Subject: A number of bugfixes:
      
- Added DB::CompactRange() method; a usage sketch follows this message.
      
        Changed manual compaction code so it breaks up compactions of
        big ranges into smaller compactions.
      
        Changed the code that pushes the output of memtable compactions
        to higher levels to obey the grandparent constraint: i.e., we
        must never have a single file in level L that overlaps too
        much data in level L+1 (to avoid very expensive L-1 compactions).
      
        Added code to pretty-print internal keys.
      
      - Fixed bug where we would not detect overlap with files in
        level-0 because we were incorrectly using binary search
        on an array of files with overlapping ranges.
      
        Added "leveldb.sstables" property that can be used to dump
        all of the sstables and ranges that make up the db state.
      
      - Removing post_write_snapshot support.  Email to leveldb mailing
        list brought up no users, just confusion from one person about
        what it meant.
      
- Fixing static_cast<char> to unsigned on BIG_ENDIAN platforms.

  Fixes Issue 35 and Issue 36.
      
      - Comment clarification to address leveldb Issue 37.
      
      - Change license in posix_logger.h to match other files.
      
      - A build problem where uint32 was used instead of uint32_t.
      
      Sync with upstream @24408625
      
      
      
      git-svn-id: http://leveldb.googlecode.com/svn/trunk@53 62dab493-f737-651d-591e-8d6aee1b9529
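
[Note: with the new API, callers pass Slice pointers and NULL means
open-ended, so a full-database compaction no longer needs the per-level loop
that db_bench and db_test carried before this patch. A usage sketch:

    #include "leveldb/db.h"

    void CompactExamples(leveldb::DB* db) {
      leveldb::Slice begin("a"), end("m");
      db->CompactRange(&begin, &end);  // compact only keys in ["a", "m"]
      db->CompactRange(NULL, NULL);    // NULL bound = open-ended: whole key space
    }
]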
      ---
       build_detect_platform     |   5 --
       db/corruption_test.cc     |   6 +-
       db/db_bench.cc            |  15 +---
       db/db_impl.cc             |  98 +++++++++++++++-------
       db/db_impl.h              |  14 ++--
       db/db_test.cc             | 159 ++++++++++++++++++++++++++++-------
       db/dbformat.cc            |  12 +++
       db/dbformat.h             |   2 +
       db/version_edit.cc        |  14 ++--
       db/version_set.cc         | 205 ++++++++++++++++++++++++++++++++--------------
       db/version_set.h          |  40 +++++----
       db/version_set_test.cc    |  57 ++++++++++++-
       doc/index.html            |  22 +----
       include/leveldb/db.h      |  15 +++-
       include/leveldb/env.h     |   8 +-
       include/leveldb/options.h |  15 +---
       util/coding.h             |   8 +-
       util/posix_logger.h       |   5 +-
       18 files changed, 483 insertions(+), 217 deletions(-)
      
      diff --git a/build_detect_platform b/build_detect_platform
      index 7f0df31..d1804e0 100644
      --- a/build_detect_platform
      +++ b/build_detect_platform
      @@ -35,11 +35,6 @@ case `uname -s` in
               echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_FREEBSD"  >> build_config.mk
               echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk
               ;;
      -    GNU/kFreeBSD)
      -        PLATFORM=OS_FREEBSD
      -        echo "PLATFORM_CFLAGS=-pthread -DOS_FREEBSD"  >> build_config.mk
      -        echo "PLATFORM_LDFLAGS=-lpthread -lrt" >> build_config.mk
      -        ;;
           *)
               echo "Unknown platform!"
               exit 1
      diff --git a/db/corruption_test.cc b/db/corruption_test.cc
      index 69fa03a..1edcd84 100644
      --- a/db/corruption_test.cc
      +++ b/db/corruption_test.cc
      @@ -229,8 +229,8 @@ TEST(CorruptionTest, TableFile) {
         Build(100);
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
         dbi->TEST_CompactMemTable();
      -  dbi->TEST_CompactRange(0, "", "~");
      -  dbi->TEST_CompactRange(1, "", "~");
      +  dbi->TEST_CompactRange(0, NULL, NULL);
      +  dbi->TEST_CompactRange(1, NULL, NULL);
       
         Corrupt(kTableFile, 100, 1);
         Check(99, 99);
      @@ -278,7 +278,7 @@ TEST(CorruptionTest, CorruptedDescriptor) {
         ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
         dbi->TEST_CompactMemTable();
      -  dbi->TEST_CompactRange(0, "", "~");
      +  dbi->TEST_CompactRange(0, NULL, NULL);
       
         Corrupt(kDescriptorFile, 0, 1000);
         Status s = TryReopen();
      diff --git a/db/db_bench.cc b/db/db_bench.cc
      index bb63e59..cf9bb65 100644
      --- a/db/db_bench.cc
      +++ b/db/db_bench.cc
      @@ -796,20 +796,7 @@ class Benchmark {
         }
       
         void Compact(ThreadState* thread) {
-    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
      -    dbi->TEST_CompactMemTable();
      -    int max_level_with_files = 1;
      -    for (int level = 1; level < config::kNumLevels; level++) {
      -      std::string property;
      -      char name[100];
      -      snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
      -      if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) {
      -        max_level_with_files = level;
      -      }
      -    }
      -    for (int level = 0; level < max_level_with_files; level++) {
      -      dbi->TEST_CompactRange(level, "", "~");
      -    }
      +    db_->CompactRange(NULL, NULL);
         }
       
         void PrintStats() {
      diff --git a/db/db_impl.cc b/db/db_impl.cc
      index 0ca6386..56182a0 100644
      --- a/db/db_impl.cc
      +++ b/db/db_impl.cc
      @@ -454,13 +454,8 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
         if (s.ok() && meta.file_size > 0) {
           const Slice min_user_key = meta.smallest.user_key();
           const Slice max_user_key = meta.largest.user_key();
      -    if (base != NULL && !base->OverlapInLevel(0, min_user_key, max_user_key)) {
      -      // Push the new sstable to a higher level if possible to reduce
      -      // expensive manifest file ops.
      -      while (level < config::kMaxMemCompactLevel &&
      -             !base->OverlapInLevel(level + 1, min_user_key, max_user_key)) {
      -        level++;
      -      }
      +    if (base != NULL) {
      +      level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
           }
           edit->AddFile(level, meta.number, meta.file_size,
                         meta.smallest, meta.largest);
      @@ -506,25 +501,55 @@ Status DBImpl::CompactMemTable() {
         return s;
       }
       
      -void DBImpl::TEST_CompactRange(
      -    int level,
      -    const std::string& begin,
      -    const std::string& end) {
      +void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
      +  int max_level_with_files = 1;
      +  {
      +    MutexLock l(&mutex_);
      +    Version* base = versions_->current();
      +    for (int level = 1; level < config::kNumLevels; level++) {
      +      if (base->OverlapInLevel(level, begin, end)) {
      +        max_level_with_files = level;
      +      }
      +    }
      +  }
      +  TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
      +  for (int level = 0; level < max_level_with_files; level++) {
      +    TEST_CompactRange(level, begin, end);
      +  }
      +}
      +
      +void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
         assert(level >= 0);
         assert(level + 1 < config::kNumLevels);
       
      -  MutexLock l(&mutex_);
      -  while (manual_compaction_ != NULL) {
      -    bg_cv_.Wait();
      -  }
      +  InternalKey begin_storage, end_storage;
      +
         ManualCompaction manual;
         manual.level = level;
      -  manual.begin = begin;
      -  manual.end = end;
      -  manual_compaction_ = &manual;
      -  MaybeScheduleCompaction();
      -  while (manual_compaction_ == &manual) {
      -    bg_cv_.Wait();
      +  manual.done = false;
      +  if (begin == NULL) {
      +    manual.begin = NULL;
      +  } else {
      +    begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
      +    manual.begin = &begin_storage;
      +  }
      +  if (end == NULL) {
      +    manual.end = NULL;
      +  } else {
+    end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
      +    manual.end = &end_storage;
      +  }
      +
      +  MutexLock l(&mutex_);
      +  while (!manual.done) {
      +    while (manual_compaction_ != NULL) {
      +      bg_cv_.Wait();
      +    }
      +    manual_compaction_ = &manual;
      +    MaybeScheduleCompaction();
      +    while (manual_compaction_ == &manual) {
      +      bg_cv_.Wait();
      +    }
         }
       }
       
      @@ -590,12 +615,20 @@ void DBImpl::BackgroundCompaction() {
       
         Compaction* c;
         bool is_manual = (manual_compaction_ != NULL);
      +  InternalKey manual_end;
         if (is_manual) {
      -    const ManualCompaction* m = manual_compaction_;
      -    c = versions_->CompactRange(
      +    ManualCompaction* m = manual_compaction_;
      +    c = versions_->CompactRange(m->level, m->begin, m->end);
      +    m->done = (c == NULL);
      +    if (c != NULL) {
      +      manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
      +    }
      +    Log(options_.info_log,
      +        "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
               m->level,
      -        InternalKey(m->begin, kMaxSequenceNumber, kValueTypeForSeek),
-        InternalKey(m->end, 0, static_cast<ValueType>(0)));
      +        (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
      +        (m->end ? m->end->DebugString().c_str() : "(end)"),
      +        (m->done ? "(end)" : manual_end.DebugString().c_str()));
         } else {
           c = versions_->PickCompaction();
         }
      @@ -638,7 +671,13 @@ void DBImpl::BackgroundCompaction() {
         }
       
         if (is_manual) {
      -    // Mark it as done
      +    ManualCompaction* m = manual_compaction_;
      +    if (!m->done) {
      +      // We only compacted part of the requested range.  Update *m
      +      // to the range that is left to be compacted.
      +      m->tmp_storage = manual_end;
      +      m->begin = &m->tmp_storage;
      +    }
           manual_compaction_ = NULL;
         }
       }
      @@ -1109,10 +1148,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
       
           versions_->SetLastSequence(last_sequence);
         }
      -  if (options.post_write_snapshot != NULL) {
      -    *options.post_write_snapshot =
      -        status.ok() ? snapshots_.New(last_sequence) : NULL;
      -  }
         ReleaseLoggingResponsibility(&self);
         return status;
       }
      @@ -1225,6 +1260,9 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
             }
           }
           return true;
      +  } else if (in == "sstables") {
      +    *value = versions_->current()->DebugString();
      +    return true;
         }
       
         return false;
      diff --git a/db/db_impl.h b/db/db_impl.h
      index 5268137..ab03181 100644
      --- a/db/db_impl.h
      +++ b/db/db_impl.h
      @@ -38,14 +38,12 @@ class DBImpl : public DB {
         virtual void ReleaseSnapshot(const Snapshot* snapshot);
         virtual bool GetProperty(const Slice& property, std::string* value);
         virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
      +  virtual void CompactRange(const Slice* begin, const Slice* end);
       
         // Extra methods (for testing) that are not in the public DB interface
       
      -  // Compact any files in the named level that overlap [begin,end]
      -  void TEST_CompactRange(
      -      int level,
      -      const std::string& begin,
      -      const std::string& end);
      +  // Compact any files in the named level that overlap [*begin,*end]
      +  void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
       
         // Force current memtable contents to be compacted.
         Status TEST_CompactMemTable();
      @@ -145,8 +143,10 @@ class DBImpl : public DB {
         // Information for a manual compaction
         struct ManualCompaction {
           int level;
      -    std::string begin;
      -    std::string end;
      +    bool done;
      +    const InternalKey* begin;   // NULL means beginning of key range
      +    const InternalKey* end;     // NULL means end of key range
      +    InternalKey tmp_storage;    // Used to keep track of compaction progress
         };
         ManualCompaction* manual_compaction_;
       
      diff --git a/db/db_test.cc b/db/db_test.cc
      index daa9c03..ab71c51 100644
      --- a/db/db_test.cc
      +++ b/db/db_test.cc
      @@ -195,6 +195,23 @@ class DBTest {
           return result;
         }
       
      +  // Return spread of files per level
      +  std::string FilesPerLevel() {
      +    std::string result;
      +    int last_non_zero_offset = 0;
      +    for (int level = 0; level < config::kNumLevels; level++) {
      +      int f = NumTableFilesAtLevel(level);
      +      char buf[100];
      +      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
      +      result += buf;
      +      if (f > 0) {
      +        last_non_zero_offset = result.size();
      +      }
      +    }
      +    result.resize(last_non_zero_offset);
      +    return result;
      +  }
      +
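
FilesPerLevel() joins the per-level file counts with commas and trims trailing zero-only levels, so counts of {2,1,1,0,0,...} render as "2,1,1". A standalone copy of the formatting logic, runnable outside the test harness:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Same trimming logic as FilesPerLevel() above, over a plain vector.
    std::string FilesPerLevel(const std::vector<int>& counts) {
      std::string result;
      size_t last_non_zero_offset = 0;
      for (size_t level = 0; level < counts.size(); level++) {
        char buf[100];
        snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), counts[level]);
        result += buf;
        if (counts[level] > 0) {
          last_non_zero_offset = result.size();
        }
      }
      result.resize(last_non_zero_offset);
      return result;
    }

    int main() {
      std::vector<int> counts;
      counts.push_back(2); counts.push_back(1); counts.push_back(1);
      counts.push_back(0); counts.push_back(0);
      printf("%s\n", FilesPerLevel(counts).c_str());  // prints "2,1,1"
      return 0;
    }
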
         uint64_t Size(const Slice& start, const Slice& limit) {
           Range r(start, limit);
           uint64_t size;
      @@ -203,26 +220,23 @@ class DBTest {
         }
       
         void Compact(const Slice& start, const Slice& limit) {
      -    dbfull()->TEST_CompactMemTable();
      -    int max_level_with_files = 1;
      -    for (int level = 1; level < config::kNumLevels; level++) {
      -      if (NumTableFilesAtLevel(level) > 0) {
      -        max_level_with_files = level;
      -      }
      -    }
      -    for (int level = 0; level < max_level_with_files; level++) {
      -      dbfull()->TEST_CompactRange(level, "", "~");
      +    db_->CompactRange(&start, &limit);
      +  }
      +
      +  // Do n memtable compactions, each of which produces an sstable
      +  // covering the range [small,large].
      +  void MakeTables(int n, const std::string& small, const std::string& large) {
      +    for (int i = 0; i < n; i++) {
      +      Put(small, "begin");
      +      Put(large, "end");
      +      dbfull()->TEST_CompactMemTable();
           }
         }
       
         // Prevent pushing of new sstables into deeper levels by adding
         // tables that cover a specified range to all levels.
         void FillLevels(const std::string& smallest, const std::string& largest) {
      -    for (int level = 0; level < config::kNumLevels; level++) {
      -      Put(smallest, "begin");
      -      Put(largest, "end");
      -      dbfull()->TEST_CompactMemTable();
      -    }
      +    MakeTables(config::kNumLevels, smallest, largest);
         }
       
         void DumpFileCounts(const char* label) {
      @@ -238,6 +252,12 @@ class DBTest {
           }
         }
       
      +  std::string DumpSSTableList() {
      +    std::string property;
      +    db_->GetProperty("leveldb.sstables", &property);
      +    return property;
      +  }
      +
         std::string IterStatus(Iterator* iter) {
           std::string result;
           if (iter->Valid()) {
      @@ -367,7 +387,7 @@ TEST(DBTest, GetEncountersEmptyLevel) {
         }
       
         // Step 2: clear level 1 if necessary.
      -  dbfull()->TEST_CompactRange(1, "a", "z");
      +  dbfull()->TEST_CompactRange(1, NULL, NULL);
         ASSERT_EQ(NumTableFilesAtLevel(0), 1);
         ASSERT_EQ(NumTableFilesAtLevel(1), 0);
         ASSERT_EQ(NumTableFilesAtLevel(2), 1);
      @@ -693,7 +713,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
       
         // Reopening moves updates to level-0
         Reopen(&options);
      -  dbfull()->TEST_CompactRange(0, "", Key(100000));
      +  dbfull()->TEST_CompactRange(0, NULL, NULL);
       
         ASSERT_EQ(NumTableFilesAtLevel(0), 0);
         ASSERT_GT(NumTableFilesAtLevel(1), 1);
      @@ -744,7 +764,7 @@ TEST(DBTest, SparseMerge) {
         }
         Put("C", "vc");
         dbfull()->TEST_CompactMemTable();
      -  dbfull()->TEST_CompactRange(0, "A", "Z");
      +  dbfull()->TEST_CompactRange(0, NULL, NULL);
       
         // Make sparse update
         Put("A",    "va2");
      @@ -755,9 +775,9 @@ TEST(DBTest, SparseMerge) {
         // Compactions should not cause us to create a situation where
         // a file overlaps too much data at the next level.
         ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
      -  dbfull()->TEST_CompactRange(0, "", "z");
      +  dbfull()->TEST_CompactRange(0, NULL, NULL);
         ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
      -  dbfull()->TEST_CompactRange(1, "", "z");
      +  dbfull()->TEST_CompactRange(1, NULL, NULL);
         ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
       }
       
      @@ -808,9 +828,11 @@ TEST(DBTest, ApproximateSizes) {
             ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000));
             ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000));
       
      -      dbfull()->TEST_CompactRange(0,
      -                                  Key(compact_start),
      -                                  Key(compact_start + 9));
      +      std::string cstart_str = Key(compact_start);
      +      std::string cend_str = Key(compact_start + 9);
      +      Slice cstart = cstart_str;
      +      Slice cend = cend_str;
      +      dbfull()->TEST_CompactRange(0, &cstart, &cend);
           }
       
           ASSERT_EQ(NumTableFilesAtLevel(0), 0);
      @@ -850,7 +872,7 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
       
           ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
       
      -    dbfull()->TEST_CompactRange(0, Key(0), Key(100));
      +    dbfull()->TEST_CompactRange(0, NULL, NULL);
         }
       }
       
      @@ -921,11 +943,12 @@ TEST(DBTest, HiddenValuesAreRemoved) {
         ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
         db_->ReleaseSnapshot(snapshot);
         ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
      -  dbfull()->TEST_CompactRange(0, "", "x");
      +  Slice x("x");
      +  dbfull()->TEST_CompactRange(0, NULL, &x);
         ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
         ASSERT_EQ(NumTableFilesAtLevel(0), 0);
         ASSERT_GE(NumTableFilesAtLevel(1), 1);
      -  dbfull()->TEST_CompactRange(1, "", "x");
      +  dbfull()->TEST_CompactRange(1, NULL, &x);
         ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
       
         ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
      @@ -949,11 +972,12 @@ TEST(DBTest, DeletionMarkers1) {
         ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
         ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level last-2
         ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
      -  dbfull()->TEST_CompactRange(last-2, "", "z");
      +  Slice z("z");
      +  dbfull()->TEST_CompactRange(last-2, NULL, &z);
         // DEL eliminated, but v1 remains because we aren't compacting that level
         // (DEL can be eliminated because v2 hides v1).
         ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
      -  dbfull()->TEST_CompactRange(last-1, "", "z");
      +  dbfull()->TEST_CompactRange(last-1, NULL, NULL);
         // Merging last-1 w/ last, so we are the base level for "foo", so
         // DEL is removed.  (as is v1).
         ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
      @@ -976,15 +1000,54 @@ TEST(DBTest, DeletionMarkers2) {
         ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
         ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level last-2
         ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
      -  dbfull()->TEST_CompactRange(last-2, "", "z");
      +  dbfull()->TEST_CompactRange(last-2, NULL, NULL);
         // DEL kept: "last" file overlaps
         ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
      -  dbfull()->TEST_CompactRange(last-1, "", "z");
      +  dbfull()->TEST_CompactRange(last-1, NULL, NULL);
         // Merging last-1 w/ last, so we are the base level for "foo", so
         // DEL is removed.  (as is v1).
         ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
       }
       
      +TEST(DBTest, OverlapInLevel0) {
      +  ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
      +
      +  // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
      +  ASSERT_OK(Put("100", "v100"));
      +  ASSERT_OK(Put("999", "v999"));
      +  dbfull()->TEST_CompactMemTable();
      +  ASSERT_OK(Delete("100"));
      +  ASSERT_OK(Delete("999"));
      +  dbfull()->TEST_CompactMemTable();
      +  ASSERT_EQ("0,1,1", FilesPerLevel());
      +
      +  // Make files spanning the following ranges in level-0:
      +  //  files[0]  200 .. 900
      +  //  files[1]  300 .. 500
      +  // Note that files are sorted by smallest key.
      +  ASSERT_OK(Put("300", "v300"));
      +  ASSERT_OK(Put("500", "v500"));
      +  dbfull()->TEST_CompactMemTable();
      +  ASSERT_OK(Put("200", "v200"));
      +  ASSERT_OK(Put("600", "v600"));
      +  ASSERT_OK(Put("900", "v900"));
      +  dbfull()->TEST_CompactMemTable();
      +  ASSERT_EQ("2,1,1", FilesPerLevel());
      +
      +  // Compact away the placeholder files we created initially
      +  dbfull()->TEST_CompactRange(1, NULL, NULL);
      +  dbfull()->TEST_CompactRange(2, NULL, NULL);
      +  ASSERT_EQ("2", FilesPerLevel());
      +
      +  // Do a memtable compaction.  Before bug-fix, the compaction would
      +  // not detect the overlap with level-0 files and would incorrectly place
      +  // the deletion in a deeper level.
      +  ASSERT_OK(Delete("600"));
      +  dbfull()->TEST_CompactMemTable();
      +  ASSERT_EQ("3", FilesPerLevel());
      +  ASSERT_EQ("NOT_FOUND", Get("600"));
      +}
      +
       TEST(DBTest, ComparatorCheck) {
         class NewComparator : public Comparator {
          public:
      @@ -1008,6 +1071,40 @@ TEST(DBTest, ComparatorCheck) {
             << s.ToString();
       }
       
      +TEST(DBTest, ManualCompaction) {
      +  ASSERT_EQ(config::kMaxMemCompactLevel, 2)
      +      << "Need to update this test to match kMaxMemCompactLevel";
      +
      +  MakeTables(3, "p", "q");
      +  ASSERT_EQ("1,1,1", FilesPerLevel());
      +
      +  // Compaction range falls before files
      +  Compact("", "c");
      +  ASSERT_EQ("1,1,1", FilesPerLevel());
      +
      +  // Compaction range falls after files
      +  Compact("r", "z");
      +  ASSERT_EQ("1,1,1", FilesPerLevel());
      +
      +  // Compaction range overlaps files
      +  Compact("p1", "p9");
      +  ASSERT_EQ("0,0,1", FilesPerLevel());
      +
      +  // Populate a different range
      +  MakeTables(3, "c", "e");
      +  ASSERT_EQ("1,1,2", FilesPerLevel());
      +
      +  // Compact just the new range
      +  Compact("b", "f");
      +  ASSERT_EQ("0,0,2", FilesPerLevel());
      +
      +  // Compact all
      +  MakeTables(1, "a", "z");
      +  ASSERT_EQ("0,1,2", FilesPerLevel());
      +  db_->CompactRange(NULL, NULL);
      +  ASSERT_EQ("0,0,1", FilesPerLevel());
      +}
      +
       TEST(DBTest, DBOpen_Options) {
         std::string dbname = test::TmpDir() + "/db_options_test";
         DestroyDB(dbname, Options());
      @@ -1187,7 +1284,6 @@ class ModelDB: public DB {
           delete reinterpret_cast(snapshot);
         }
         virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
      -    assert(options.post_write_snapshot == NULL);   // Not supported
           class Handler : public WriteBatch::Handler {
            public:
             KVMap* map_;
      @@ -1211,6 +1307,9 @@ class ModelDB: public DB {
             sizes[i] = 0;
           }
         }
      +  virtual void CompactRange(const Slice* start, const Slice* end) {
      +  }
      +
        private:
         class ModelIter: public Iterator {
          public:
      diff --git a/db/dbformat.cc b/db/dbformat.cc
      index af2e077..4fb3531 100644
      --- a/db/dbformat.cc
      +++ b/db/dbformat.cc
      @@ -31,6 +31,18 @@ std::string ParsedInternalKey::DebugString() const {
         return result;
       }
       
      +std::string InternalKey::DebugString() const {
      +  std::string result;
      +  ParsedInternalKey parsed;
      +  if (ParseInternalKey(rep_, &parsed)) {
      +    result = parsed.DebugString();
      +  } else {
      +    result = "(bad)";
      +    result.append(EscapeString(rep_));
      +  }
      +  return result;
      +}
      +
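
A small usage sketch for the new InternalKey::DebugString() (illustrative; assumes this tree's headers, and the exact output is whatever ParsedInternalKey::DebugString() produces):

    #include <cstdio>
    #include "db/dbformat.h"

    int main() {
      // Construct an internal key for user key "foo" at sequence 100.
      leveldb::InternalKey key("foo", 100, leveldb::kTypeValue);
      fprintf(stderr, "%s\n", key.DebugString().c_str());
      return 0;
    }
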
       const char* InternalKeyComparator::Name() const {
         return "leveldb.InternalKeyComparator";
       }
      diff --git a/db/dbformat.h b/db/dbformat.h
      index 7344cbf..d046990 100644
      --- a/db/dbformat.h
      +++ b/db/dbformat.h
      @@ -149,6 +149,8 @@ class InternalKey {
         }
       
         void Clear() { rep_.clear(); }
      +
      +  std::string DebugString() const;
       };
       
       inline int InternalKeyComparator::Compare(
      diff --git a/db/version_edit.cc b/db/version_edit.cc
      index f6b9e9c..9891c32 100644
      --- a/db/version_edit.cc
      +++ b/db/version_edit.cc
      @@ -235,9 +235,8 @@ std::string VersionEdit::DebugString() const {
         for (size_t i = 0; i < compact_pointers_.size(); i++) {
           r.append("\n  CompactPointer: ");
           AppendNumberTo(&r, compact_pointers_[i].first);
      -    r.append(" '");
      -    AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode());
      -    r.append("'");
      +    r.append(" ");
      +    r.append(compact_pointers_[i].second.DebugString());
         }
         for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
              iter != deleted_files_.end();
      @@ -255,11 +254,10 @@ std::string VersionEdit::DebugString() const {
           AppendNumberTo(&r, f.number);
           r.append(" ");
           AppendNumberTo(&r, f.file_size);
      -    r.append(" '");
      -    AppendEscapedStringTo(&r, f.smallest.Encode());
      -    r.append("' .. '");
      -    AppendEscapedStringTo(&r, f.largest.Encode());
      -    r.append("'");
      +    r.append(" ");
      +    r.append(f.smallest.DebugString());
      +    r.append(" .. ");
      +    r.append(f.largest.DebugString());
         }
         r.append("\n}\n");
         return r;
      diff --git a/db/version_set.cc b/db/version_set.cc
      index d75b347..8b96af0 100644
      --- a/db/version_set.cc
      +++ b/db/version_set.cc
      @@ -41,6 +41,14 @@ static uint64_t MaxFileSizeForLevel(int level) {
         return kTargetFileSize;  // We could vary per level to reduce number of files?
       }
       
+static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
      +  int64_t sum = 0;
      +  for (size_t i = 0; i < files.size(); i++) {
      +    sum += files[i]->file_size;
      +  }
      +  return sum;
      +}
      +
       namespace {
 std::string IntSetToString(const std::set<uint64_t>& s) {
         std::string result = "{";
      @@ -96,17 +104,55 @@ int FindFile(const InternalKeyComparator& icmp,
         return right;
       }
       
      +static bool AfterFile(const Comparator* ucmp,
      +                      const Slice* user_key, const FileMetaData* f) {
      +  // NULL user_key occurs before all keys and is therefore never after *f
      +  return (user_key != NULL &&
      +          ucmp->Compare(*user_key, f->largest.user_key()) > 0);
      +}
      +
      +static bool BeforeFile(const Comparator* ucmp,
      +                       const Slice* user_key, const FileMetaData* f) {
      +  // NULL user_key occurs after all keys and is therefore never before *f
      +  return (user_key != NULL &&
      +          ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
      +}
      +
       bool SomeFileOverlapsRange(
           const InternalKeyComparator& icmp,
      +    bool disjoint_sorted_files,
-    const std::vector<FileMetaData*>& files,
      -    const Slice& smallest_user_key,
      -    const Slice& largest_user_key) {
      -  // Find the earliest possible internal key for smallest_user_key
      -  InternalKey small(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
      -  const uint32_t index = FindFile(icmp, files, small.Encode());
      -  return ((index < files.size()) &&
      -          icmp.user_comparator()->Compare(
      -              largest_user_key, files[index]->smallest.user_key()) >= 0);
      +    const Slice* smallest_user_key,
      +    const Slice* largest_user_key) {
      +  const Comparator* ucmp = icmp.user_comparator();
      +  if (!disjoint_sorted_files) {
      +    // Need to check against all files
      +    for (int i = 0; i < files.size(); i++) {
      +      const FileMetaData* f = files[i];
      +      if (AfterFile(ucmp, smallest_user_key, f) ||
      +          BeforeFile(ucmp, largest_user_key, f)) {
      +        // No overlap
      +      } else {
      +        return true;  // Overlap
      +      }
      +    }
      +    return false;
      +  }
      +
      +  // Binary search over file list
      +  uint32_t index = 0;
      +  if (smallest_user_key != NULL) {
      +    // Find the earliest possible internal key for smallest_user_key
      +    InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
      +    index = FindFile(icmp, files, small.Encode());
      +  }
      +
      +  if (index >= files.size()) {
      +    // beginning of range is after all files, so no overlap.
      +    return false;
      +  }
      +
      +  return !BeforeFile(ucmp, largest_user_key, files[index]);
       }
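
The NULL-bound convention above is easy to model in isolation: a NULL smallest key sorts before every file and a NULL largest key sorts after every file. A self-contained sketch (toy types, not the tree's FileMetaData):

    #include <cassert>
    #include <cstddef>
    #include <string>
    #include <vector>

    // Self-contained model of AfterFile/BeforeFile/SomeFileOverlapsRange:
    // a NULL begin sorts before all keys, a NULL end sorts after all keys.
    struct File { std::string start, limit; };

    static bool After(const std::string* key, const File& f) {
      return key != NULL && *key > f.limit;   // key lies past the file
    }
    static bool Before(const std::string* key, const File& f) {
      return key != NULL && *key < f.start;   // key lies ahead of the file
    }
    static bool Overlaps(const std::vector<File>& files,
                         const std::string* smallest,
                         const std::string* largest) {
      for (size_t i = 0; i < files.size(); i++) {
        if (!After(smallest, files[i]) && !Before(largest, files[i])) {
          return true;
        }
      }
      return false;
    }

    int main() {
      File f = {"c", "g"};
      std::vector<File> files(1, f);
      std::string a = "a", b = "b", z = "z";
      assert(!Overlaps(files, &a, &b));    // range ends before the file
      assert(Overlaps(files, &a, NULL));   // NULL end reaches past "g"
      assert(Overlaps(files, NULL, NULL)); // whole key space
      assert(!Overlaps(files, &z, NULL));  // range starts after the file
      return 0;
    }
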
       
       // An internal iterator.  For a given version/level pair, yields
      @@ -358,11 +404,64 @@ void Version::Unref() {
       }
       
       bool Version::OverlapInLevel(int level,
      -                             const Slice& smallest_user_key,
      -                             const Slice& largest_user_key) {
      -  return SomeFileOverlapsRange(vset_->icmp_, files_[level],
      -                               smallest_user_key,
      -                               largest_user_key);
      +                             const Slice* smallest_user_key,
      +                             const Slice* largest_user_key) {
      +  return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
      +                               smallest_user_key, largest_user_key);
      +}
      +
      +int Version::PickLevelForMemTableOutput(
      +    const Slice& smallest_user_key,
      +    const Slice& largest_user_key) {
      +  int level = 0;
      +  if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
      +    // Push to next level if there is no overlap in next level,
      +    // and the #bytes overlapping in the level after that are limited.
      +    InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
+    std::vector<FileMetaData*> overlaps;
      +    while (level < config::kMaxMemCompactLevel) {
      +      if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
      +        break;
      +      }
      +      GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
      +      const int64_t sum = TotalFileSize(overlaps);
      +      if (sum > kMaxGrandParentOverlapBytes) {
      +        break;
      +      }
      +      level++;
      +    }
      +  }
      +  return level;
      +}
      +
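
PickLevelForMemTableOutput() pushes a fresh memtable flush below level 0 while the next level has no overlap and the overlap at the level after that stays under a byte budget, capped at kMaxMemCompactLevel. A toy model of that rule (the byte budget here is a stand-in, not the tree's constant):

    #include <cstdio>
    #include <string>
    #include <vector>

    // Toy push-down rule: drop the flush to a deeper level while the next
    // level has no overlap and the grandparent overlap is under budget.
    struct File { std::string start, limit; long long size; };
    typedef std::vector<File> Level;

    static const int kMaxMemCompactLevel = 2;          // matches this tree
    static const long long kBudget = 20 * 1048576LL;   // stand-in budget

    static bool Overlaps(const Level& files,
                         const std::string& lo, const std::string& hi) {
      for (size_t i = 0; i < files.size(); i++) {
        if (!(hi < files[i].start || lo > files[i].limit)) return true;
      }
      return false;
    }
    static long long OverlapBytes(const Level& files,
                                  const std::string& lo, const std::string& hi) {
      long long sum = 0;
      for (size_t i = 0; i < files.size(); i++) {
        if (!(hi < files[i].start || lo > files[i].limit)) sum += files[i].size;
      }
      return sum;
    }
    static int PickLevel(const std::vector<Level>& levels,
                         const std::string& lo, const std::string& hi) {
      int level = 0;
      if (!Overlaps(levels[0], lo, hi)) {
        while (level < kMaxMemCompactLevel) {
          if (Overlaps(levels[level + 1], lo, hi)) break;
          if (OverlapBytes(levels[level + 2], lo, hi) > kBudget) break;
          level++;
        }
      }
      return level;
    }

    int main() {
      std::vector<Level> levels(7);
      File f = {"m", "z", 1048576};   // does not overlap [a,c]
      levels[2].push_back(f);
      printf("picked level %d\n", PickLevel(levels, "a", "c"));  // 2
      return 0;
    }
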
      +// Store in "*inputs" all files in "level" that overlap [begin,end]
      +void Version::GetOverlappingInputs(
      +    int level,
      +    const InternalKey* begin,
      +    const InternalKey* end,
+    std::vector<FileMetaData*>* inputs) {
      +  inputs->clear();
      +  Slice user_begin, user_end;
      +  if (begin != NULL) {
      +    user_begin = begin->user_key();
      +  }
      +  if (end != NULL) {
      +    user_end = end->user_key();
      +  }
      +  const Comparator* user_cmp = vset_->icmp_.user_comparator();
      +  for (size_t i = 0; i < files_[level].size(); i++) {
      +    FileMetaData* f = files_[level][i];
      +    if (begin != NULL &&
      +        user_cmp->Compare(f->largest.user_key(), user_begin) < 0) {
      +      // "f" is completely before specified range; skip it
      +    } else if (end != NULL &&
      +               user_cmp->Compare(f->smallest.user_key(), user_end) > 0) {
      +      // "f" is completely after specified range; skip it
      +    } else {
      +      inputs->push_back(f);
      +    }
      +  }
       }
       
       std::string Version::DebugString() const {
      @@ -381,11 +480,11 @@ std::string Version::DebugString() const {
             AppendNumberTo(&r, files[i]->number);
             r.push_back(':');
             AppendNumberTo(&r, files[i]->file_size);
      -      r.append("['");
      -      AppendEscapedStringTo(&r, files[i]->smallest.Encode());
      -      r.append("' .. '");
      -      AppendEscapedStringTo(&r, files[i]->largest.Encode());
      -      r.append("']\n");
      +      r.append("[");
      +      r.append(files[i]->smallest.DebugString());
      +      r.append(" .. ");
      +      r.append(files[i]->largest.DebugString());
      +      r.append("]\n");
           }
         }
         return r;
      @@ -540,8 +639,8 @@ class VersionSet::Builder {
                 const InternalKey& this_begin = v->files_[level][i]->smallest;
                 if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
                   fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
      -                    EscapeString(prev_end.Encode()).c_str(),
      -                    EscapeString(this_begin.Encode()).c_str());
      +                    prev_end.DebugString().c_str(),
      +                    this_begin.DebugString().c_str());
                   abort();
                 }
               }
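
The check above aborts when two files in the same level have overlapping key ranges. A standalone sketch of the invariant (toy types, illustrative only):

    #include <cstdio>
    #include <cstdlib>
    #include <string>
    #include <vector>

    // In levels > 0, file ranges must be sorted and pairwise disjoint.
    struct File { std::string smallest, largest; };

    static void CheckDisjoint(const std::vector<File>& files) {
      for (size_t i = 1; i < files.size(); i++) {
        if (files[i - 1].largest >= files[i].smallest) {
          fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
                  files[i - 1].largest.c_str(), files[i].smallest.c_str());
          abort();
        }
      }
    }

    int main() {
      std::vector<File> files;
      File a = {"a", "c"}, b = {"d", "f"};
      files.push_back(a);
      files.push_back(b);
      CheckDisjoint(files);  // passes: sorted and disjoint
      printf("ok\n");
      return 0;
    }
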
      @@ -814,14 +913,6 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) {
         }
       }
       
-static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
      -  int64_t sum = 0;
      -  for (size_t i = 0; i < files.size(); i++) {
      -    sum += files[i]->file_size;
      -  }
      -  return sum;
      -}
      -
       void VersionSet::Finalize(Version* v) {
         // Precomputed best level for next compaction
         int best_level = -1;
      @@ -967,7 +1058,8 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() {
         for (int level = 1; level < config::kNumLevels - 1; level++) {
           for (size_t i = 0; i < current_->files_[level].size(); i++) {
             const FileMetaData* f = current_->files_[level][i];
      -      GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps);
      +      current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest,
      +                                     &overlaps);
             const int64_t sum = TotalFileSize(overlaps);
             if (sum > result) {
               result = sum;
      @@ -977,27 +1069,6 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() {
         return result;
       }
       
      -// Store in "*inputs" all files in "level" that overlap [begin,end]
      -void VersionSet::GetOverlappingInputs(
      -    int level,
      -    const InternalKey& begin,
      -    const InternalKey& end,
-    std::vector<FileMetaData*>* inputs) {
      -  inputs->clear();
      -  Slice user_begin = begin.user_key();
      -  Slice user_end = end.user_key();
      -  const Comparator* user_cmp = icmp_.user_comparator();
      -  for (size_t i = 0; i < current_->files_[level].size(); i++) {
      -    FileMetaData* f = current_->files_[level][i];
      -    if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 ||
      -        user_cmp->Compare(f->smallest.user_key(), user_end) > 0) {
      -      // Either completely before or after range; skip it
      -    } else {
      -      inputs->push_back(f);
      -    }
      -  }
      -}
      -
       // Stores the minimal range that covers all entries in inputs in
       // *smallest, *largest.
       // REQUIRES: inputs is not empty
      @@ -1113,7 +1184,7 @@ Compaction* VersionSet::PickCompaction() {
           // Note that the next call will discard the file we placed in
           // c->inputs_[0] earlier and replace it with an overlapping set
           // which will include the picked file.
      -    GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]);
      +    current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
           assert(!c->inputs_[0].empty());
         }
       
      @@ -1127,7 +1198,7 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
         InternalKey smallest, largest;
         GetRange(c->inputs_[0], &smallest, &largest);
       
      -  GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]);
      +  current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);
       
         // Get entire range covered by compaction
         InternalKey all_start, all_limit;
      @@ -1137,12 +1208,13 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
         // changing the number of "level+1" files we pick up.
         if (!c->inputs_[1].empty()) {
    std::vector<FileMetaData*> expanded0;
      -    GetOverlappingInputs(level, all_start, all_limit, &expanded0);
      +    current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
           if (expanded0.size() > c->inputs_[0].size()) {
             InternalKey new_start, new_limit;
             GetRange(expanded0, &new_start, &new_limit);
      std::vector<FileMetaData*> expanded1;
      -      GetOverlappingInputs(level+1, new_start, new_limit, &expanded1);
      +      current_->GetOverlappingInputs(level+1, &new_start, &new_limit,
      +                                     &expanded1);
             if (expanded1.size() == c->inputs_[1].size()) {
               Log(options_->info_log,
                   "Expanding@%d %d+%d to %d+%d\n",
      @@ -1163,14 +1235,15 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
         // Compute the set of grandparent files that overlap this compaction
         // (parent == level+1; grandparent == level+2)
         if (level + 2 < config::kNumLevels) {
      -    GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_);
      +    current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
      +                                   &c->grandparents_);
         }
       
         if (false) {
           Log(options_->info_log, "Compacting %d '%s' .. '%s'",
               level,
      -        EscapeString(smallest.Encode()).c_str(),
      -        EscapeString(largest.Encode()).c_str());
      +        smallest.DebugString().c_str(),
      +        largest.DebugString().c_str());
         }
       
         // Update the place where we will do the next compaction for this level.
      @@ -1183,14 +1256,26 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
       
       Compaction* VersionSet::CompactRange(
           int level,
      -    const InternalKey& begin,
      -    const InternalKey& end) {
      +    const InternalKey* begin,
      +    const InternalKey* end) {
  std::vector<FileMetaData*> inputs;
      -  GetOverlappingInputs(level, begin, end, &inputs);
      +  current_->GetOverlappingInputs(level, begin, end, &inputs);
         if (inputs.empty()) {
           return NULL;
         }
       
      +  // Avoid compacting too much in one shot in case the range is large.
      +  const uint64_t limit = MaxFileSizeForLevel(level);
      +  uint64_t total = 0;
      +  for (int i = 0; i < inputs.size(); i++) {
      +    uint64_t s = inputs[i]->file_size;
      +    total += s;
      +    if (total >= limit) {
      +      inputs.resize(i + 1);
      +      break;
      +    }
      +  }
      +
         Compaction* c = new Compaction(level);
         c->input_version_ = current_;
         c->input_version_->Ref();
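
The loop just added caps how much one manual compaction bites off: files are accepted until their cumulative size reaches the per-level file-size limit, and the rest is left for later passes. A self-contained sketch of the same arithmetic (the limit value is a stand-in):

    #include <cstdio>
    #include <vector>

    // Stop accepting files once their cumulative size reaches the limit,
    // so a huge range is compacted over several passes.
    int main() {
      const long long limit = 2 * 1048576LL;  // stand-in for MaxFileSizeForLevel
      std::vector<long long> sizes(4, 1048576LL);
      size_t keep = sizes.size();
      long long total = 0;
      for (size_t i = 0; i < sizes.size(); i++) {
        total += sizes[i];
        if (total >= limit) { keep = i + 1; break; }
      }
      printf("compacting %d of %d files this pass\n",
             (int)keep, (int)sizes.size());  // 2 of 4
      return 0;
    }
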
      diff --git a/db/version_set.h b/db/version_set.h
      index 2dbd948..b866b2a 100644
      --- a/db/version_set.h
      +++ b/db/version_set.h
      @@ -43,12 +43,17 @@ extern int FindFile(const InternalKeyComparator& icmp,
                           const Slice& key);
       
       // Returns true iff some file in "files" overlaps the user key range
      -// [smallest,largest].
      +// [*smallest,*largest].
      +// smallest==NULL represents a key smaller than all keys in the DB.
+// largest==NULL represents a key larger than all keys in the DB.
      +// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
      +//           in sorted order.
       extern bool SomeFileOverlapsRange(
           const InternalKeyComparator& icmp,
      +    bool disjoint_sorted_files,
    const std::vector<FileMetaData*>& files,
      -    const Slice& smallest_user_key,
      -    const Slice& largest_user_key);
      +    const Slice* smallest_user_key,
      +    const Slice* largest_user_key);
       
       class Version {
        public:
      @@ -77,11 +82,24 @@ class Version {
         void Ref();
         void Unref();
       
      +  void GetOverlappingInputs(
      +      int level,
      +      const InternalKey* begin,         // NULL means before all keys
      +      const InternalKey* end,           // NULL means after all keys
+      std::vector<FileMetaData*>* inputs);
      +
         // Returns true iff some file in the specified level overlaps
      -  // some part of [smallest_user_key,largest_user_key].
      +  // some part of [*smallest_user_key,*largest_user_key].
      +  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key larger than all keys in the DB.
         bool OverlapInLevel(int level,
      -                      const Slice& smallest_user_key,
      -                      const Slice& largest_user_key);
      +                      const Slice* smallest_user_key,
      +                      const Slice* largest_user_key);
      +
      +  // Return the level at which we should place a new memtable compaction
      +  // result that covers the range [smallest_user_key,largest_user_key].
      +  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
      +                                 const Slice& largest_user_key);
       
         int NumFiles(int level) const { return files_[level].size(); }
       
      @@ -192,8 +210,8 @@ class VersionSet {
         // the result.
         Compaction* CompactRange(
             int level,
      -      const InternalKey& begin,
      -      const InternalKey& end);
      +      const InternalKey* begin,
      +      const InternalKey* end);
       
         // Return the maximum overlapping data (in bytes) at next level for any
         // file at a level >= 1.
      @@ -232,12 +250,6 @@ class VersionSet {
       
         void Finalize(Version* v);
       
      -  void GetOverlappingInputs(
      -      int level,
      -      const InternalKey& begin,
      -      const InternalKey& end,
-      std::vector<FileMetaData*>* inputs);
      -
  void GetRange(const std::vector<FileMetaData*>& inputs,
                       InternalKey* smallest,
                       InternalKey* largest);
      diff --git a/db/version_set_test.cc b/db/version_set_test.cc
      index ecfd62b..06f8bbd 100644
      --- a/db/version_set_test.cc
      +++ b/db/version_set_test.cc
      @@ -12,6 +12,9 @@ namespace leveldb {
       class FindFileTest {
        public:
  std::vector<FileMetaData*> files_;
      +  bool disjoint_sorted_files_;
      +
      +  FindFileTest() : disjoint_sorted_files_(true) { }
       
         ~FindFileTest() {
           for (int i = 0; i < files_.size(); i++) {
      @@ -37,13 +40,20 @@ class FindFileTest {
       
         bool Overlaps(const char* smallest, const char* largest) {
           InternalKeyComparator cmp(BytewiseComparator());
      -    return SomeFileOverlapsRange(cmp, files_, smallest, largest);
      +    Slice s(smallest != NULL ? smallest : "");
      +    Slice l(largest != NULL ? largest : "");
      +    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
      +                                 (smallest != NULL ? &s : NULL),
      +                                 (largest != NULL ? &l : NULL));
         }
       };
       
       TEST(FindFileTest, Empty) {
         ASSERT_EQ(0, Find("foo"));
         ASSERT_TRUE(! Overlaps("a", "z"));
      +  ASSERT_TRUE(! Overlaps(NULL, "z"));
      +  ASSERT_TRUE(! Overlaps("a", NULL));
      +  ASSERT_TRUE(! Overlaps(NULL, NULL));
       }
       
       TEST(FindFileTest, Single) {
      @@ -67,6 +77,13 @@ TEST(FindFileTest, Single) {
         ASSERT_TRUE(Overlaps("p1", "z"));
         ASSERT_TRUE(Overlaps("q", "q"));
         ASSERT_TRUE(Overlaps("q", "q1"));
      +
      +  ASSERT_TRUE(! Overlaps(NULL, "j"));
      +  ASSERT_TRUE(! Overlaps("r", NULL));
      +  ASSERT_TRUE(Overlaps(NULL, "p"));
      +  ASSERT_TRUE(Overlaps(NULL, "p1"));
      +  ASSERT_TRUE(Overlaps("q", NULL));
      +  ASSERT_TRUE(Overlaps(NULL, NULL));
       }
       
       
      @@ -108,6 +125,26 @@ TEST(FindFileTest, Multiple) {
         ASSERT_TRUE(Overlaps("450", "500"));
       }
       
      +TEST(FindFileTest, MultipleNullBoundaries) {
      +  Add("150", "200");
      +  Add("200", "250");
      +  Add("300", "350");
      +  Add("400", "450");
      +  ASSERT_TRUE(! Overlaps(NULL, "149"));
      +  ASSERT_TRUE(! Overlaps("451", NULL));
      +  ASSERT_TRUE(Overlaps(NULL, NULL));
      +  ASSERT_TRUE(Overlaps(NULL, "150"));
      +  ASSERT_TRUE(Overlaps(NULL, "199"));
      +  ASSERT_TRUE(Overlaps(NULL, "200"));
      +  ASSERT_TRUE(Overlaps(NULL, "201"));
      +  ASSERT_TRUE(Overlaps(NULL, "400"));
      +  ASSERT_TRUE(Overlaps(NULL, "800"));
      +  ASSERT_TRUE(Overlaps("100", NULL));
      +  ASSERT_TRUE(Overlaps("200", NULL));
      +  ASSERT_TRUE(Overlaps("449", NULL));
      +  ASSERT_TRUE(Overlaps("450", NULL));
      +}
      +
       TEST(FindFileTest, OverlapSequenceChecks) {
         Add("200", "200", 5000, 3000);
         ASSERT_TRUE(! Overlaps("199", "199"));
      @@ -117,6 +154,24 @@ TEST(FindFileTest, OverlapSequenceChecks) {
         ASSERT_TRUE(Overlaps("200", "210"));
       }
       
      +TEST(FindFileTest, OverlappingFiles) {
      +  Add("150", "600");
      +  Add("400", "500");
      +  disjoint_sorted_files_ = false;
      +  ASSERT_TRUE(! Overlaps("100", "149"));
      +  ASSERT_TRUE(! Overlaps("601", "700"));
      +  ASSERT_TRUE(Overlaps("100", "150"));
      +  ASSERT_TRUE(Overlaps("100", "200"));
      +  ASSERT_TRUE(Overlaps("100", "300"));
      +  ASSERT_TRUE(Overlaps("100", "400"));
      +  ASSERT_TRUE(Overlaps("100", "500"));
      +  ASSERT_TRUE(Overlaps("375", "400"));
      +  ASSERT_TRUE(Overlaps("450", "450"));
      +  ASSERT_TRUE(Overlaps("450", "500"));
      +  ASSERT_TRUE(Overlaps("450", "700"));
      +  ASSERT_TRUE(Overlaps("600", "700"));
      +}
      +
       }
       
       int main(int argc, char** argv) {
      diff --git a/doc/index.html b/doc/index.html
      index 8d03c45..472f7cd 100644
      --- a/doc/index.html
      +++ b/doc/index.html
      @@ -193,7 +193,7 @@ that a read should operate on a particular version of the DB state.
       If ReadOptions::snapshot is NULL, the read will operate on an
       implicit snapshot of the current state.
       

-Snapshots typically are created by the DB::GetSnapshot() method:
+Snapshots are created by the DB::GetSnapshot() method:

         leveldb::ReadOptions options;
      @@ -208,26 +208,6 @@ Note that when a snapshot is no longer needed, it should be released
       using the DB::ReleaseSnapshot interface.  This allows the
       implementation to get rid of state that was being maintained just to
       support reading as of that snapshot.
      -

-A Write operation can also return a snapshot that
-represents the state of the database just after applying a particular
-set of updates:
-

      -

      -  leveldb::Snapshot* snapshot;
      -  leveldb::WriteOptions write_options;
      -  write_options.post_write_snapshot = &snapshot;
      -  leveldb::Status status = db->Write(write_options, ...);
      -  ... perform other mutations to db ...
      -
      -  leveldb::ReadOptions read_options;
      -  read_options.snapshot = snapshot;
      -  leveldb::Iterator* iter = db->NewIterator(read_options);
      -  ... read as of the state just after the Write call returned ...
      -  delete iter;
      -
      -  db->ReleaseSnapshot(snapshot);
      -

      Slice

 The return value of the it->key() and it->value() calls above
diff --git a/include/leveldb/db.h b/include/leveldb/db.h
index f945dd7..7fb2965 100644
--- a/include/leveldb/db.h
+++ b/include/leveldb/db.h
@@ -112,6 +112,8 @@ class DB {
   //     where <N> is an ASCII representation of a level number (e.g. "0").
   //  "leveldb.stats" - returns a multi-line string that describes statistics
   //     about the internal operation of the DB.
+  //  "leveldb.sstables" - returns a multi-line string that describes all
+  //     of the sstables that make up the db contents.
   virtual bool GetProperty(const Slice& property, std::string* value) = 0;
 
   // For each i in [0,n-1], store in "sizes[i]", the approximate
@@ -125,8 +127,17 @@ class DB {
   virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) = 0;
 
-  // Possible extensions:
-  // (1) Add a method to compact a range of keys
+  // Compact the underlying storage for the key range [*begin,*end].
+  // In particular, deleted and overwritten versions are discarded,
+  // and the data is rearranged to reduce the cost of operations
+  // needed to access the data.  This operation should typically only
+  // be invoked by users who understand the underlying implementation.
+  //
+  // begin==NULL is treated as a key before all keys in the database.
+  // end==NULL is treated as a key after all keys in the database.
+  // Therefore the following call will compact the entire database:
+  //    db->CompactRange(NULL, NULL);
+  virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
 
  private:
   // No copying allowed
diff --git a/include/leveldb/env.h b/include/leveldb/env.h
index 1a8ff6b..a39d66f 100644
--- a/include/leveldb/env.h
+++ b/include/leveldb/env.h
@@ -160,6 +160,8 @@ class SequentialFile {
   // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
   // written by this routine.  Sets "*result" to the data that was
   // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
   // If an error was encountered, returns a non-OK status.
   //
   // REQUIRES: External synchronization
@@ -184,8 +186,10 @@ class RandomAccessFile {
   // Read up to "n" bytes from the file starting at "offset".
   // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
   // to the data that was read (including if fewer than "n" bytes were
-  // successfully read).  If an error was encountered, returns a
-  // non-OK status.
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
   //
   // Safe for concurrent use by multiple threads.
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
diff --git a/include/leveldb/options.h b/include/leveldb/options.h
index 381f228..84ac7fc 100644
--- a/include/leveldb/options.h
+++ b/include/leveldb/options.h
@@ -177,21 +177,8 @@ struct WriteOptions {
   // Default: false
   bool sync;
 
-  // If "post_write_snapshot" is non-NULL, and the write succeeds,
-  // *post_write_snapshot will be modified to point to a snapshot of
-  // the DB state immediately after this write.  The caller must call
-  // DB::ReleaseSnapshot(*post_write_snapshot) when the
-  // snapshot is no longer needed.
-  //
-  // If "post_write_snapshot" is non-NULL, and the write fails,
-  // *post_write_snapshot will be set to NULL.
-  //
-  // Default: NULL
-  const Snapshot** post_write_snapshot;
-
   WriteOptions()
-      : sync(false),
-        post_write_snapshot(NULL) {
+      : sync(false) {
   }
 };
diff --git a/util/coding.h b/util/coding.h
index 8755968..c47b9d8 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -62,10 +62,10 @@ inline uint32_t DecodeFixed32(const char* ptr) {
     memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
     return result;
   } else {
-    return ((static_cast<uint32_t>(ptr[0]))
-        | (static_cast<uint32_t>(ptr[1]) << 8)
-        | (static_cast<uint32_t>(ptr[2]) << 16)
-        | (static_cast<uint32_t>(ptr[3]) << 24));
+    return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
+        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
+        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
+        | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
   }
 }
diff --git a/util/posix_logger.h b/util/posix_logger.h
index 0dbdeaa..55428e5 100644
--- a/util/posix_logger.h
+++ b/util/posix_logger.h
@@ -1,5 +1,6 @@
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: sanjay@google.com (Sanjay Ghemawat)
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
 //
 // Logger implementation that can be shared by all environments
 // where enough posix functionality is available.
-- 
cgit v1.2.3


From 45b9940be332834440bd5299419f396e38085ebe Mon Sep 17 00:00:00 2001
From: "hans@chromium.org"
Date: Mon, 31 Oct 2011 17:34:55 +0000
Subject: A number of fixes:

- Replace raw slice comparison with a call to user comparator.
  Added test for custom comparators.

- Fix end of namespace comments.

- Fixed bug in picking inputs for a level-0 compaction.

  When finding overlapping files, the covered range may expand
  as files are added to the input set.  We now correctly expand
  the range when this happens instead of continuing to use the
  old range.  For example, suppose L0 contains files with the
  following ranges:

      F1: a .. d
      F2: c .. g
      F3: f .. j

  and the initial compaction target is F3.  We used to search
  for range f..j which yielded {F2,F3}.  However we now expand
  the range as soon as another file is added.  In this case,
  when F2 is added, we expand the range to c..j and restart the
  search.  That picks up file F1 as well.

  This change fixes a bug related to deleted keys showing up
  incorrectly after a compaction as described in Issue 44.
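
The F1/F2/F3 walkthrough above translates directly into code. A self-contained model of the corrected search (illustrative toy; the real fix lands in Version::GetOverlappingInputs in this patch):

    #include <cstdio>
    #include <string>
    #include <vector>

    // When a newly accepted level-0 file widens the covered range, clear
    // the result set and restart, so transitively overlapping files
    // (F1 via F2) are picked up too.
    struct File { std::string start, limit; };

    static std::vector<File> GetOverlapping(const std::vector<File>& files,
                                            std::string begin, std::string end) {
      std::vector<File> inputs;
      for (size_t i = 0; i < files.size(); ) {
        const File f = files[i++];
        if (f.limit < begin || f.start > end) continue;  // disjoint; skip
        inputs.push_back(f);
        if (f.start < begin) {        // range grew on the left: restart
          begin = f.start; inputs.clear(); i = 0;
        } else if (f.limit > end) {   // range grew on the right: restart
          end = f.limit; inputs.clear(); i = 0;
        }
      }
      return inputs;
    }

    int main() {
      // The example from the commit message: target F3 (f..j).
      std::vector<File> l0;
      File f1 = {"a", "d"}, f2 = {"c", "g"}, f3 = {"f", "j"};
      l0.push_back(f1); l0.push_back(f2); l0.push_back(f3);
      printf("%d files picked\n", (int)GetOverlapping(l0, "f", "j").size());  // 3
      return 0;
    }
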
(Sync with upstream @25072954)

git-svn-id: http://leveldb.googlecode.com/svn/trunk@54 62dab493-f737-651d-591e-8d6aee1b9529
---
 db/builder.cc                   |   2 +-
 db/builder.h                    |   2 +-
 db/corruption_test.cc           |   2 +-
 db/db_bench.cc                  |   4 +-
 db/db_impl.cc                   |   4 +-
 db/db_impl.h                    |   2 +-
 db/db_iter.cc                   |   2 +-
 db/db_iter.h                    |   2 +-
 db/db_test.cc                   | 102 +++++++++++++++++++++++++++++++++++++++-
 db/dbformat.cc                  |   2 +-
 db/dbformat.h                   |   4 +-
 db/dbformat_test.cc             |   2 +-
 db/filename.cc                  |   2 +-
 db/filename.h                   |   2 +-
 db/filename_test.cc             |   2 +-
 db/log_format.h                 |   4 +-
 db/log_reader.cc                |   4 +-
 db/log_reader.h                 |   4 +-
 db/log_test.cc                  |   4 +-
 db/log_writer.cc                |   4 +-
 db/log_writer.h                 |   4 +-
 db/memtable.cc                  |   2 +-
 db/memtable.h                   |   2 +-
 db/repair.cc                    |   4 +-
 db/skiplist.h                   |   2 +-
 db/skiplist_test.cc             |   2 +-
 db/snapshot.h                   |   2 +-
 db/table_cache.cc               |   2 +-
 db/table_cache.h                |   2 +-
 db/version_edit.cc              |   2 +-
 db/version_edit.h               |   2 +-
 db/version_edit_test.cc         |   2 +-
 db/version_set.cc               |  36 +++++++++-----
 db/version_set.h                |   2 +-
 db/version_set_test.cc          |   2 +-
 db/write_batch.cc               |   4 +-
 db/write_batch_internal.h       |   2 +-
 db/write_batch_test.cc          |   2 +-
 doc/bench/db_bench_sqlite3.cc   |   4 +-
 doc/bench/db_bench_tree_db.cc   |   4 +-
 helpers/memenv/memenv.cc        |   4 +-
 helpers/memenv/memenv.h         |   2 +-
 helpers/memenv/memenv_test.cc   |   2 +-
 include/leveldb/cache.h         |   2 +-
 include/leveldb/comparator.h    |   2 +-
 include/leveldb/db.h            |   2 +-
 include/leveldb/env.h           |   2 +-
 include/leveldb/iterator.h      |   2 +-
 include/leveldb/options.h       |   2 +-
 include/leveldb/slice.h         |   2 +-
 include/leveldb/status.h        |   2 +-
 include/leveldb/table.h         |   2 +-
 include/leveldb/table_builder.h |   2 +-
 include/leveldb/write_batch.h   |   2 +-
 port/atomic_pointer.h           |   4 +-
 port/port_android.cc            |   4 +-
 port/port_android.h             |   4 +-
 port/port_example.h             |   4 +-
 port/port_posix.cc              |   4 +-
 table/block.cc                  |   2 +-
 table/block.h                   |   2 +-
 table/block_builder.cc          |   2 +-
 table/block_builder.h           |   2 +-
 table/format.cc                 |   2 +-
 table/format.h                  |   2 +-
 table/iterator.cc               |   4 +-
 table/merger.cc                 |   4 +-
 table/merger.h                  |   2 +-
 table/table.cc                  |   2 +-
 table/table_builder.cc          |   2 +-
 table/table_test.cc             |   6 +--
 table/two_level_iterator.cc     |   4 +-
 table/two_level_iterator.h      |   2 +-
 util/arena.cc                   |   2 +-
 util/arena.h                    |   2 +-
 util/arena_test.cc              |   2 +-
 util/cache.cc                   |   2 +-
 util/cache_test.cc              |   2 +-
 util/coding.cc                  |   2 +-
 util/coding.h                   |   2 +-
 util/coding_test.cc             |   2 +-
 util/comparator.cc              |   4 +-
 util/crc32c.cc                  |   4 +-
 util/crc32c.h                   |   4 +-
 util/crc32c_test.cc             |   4 +-
 util/env.cc                     |   2 +-
 util/env_posix.cc               |   4 +-
 util/env_test.cc                |   2 +-
 util/hash.cc                    |   2 +-
 util/histogram.cc               |   2 +-
 util/histogram.h                |   2 +-
 util/logging.cc                 |   2 +-
 util/logging.h                  |   2 +-
 util/mutexlock.h                |   2 +-
 util/options.cc                 |   2 +-
 util/posix_logger.h             |   2 +-
 util/random.h                   |   2 +-
 util/status.cc                  |   2 +-
 util/testharness.cc             |   4 +-
 util/testharness.h              |   4 +-
 util/testutil.cc                |   4 +-
 util/testutil.h                 |   4 +-
 102 files changed, 258 insertions(+), 146 deletions(-)

diff --git a/db/builder.cc b/db/builder.cc
index 34a7b87..f419882 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -85,4 +85,4 @@ Status BuildTable(const std::string& dbname,
   return s;
 }
 
-}
+}  // namespace leveldb
diff --git a/db/builder.h b/db/builder.h
index b2aeabf..62431fc 100644
--- a/db/builder.h
+++ b/db/builder.h
@@ -29,6 +29,6 @@ extern Status BuildTable(const std::string& dbname,
                          Iterator* iter,
                          FileMetaData* meta);
 
-}
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_BUILDER_H_
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
index 1edcd84..31b2d5f 100644
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -352,7 +352,7 @@ TEST(CorruptionTest, UnrelatedKeys)
 {
   ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
 }
 
-}
+}  // namespace leveldb
 
 int main(int argc, char** argv) {
   return leveldb::test::RunAllTests();
diff --git a/db/db_bench.cc b/db/db_bench.cc
index cf9bb65..bbfd618 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -288,7 +288,7 @@ struct ThreadState {
   }
 };
 
-}
+}  // namespace
 
 class Benchmark {
  private:
@@ -829,7 +829,7 @@ class Benchmark {
   }
 };
 
-}
+}  // namespace leveldb
 
 int main(int argc, char** argv) {
   FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
diff --git a/db/db_impl.cc b/db/db_impl.cc
index 56182a0..b4df80d 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -985,7 +985,7 @@ static void CleanupIteratorState(void* arg1, void* arg2) {
   state->mu->Unlock();
   delete state;
 }
-}
+}  // namespace
 
 Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
                                       SequenceNumber* latest_snapshot) {
@@ -1378,4 +1378,4 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
   return result;
 }
 
-}
+}  // namespace leveldb
diff --git a/db/db_impl.h b/db/db_impl.h
index ab03181..fc40d1e 100644
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -187,6 +187,6 @@ extern Options SanitizeOptions(const std::string& db,
                                const InternalKeyComparator* icmp,
                                const Options& src);
 
-}
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_DB_IMPL_H_
diff --git a/db/db_iter.cc b/db/db_iter.cc
index 8849f92..87dca2d 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -296,4 +296,4 @@ Iterator* NewDBIterator(
   return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
 }
 
-}
+}  // namespace leveldb
diff --git a/db/db_iter.h b/db/db_iter.h
index 195f3d3..d9e1b17 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -21,6 +21,6 @@ extern Iterator* NewDBIterator(
     Iterator* internal_iter,
     const SequenceNumber& sequence);
 
-}
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_DB_ITER_H_
diff --git a/db/db_test.cc b/db/db_test.cc
index ab71c51..f1cb949 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -136,6 +136,33 @@ class DBTest {
     return result;
   }
 
+  // Return a string that contains all key,value pairs in order,
+  // formatted like "(k1->v1)(k2->v2)".
+  std::string Contents() {
+    std::vector<std::string> forward;
+    std::string result;
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string s = IterStatus(iter);
+      result.push_back('(');
+      result.append(s);
+      result.push_back(')');
+      forward.push_back(s);
+    }
+
+    // Check reverse iteration results are the reverse of forward results
+    int matched = 0;
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      ASSERT_LT(matched, forward.size());
+      ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+      matched++;
+    }
+    ASSERT_EQ(matched, forward.size());
+
+    delete iter;
+    return result;
+  }
+
   std::string AllEntriesFor(const Slice& user_key) {
     Iterator* iter = dbfull()->TEST_NewInternalIterator();
     InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
@@ -1048,6 +1075,49 @@ TEST(DBTest, OverlapInLevel0) {
   ASSERT_EQ("NOT_FOUND", Get("600"));
 }
 
+TEST(DBTest, L0_CompactionBug_Issue44_a) {
+  Reopen();
+  ASSERT_OK(Put("b", "v"));
+  Reopen();
+  ASSERT_OK(Delete("b"));
+  ASSERT_OK(Delete("a"));
+  Reopen();
+  ASSERT_OK(Delete("a"));
+  Reopen();
+  ASSERT_OK(Put("a", "v"));
+  Reopen();
+  Reopen();
+  ASSERT_EQ("(a->v)", Contents());
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+  ASSERT_EQ("(a->v)", Contents());
+}
+
+TEST(DBTest, L0_CompactionBug_Issue44_b) {
+  Reopen();
+  Put("","");
+  Reopen();
+  Delete("e");
+  Put("","");
+  Reopen();
+  Put("c", "cv");
+  Reopen();
+  Put("","");
+  Reopen();
+  Put("","");
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+  Reopen();
+  Put("d","dv");
+  Reopen();
+  Put("","");
+  Reopen();
+  Delete("d");
+  Delete("b");
+  Reopen();
+  ASSERT_EQ("(->)(c->cv)", Contents());
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+  ASSERT_EQ("(->)(c->cv)", Contents());
+}
+
 TEST(DBTest, ComparatorCheck) {
   class NewComparator : public Comparator {
    public:
@@ -1071,6 +1141,34 @@ TEST(DBTest, ComparatorCheck) {
       << s.ToString();
 }
 
+TEST(DBTest, CustomComparator) {
+  class NumberComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "test.NumberComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return (strtol(a.ToString().c_str(), NULL, 0) -
+              strtol(b.ToString().c_str(), NULL, 0));
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {}
+    virtual void FindShortSuccessor(std::string* key) const {}
+  };
+  NumberComparator cmp;
+  Options new_options;
+  new_options.create_if_missing = true;
+  new_options.comparator = &cmp;
+  DestroyAndReopen(&new_options);
+  ASSERT_OK(Put("10", "ten"));
+  ASSERT_OK(Put("0x14", "twenty"));
+  for (int i = 0; i < 2; i++) {
+    ASSERT_EQ("ten", Get("10"));
+    ASSERT_EQ("ten", Get("0xa"));
+    ASSERT_EQ("twenty", Get("20"));
+    ASSERT_EQ("twenty", Get("0x14"));
+    Compact("0", "9999");
+    fprintf(stderr, "ss\n%s\n", DumpSSTableList().c_str());
+  }
+}
+
 TEST(DBTest, ManualCompaction) {
   ASSERT_EQ(config::kMaxMemCompactLevel, 2)
       << "Need to update this test to match kMaxMemCompactLevel";
@@ -1207,7 +1305,7 @@ static void MTThreadBody(void* arg) {
   fprintf(stderr, "... stopping thread %d after %d ops\n", t->id, int(counter));
 }
 
-}
+}  // namespace
 
 TEST(DBTest, MultiThreaded) {
   // Initialize state
@@ -1525,7 +1623,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
           buf, iters, us, ((float)us) / iters);
 }
 
-}
+}  // namespace leveldb
 
 int main(int argc, char** argv) {
   if (argc > 1 && std::string(argv[1]) == "--benchmark") {
diff --git a/db/dbformat.cc b/db/dbformat.cc
index 4fb3531..4594a57 100644
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@@ -115,4 +115,4 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
   end_ = dst;
 }
 
-}
+}  // namespace leveldb
diff --git a/db/dbformat.h b/db/dbformat.h
index d046990..044717d 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -37,7 +37,7 @@ static const int kL0_StopWritesTrigger = 12;
 // space if the same key space is being repeatedly overwritten.
 static const int kMaxMemCompactLevel = 2;
 
-}
+}  // namespace config
 
 class InternalKey;
 
@@ -210,6 +210,6 @@ inline LookupKey::~LookupKey() {
   if (start_ != space_) delete[] start_;
 }
 
-}
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_FORMAT_H_
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
index 57c5578..5d82f5d 100644
--- a/db/dbformat_test.cc
+++ b/db/dbformat_test.cc
@@ -105,7 +105,7 @@ TEST(FormatTest, InternalKeyShortestSuccessor) {
             ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
 }
 
-}
+}  // namespace leveldb
 
 int main(int argc, char** argv) {
   return leveldb::test::RunAllTests();
diff --git a/db/filename.cc b/db/filename.cc
index b3a917c..24fd140 100644
--- a/db/filename.cc
+++ b/db/filename.cc
@@ -132,4 +132,4 @@ Status SetCurrentFile(Env* env, const std::string& dbname,
   return s;
 }
 
-}
+}  // namespace leveldb
diff --git a/db/filename.h b/db/filename.h
index e9ec8a7..d5d09b1 100644
--- a/db/filename.h
+++ b/db/filename.h
@@ -75,6 +75,6 @@ extern Status SetCurrentFile(Env* env, const std::string& dbname,
                              uint64_t descriptor_number);
 
-}
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_FILENAME_H_
diff --git a/db/filename_test.cc b/db/filename_test.cc
index 2f61e8d..47353d6 100644
--- a/db/filename_test.cc
+++ b/db/filename_test.cc
@@ -115,7 +115,7 @@ TEST(FileNameTest, Construction) {
   ASSERT_EQ(kTempFile, type);
 }
 
-}
+}  // namespace leveldb
 
 int main(int argc, char** argv) {
   return leveldb::test::RunAllTests();
diff --git a/db/log_format.h b/db/log_format.h
index 353eff8..2690cb9 100644
--- a/db/log_format.h
+++ b/db/log_format.h
@@ -29,7 +29,7 @@ static const int kBlockSize = 32768;
 
 // Header is checksum (4 bytes), type (1 byte), length (2 bytes).
 static const int kHeaderSize = 4 + 1 + 2;
 
-}
-}
+}  // namespace log
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_LOG_FORMAT_H_
diff --git a/db/log_reader.cc b/db/log_reader.cc
index fcb3aa7..b35f115 100644
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -255,5 +255,5 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
   }
 }
 
-}
-}
+}  // namespace log
+}  // namespace leveldb
diff --git a/db/log_reader.h b/db/log_reader.h
index 61cc414..82d4bee 100644
--- a/db/log_reader.h
+++ b/db/log_reader.h
@@ -102,7 +102,7 @@ class Reader {
   void operator=(const Reader&);
 };
 
-}
-}
+}  // namespace log
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_LOG_READER_H_
diff --git a/db/log_test.cc b/db/log_test.cc
index 06e0893..4c5cf87 100644
--- a/db/log_test.cc
+++ b/db/log_test.cc
@@ -492,8 +492,8 @@ TEST(LogTest, ReadPastEnd) {
   CheckOffsetPastEndReturnsNoRecords(5);
 }
 
-}
-}
+}  // namespace log
+}  // namespace leveldb
 
 int main(int argc, char** argv) {
   return leveldb::test::RunAllTests();
diff --git a/db/log_writer.cc b/db/log_writer.cc
index 0887f6c..2da99ac 100644
--- a/db/log_writer.cc
+++ b/db/log_writer.cc
@@ -99,5 +99,5 @@ Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
   return s;
 }
 
-}
-}
+}  // namespace log
+}  // namespace leveldb
diff --git a/db/log_writer.h b/db/log_writer.h
index d3cf27d..a3a954d 100644
--- a/db/log_writer.h
+++ b/db/log_writer.h
@@ -42,7 +42,7 @@ class Writer {
   void operator=(const Writer&);
 };
 
-}
-}
+}  // namespace log
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_LOG_WRITER_H_
diff --git a/db/memtable.cc b/db/memtable.cc
index 4555abb..bfec0a7 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -142,4 +142,4 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
   return false;
 }
 
-}
+}  // namespace leveldb
diff --git a/db/memtable.h b/db/memtable.h
index 1898b5e..92e90bb 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -86,6 +86,6 @@ class MemTable {
   void operator=(const MemTable&);
 };
 
-}
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_MEMTABLE_H_
diff --git a/db/repair.cc b/db/repair.cc
index 5bcdb56..511c66b 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -377,11 +377,11 @@ class Repairer {
             fname.c_str(), s.ToString().c_str());
   }
 };
-}
+}  // namespace
 
 Status RepairDB(const std::string& dbname, const Options& options) {
   Repairer repairer(dbname, options);
   return repairer.Run();
 }
 
-}
+}  // namespace leveldb
diff --git a/db/skiplist.h b/db/skiplist.h
index be39354..0481575 100644
--- a/db/skiplist.h
+++ b/db/skiplist.h
@@ -375,4 +375,4 @@ bool SkipList<Key,Comparator>::Contains(const Key& key) const {
   }
 }
 
-}
+}  // namespace leveldb
diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc
index 2bd8d22..c78f4b4 100644
--- a/db/skiplist_test.cc
+++ b/db/skiplist_test.cc
@@ -371,7 +371,7 @@ TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
 TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
 TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
 
-}
+}  // namespace leveldb
 
 int main(int argc, char** argv) {
   return leveldb::test::RunAllTests();
diff --git a/db/snapshot.h b/db/snapshot.h
index a08dbd3..e7f8fd2 100644
--- a/db/snapshot.h
+++ b/db/snapshot.h
@@ -61,6 +61,6 @@ class SnapshotList {
   SnapshotImpl list_;
 };
 
-}
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_SNAPSHOT_H_
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 325d707..cae79bd 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -92,4 +92,4 @@ void TableCache::Evict(uint64_t file_number) {
   cache_->Erase(Slice(buf, sizeof(buf)));
 }
 
-}
+}  // namespace leveldb
diff --git a/db/table_cache.h b/db/table_cache.h
index 5376194..0f3c73b 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -45,6 +45,6 @@ class TableCache {
   Cache* cache_;
 };
 
-}
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
diff --git a/db/version_edit.cc b/db/version_edit.cc
index 9891c32..f10a2d5 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -263,4 +263,4 @@ std::string VersionEdit::DebugString() const {
   return r;
 }
 
-}
+}  // namespace leveldb
diff --git a/db/version_edit.h b/db/version_edit.h
index a069893..eaef77b 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -102,6 +102,6 @@ class VersionEdit {
   std::vector< std::pair<int, FileMetaData> > new_files_;
 };
 
-}
+}  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_VERSION_EDIT_H_
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
index 67959f7..280310b 100644
--- a/db/version_edit_test.cc
+++ b/db/version_edit_test.cc
@@ -39,7 +39,7 @@ TEST(VersionEditTest, EncodeDecode) {
   TestEncodeDecode(edit);
 }
 
-}
+}  // namespace leveldb
 
 int main(int argc, char** argv) {
   return leveldb::test::RunAllTests();
diff --git a/db/version_set.cc b/db/version_set.cc
index 8b96af0..7cf5197 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -61,7 +61,7 @@ std::string IntSetToString(const std::set<uint64_t>& s) {
   result += "}";
   return result;
 }
-}
+}  // namespace
 
 Version::~Version() {
   assert(refs_ == 0);
@@ -253,7 +253,8 @@ void Version::AddIterators(const ReadOptions& options,
 // If "*iter" points at a value or deletion for user_key, store
 // either the value, or a NotFound error and return true.
 // Else return false.
-static bool GetValue(Iterator* iter, const Slice& user_key,
+static bool GetValue(const Comparator* cmp,
+                     Iterator* iter, const Slice& user_key,
                      std::string* value,
                      Status* s) {
   if (!iter->Valid()) {
@@ -264,7 +265,7 @@ static bool GetValue(Iterator* iter, const Slice& user_key,
     *s = Status::Corruption("corrupted key for ", user_key);
     return true;
   }
-  if (parsed_key.user_key != user_key) {
+  if (cmp->Compare(parsed_key.user_key, user_key) != 0) {
     return false;
   }
   switch (parsed_key.type) {
@@ -360,7 +361,7 @@ Status Version::Get(const ReadOptions& options,
                                                f->number, f->file_size);
     iter->Seek(ikey);
-    const bool done = GetValue(iter, user_key, value, &s);
+    const bool done = GetValue(ucmp, iter, user_key, value, &s);
     if (!iter->status().ok()) {
       s = iter->status();
       delete iter;
@@ -450,16 +451,29 @@ void Version::GetOverlappingInputs(
     user_end = end->user_key();
   }
   const Comparator* user_cmp = vset_->icmp_.user_comparator();
-  for (size_t i = 0; i < files_[level].size(); i++) {
-    FileMetaData* f = files_[level][i];
-    if (begin != NULL &&
-        user_cmp->Compare(f->largest.user_key(), user_begin) < 0) {
+  for (size_t i = 0; i < files_[level].size(); ) {
+    FileMetaData* f = files_[level][i++];
+    const Slice file_start = f->smallest.user_key();
+    const Slice file_limit = f->largest.user_key();
+    if (begin != NULL && user_cmp->Compare(file_limit, user_begin) < 0) {
       // "f" is completely before specified range; skip it
-    } else if (end != NULL &&
-               user_cmp->Compare(f->smallest.user_key(), user_end) > 0) {
+    } else if (end != NULL && user_cmp->Compare(file_start, user_end) > 0) {
      // "f" is completely after specified range; skip it
     } else {
       inputs->push_back(f);
+      if (level == 0) {
+        // Level-0 files may overlap each other.  So check if the newly
+        // added file has expanded the range.  If so, restart search.
+ if (begin != NULL && user_cmp->Compare(file_start, user_begin) < 0) { + user_begin = file_start; + inputs->clear(); + i = 0; + } else if (end != NULL && user_cmp->Compare(file_limit, user_end) > 0) { + user_end = file_limit; + inputs->clear(); + i = 0; + } + } } } } @@ -1369,4 +1383,4 @@ void Compaction::ReleaseInputs() { } } -} +} // namespace leveldb diff --git a/db/version_set.h b/db/version_set.h index b866b2a..572602e 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -365,6 +365,6 @@ class Compaction { size_t level_ptrs_[config::kNumLevels]; }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 06f8bbd..501e34d 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -172,7 +172,7 @@ TEST(FindFileTest, OverlappingFiles) { ASSERT_TRUE(Overlaps("600", "700")); } -} +} // namespace leveldb int main(int argc, char** argv) { return leveldb::test::RunAllTests(); diff --git a/db/write_batch.cc b/db/write_batch.cc index 4e1e899..a0e812f 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -120,7 +120,7 @@ class MemTableInserter : public WriteBatch::Handler { sequence_++; } }; -} +} // namespace Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* memtable) { @@ -135,4 +135,4 @@ void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { b->rep_.assign(contents.data(), contents.size()); } -} +} // namespace leveldb diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 6d65eed..49aeb84 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -41,7 +41,7 @@ class WriteBatchInternal { static Status InsertInto(const WriteBatch* batch, MemTable* memtable); }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 73d68fd..1ee6d7b 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -82,7 +82,7 @@ TEST(WriteBatchTest, Corruption) { PrintContents(&batch)); } -} +} // namespace leveldb int main(int argc, char** argv) { return leveldb::test::RunAllTests(); diff --git a/doc/bench/db_bench_sqlite3.cc b/doc/bench/db_bench_sqlite3.cc index e11db52..6951a14 100644 --- a/doc/bench/db_bench_sqlite3.cc +++ b/doc/bench/db_bench_sqlite3.cc @@ -155,7 +155,7 @@ static Slice TrimSpace(Slice s) { return Slice(s.data() + start, limit - start); } -} +} // namespace class Benchmark { private: @@ -652,7 +652,7 @@ class Benchmark { }; -} +} // namespace leveldb int main(int argc, char** argv) { for (int i = 1; i < argc; i++) { diff --git a/doc/bench/db_bench_tree_db.cc b/doc/bench/db_bench_tree_db.cc index d42e306..214d9b7 100644 --- a/doc/bench/db_bench_tree_db.cc +++ b/doc/bench/db_bench_tree_db.cc @@ -124,7 +124,7 @@ static Slice TrimSpace(Slice s) { return Slice(s.data() + start, limit - start); } -} +} // namespace class Benchmark { private: @@ -467,7 +467,7 @@ class Benchmark { } }; -} +} // namespace leveldb int main(int argc, char** argv) { for (int i = 1; i < argc; i++) { diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc index dab80fe..2082083 100644 --- a/helpers/memenv/memenv.cc +++ b/helpers/memenv/memenv.cc @@ -365,10 +365,10 @@ class InMemoryEnv : public EnvWrapper { FileSystem file_map_; // Protected by mutex_. 
}; -} +} // namespace Env* NewMemEnv(Env* base_env) { return new InMemoryEnv(base_env); } -} +} // namespace leveldb diff --git a/helpers/memenv/memenv.h b/helpers/memenv/memenv.h index 835b944..03b88de 100644 --- a/helpers/memenv/memenv.h +++ b/helpers/memenv/memenv.h @@ -15,6 +15,6 @@ class Env; // *base_env must remain live while the result is in use. Env* NewMemEnv(Env* base_env); -} +} // namespace leveldb #endif // STORAGE_LEVELDB_HELPERS_MEMENV_MEMENV_H_ diff --git a/helpers/memenv/memenv_test.cc b/helpers/memenv/memenv_test.cc index 3791dc3..a44310f 100644 --- a/helpers/memenv/memenv_test.cc +++ b/helpers/memenv/memenv_test.cc @@ -225,7 +225,7 @@ TEST(MemEnvTest, DBTest) { delete db; } -} +} // namespace leveldb int main(int argc, char** argv) { return leveldb::test::RunAllTests(); diff --git a/include/leveldb/cache.h b/include/leveldb/cache.h index 79196d1..5e3b476 100644 --- a/include/leveldb/cache.h +++ b/include/leveldb/cache.h @@ -94,6 +94,6 @@ class Cache { void operator=(const Cache&); }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_CACHE_H_ diff --git a/include/leveldb/comparator.h b/include/leveldb/comparator.h index c215fac..556b984 100644 --- a/include/leveldb/comparator.h +++ b/include/leveldb/comparator.h @@ -58,6 +58,6 @@ class Comparator { // must not be deleted. extern const Comparator* BytewiseComparator(); -} +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ diff --git a/include/leveldb/db.h b/include/leveldb/db.h index 7fb2965..c1182b7 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -155,6 +155,6 @@ Status DestroyDB(const std::string& name, const Options& options); // on a database that contains important information. Status RepairDB(const std::string& dbname, const Options& options); -} +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_DB_H_ diff --git a/include/leveldb/env.h b/include/leveldb/env.h index a39d66f..2720667 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -318,6 +318,6 @@ class EnvWrapper : public Env { Env* target_; }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_ENV_H_ diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h index 6821d85..ad543eb 100644 --- a/include/leveldb/iterator.h +++ b/include/leveldb/iterator.h @@ -95,6 +95,6 @@ extern Iterator* NewEmptyIterator(); // Return an empty iterator with the specified status. 
extern Iterator* NewErrorIterator(const Status& status); -} +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 84ac7fc..79111a0 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -182,6 +182,6 @@ struct WriteOptions { } }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h index 7c66d1b..74ea8fa 100644 --- a/include/leveldb/slice.h +++ b/include/leveldb/slice.h @@ -103,7 +103,7 @@ inline int Slice::compare(const Slice& b) const { return r; } -} +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_ diff --git a/include/leveldb/status.h b/include/leveldb/status.h index 8fe4442..3355fac 100644 --- a/include/leveldb/status.h +++ b/include/leveldb/status.h @@ -95,6 +95,6 @@ inline void Status::operator=(const Status& s) { } } -} +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_ diff --git a/include/leveldb/table.h b/include/leveldb/table.h index 35e5d22..0cbdd40 100644 --- a/include/leveldb/table.h +++ b/include/leveldb/table.h @@ -65,6 +65,6 @@ class Table { void operator=(const Table&); }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_TABLE_H_ diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h index 23851de..9ac0868 100644 --- a/include/leveldb/table_builder.h +++ b/include/leveldb/table_builder.h @@ -86,6 +86,6 @@ class TableBuilder { void operator=(const TableBuilder&); }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ diff --git a/include/leveldb/write_batch.h b/include/leveldb/write_batch.h index b4446c2..ee9aab6 100644 --- a/include/leveldb/write_batch.h +++ b/include/leveldb/write_batch.h @@ -59,6 +59,6 @@ class WriteBatch { // Intentionally copyable }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h index c20b1bd..35ae550 100644 --- a/port/atomic_pointer.h +++ b/port/atomic_pointer.h @@ -138,7 +138,7 @@ class AtomicPointer { #undef ARCH_CPU_X86_FAMILY #undef ARCH_CPU_ARM_FAMILY -} // namespace leveldb::port -} // namespace leveldb +} // namespace port +} // namespace leveldb #endif // PORT_ATOMIC_POINTER_H_ diff --git a/port/port_android.cc b/port/port_android.cc index 240e9ca..815abf2 100644 --- a/port/port_android.cc +++ b/port/port_android.cc @@ -60,5 +60,5 @@ void CondVar::SignalAll() { PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); } -} -} +} // namespace port +} // namespace leveldb diff --git a/port/port_android.h b/port/port_android.h index d68b6c0..64cdcbf 100644 --- a/port/port_android.h +++ b/port/port_android.h @@ -150,7 +150,7 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { return false; } -} -} +} // namespace port +} // namespace leveldb #endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ diff --git a/port/port_example.h b/port/port_example.h index 6bd9b49..036c7d1 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -119,7 +119,7 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length, // The concatenation of all "data[0,n-1]" fragments is the heap profile. 
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); -} -} +} // namespace port +} // namespace leveldb #endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff --git a/port/port_posix.cc b/port/port_posix.cc index e75da8b..c44cc99 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -46,5 +46,5 @@ void CondVar::SignalAll() { PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); } -} -} +} // namespace port +} // namespace leveldb diff --git a/table/block.cc b/table/block.cc index c20bb38..40aa318 100644 --- a/table/block.cc +++ b/table/block.cc @@ -260,4 +260,4 @@ Iterator* Block::NewIterator(const Comparator* cmp) { } } -} +} // namespace leveldb diff --git a/table/block.h b/table/block.h index cdf0598..9eb6f02 100644 --- a/table/block.h +++ b/table/block.h @@ -38,6 +38,6 @@ class Block { class Iter; }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_TABLE_BLOCK_H_ diff --git a/table/block_builder.cc b/table/block_builder.cc index d2ffa21..db660cd 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -106,4 +106,4 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { counter_++; } -} +} // namespace leveldb diff --git a/table/block_builder.h b/table/block_builder.h index bf92a0f..5b545bd 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -52,6 +52,6 @@ class BlockBuilder { void operator=(const BlockBuilder&); }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_ diff --git a/table/format.cc b/table/format.cc index ba7838c..23728d8 100644 --- a/table/format.cc +++ b/table/format.cc @@ -132,4 +132,4 @@ Status ReadBlock(RandomAccessFile* file, return Status::OK(); } -} +} // namespace leveldb diff --git a/table/format.h b/table/format.h index a6ab964..2a3e1ac 100644 --- a/table/format.h +++ b/table/format.h @@ -98,6 +98,6 @@ inline BlockHandle::BlockHandle() size_(~static_cast<uint64_t>(0)) { } -} +} // namespace leveldb #endif // STORAGE_LEVELDB_TABLE_FORMAT_H_ diff --git a/table/iterator.cc b/table/iterator.cc index 33bc8a2..3d1c87f 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -54,7 +54,7 @@ class EmptyIterator : public Iterator { private: Status status_; }; -} +} // namespace Iterator* NewEmptyIterator() { return new EmptyIterator(Status::OK()); @@ -64,4 +64,4 @@ Iterator* NewErrorIterator(const Status& status) { return new EmptyIterator(status); } -} +} // namespace leveldb diff --git a/table/merger.cc b/table/merger.cc index 6ce06bb..2dde4dc 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -181,7 +181,7 @@ void MergingIterator::FindLargest() { } current_ = largest; } -} +} // namespace Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { assert(n >= 0); @@ -194,4 +194,4 @@ Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { } } -} +} // namespace leveldb diff --git a/table/merger.h b/table/merger.h index 71d9dc5..91ddd80 100644 --- a/table/merger.h +++ b/table/merger.h @@ -21,6 +21,6 @@ class Iterator; extern Iterator* NewMergingIterator( const Comparator* comparator, Iterator** children, int n); -} +} // namespace leveldb #endif // STORAGE_LEVELDB_TABLE_MERGER_H_ diff --git a/table/table.cc b/table/table.cc index 9820753..5f9238e 100644 --- a/table/table.cc +++ b/table/table.cc @@ -172,4 +172,4 @@ uint64_t Table::ApproximateOffsetOf(const Slice& key) const { return result; } -} +} // namespace leveldb diff --git a/table/table_builder.cc b/table/table_builder.cc index 7ec7ad2..682ce5b 100644 --- a/table/table_builder.cc +++
b/table/table_builder.cc @@ -224,4 +224,4 @@ uint64_t TableBuilder::FileSize() const { return rep_->offset; } -} +} // namespace leveldb diff --git a/table/table_test.cc b/table/table_test.cc index c69a386..cd85b4b 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -60,7 +60,7 @@ class ReverseKeyComparator : public Comparator { *key = Reverse(s); } }; -} +} // namespace static ReverseKeyComparator reverse_key_comparator; static void Increment(const Comparator* cmp, std::string* key) { @@ -85,7 +85,7 @@ struct STLLessThan { return cmp->Compare(Slice(a), Slice(b)) < 0; } }; -} +} // namespace class StringSink: public WritableFile { public: @@ -847,7 +847,7 @@ TEST(TableTest, ApproximateOffsetOfCompressed) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); } -} +} // namespace leveldb int main(int argc, char** argv) { return leveldb::test::RunAllTests(); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 24a1241..7822eba 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -169,7 +169,7 @@ void TwoLevelIterator::InitDataBlock() { } } -} +} // namespace Iterator* NewTwoLevelIterator( Iterator* index_iter, @@ -179,4 +179,4 @@ Iterator* NewTwoLevelIterator( return new TwoLevelIterator(index_iter, block_function, arg, options); } -} +} // namespace leveldb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 5909e2b..629ca34 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -29,6 +29,6 @@ extern Iterator* NewTwoLevelIterator( void* arg, const ReadOptions& options); -} +} // namespace leveldb #endif // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_ diff --git a/util/arena.cc b/util/arena.cc index 40ab99d..9551d6a 100644 --- a/util/arena.cc +++ b/util/arena.cc @@ -65,4 +65,4 @@ char* Arena::AllocateNewBlock(size_t block_bytes) { return result; } -} +} // namespace leveldb diff --git a/util/arena.h b/util/arena.h index fcb5d5b..8f7dde2 100644 --- a/util/arena.h +++ b/util/arena.h @@ -63,6 +63,6 @@ inline char* Arena::Allocate(size_t bytes) { return AllocateFallback(bytes); } -} +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_ARENA_H_ diff --git a/util/arena_test.cc b/util/arena_test.cc index c33b552..63d1778 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -61,7 +61,7 @@ TEST(ArenaTest, Simple) { } } -} +} // namespace leveldb int main(int argc, char** argv) { return leveldb::test::RunAllTests(); diff --git a/util/cache.cc b/util/cache.cc index 7d1ebc1..24f1f63 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -325,4 +325,4 @@ Cache* NewLRUCache(size_t capacity) { return new ShardedLRUCache(capacity); } -} +} // namespace leveldb diff --git a/util/cache_test.cc b/util/cache_test.cc index 8a7f1c4..4371671 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -179,7 +179,7 @@ TEST(CacheTest, NewId) { ASSERT_NE(a, b); } -} +} // namespace leveldb int main(int argc, char** argv) { return leveldb::test::RunAllTests(); diff --git a/util/coding.cc b/util/coding.cc index 14f21f7..dbd7a65 100644 --- a/util/coding.cc +++ b/util/coding.cc @@ -191,4 +191,4 @@ bool GetLengthPrefixedSlice(Slice* input, Slice* result) { } } -} +} // namespace leveldb diff --git a/util/coding.h b/util/coding.h index c47b9d8..3993c4a 100644 --- a/util/coding.h +++ b/util/coding.h @@ -99,6 +99,6 @@ inline const char* GetVarint32Ptr(const char* p, return GetVarint32PtrFallback(p, limit, value); } -} +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_CODING_H_ diff --git 
a/util/coding_test.cc b/util/coding_test.cc index a8dba04..4cc856c 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -166,7 +166,7 @@ TEST(Coding, Strings) { ASSERT_EQ("", input.ToString()); } -} +} // namespace leveldb int main(int argc, char** argv) { return leveldb::test::RunAllTests(); diff --git a/util/comparator.cc b/util/comparator.cc index 2d7544d..cfb49ce 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -63,11 +63,11 @@ class BytewiseComparatorImpl : public Comparator { // *key is a run of 0xffs. Leave it alone. } }; -} +} // namespace static const BytewiseComparatorImpl bytewise; const Comparator* BytewiseComparator() { return &bytewise; } -} +} // namespace leveldb diff --git a/util/crc32c.cc b/util/crc32c.cc index 28c2401..6db9e77 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -328,5 +328,5 @@ uint32_t Extend(uint32_t crc, const char* buf, size_t size) { return l ^ 0xffffffffu; } -} -} +} // namespace crc32c +} // namespace leveldb diff --git a/util/crc32c.h b/util/crc32c.h index 938d8ff..1d7e5c0 100644 --- a/util/crc32c.h +++ b/util/crc32c.h @@ -39,7 +39,7 @@ inline uint32_t Unmask(uint32_t masked_crc) { return ((rot >> 17) | (rot << 15)); } -} -} +} // namespace crc32c +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_CRC32C_H_ diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc index ba9e804..4b957ee 100644 --- a/util/crc32c_test.cc +++ b/util/crc32c_test.cc @@ -64,8 +64,8 @@ TEST(CRC, Mask) { ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc))))); } -} -} +} // namespace crc32c +} // namespace leveldb int main(int argc, char** argv) { return leveldb::test::RunAllTests(); diff --git a/util/env.cc b/util/env.cc index 79e493e..594811b 100644 --- a/util/env.cc +++ b/util/env.cc @@ -79,4 +79,4 @@ Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { EnvWrapper::~EnvWrapper() { } -} +} // namespace leveldb diff --git a/util/env_posix.cc b/util/env_posix.cc index 5127c89..cc73348 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -553,7 +553,7 @@ void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { pthread_create(&t, NULL, &StartThreadWrapper, state)); } -} +} // namespace static pthread_once_t once = PTHREAD_ONCE_INIT; static Env* default_env; @@ -564,4 +564,4 @@ Env* Env::Default() { return default_env; } -} +} // namespace leveldb diff --git a/util/env_test.cc b/util/env_test.cc index 3c253be..3f8a8a2 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -95,7 +95,7 @@ TEST(EnvPosixTest, StartThread) { ASSERT_EQ(state.val, 3); } -} +} // namespace leveldb int main(int argc, char** argv) { return leveldb::test::RunAllTests(); diff --git a/util/hash.cc b/util/hash.cc index d19afd1..ba18180 100644 --- a/util/hash.cc +++ b/util/hash.cc @@ -42,4 +42,4 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) { } -} +} // namespace leveldb diff --git a/util/histogram.cc b/util/histogram.cc index 12ec3cf..bb95f58 100644 --- a/util/histogram.cc +++ b/util/histogram.cc @@ -136,4 +136,4 @@ std::string Histogram::ToString() const { return r; } -} +} // namespace leveldb diff --git a/util/histogram.h b/util/histogram.h index 32484c0..1ef9f3c 100644 --- a/util/histogram.h +++ b/util/histogram.h @@ -37,6 +37,6 @@ class Histogram { double StandardDeviation() const; }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ diff --git a/util/logging.cc b/util/logging.cc index 760d335..22cf278 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -78,4 +78,4 @@ bool ConsumeDecimalNumber(Slice* in, 
uint64_t* val) { return (digits > 0); } -} +} // namespace leveldb diff --git a/util/logging.h b/util/logging.h index 1cd0a4b..b0c5da8 100644 --- a/util/logging.h +++ b/util/logging.h @@ -42,6 +42,6 @@ extern bool ConsumeChar(Slice* in, char c); // unspecified state. extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); -} +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_LOGGING_H_ diff --git a/util/mutexlock.h b/util/mutexlock.h index 05fe279..c3f3306 100644 --- a/util/mutexlock.h +++ b/util/mutexlock.h @@ -33,7 +33,7 @@ class MutexLock { void operator=(const MutexLock&); }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ diff --git a/util/options.cc b/util/options.cc index 0ea5c98..bb97838 100644 --- a/util/options.cc +++ b/util/options.cc @@ -25,4 +25,4 @@ Options::Options() } -} +} // namespace leveldb diff --git a/util/posix_logger.h b/util/posix_logger.h index 55428e5..9741b1a 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -93,6 +93,6 @@ class PosixLogger : public Logger { } }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ diff --git a/util/random.h b/util/random.h index d886b4e..0753824 100644 --- a/util/random.h +++ b/util/random.h @@ -54,6 +54,6 @@ class Random { } }; -} +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_RANDOM_H_ diff --git a/util/status.cc b/util/status.cc index 02051a9..a44f35b 100644 --- a/util/status.cc +++ b/util/status.cc @@ -72,4 +72,4 @@ std::string Status::ToString() const { } } -} +} // namespace leveldb diff --git a/util/testharness.cc b/util/testharness.cc index 6f42700..eb1bdd5 100644 --- a/util/testharness.cc +++ b/util/testharness.cc @@ -73,5 +73,5 @@ int RandomSeed() { return result; } -} -} +} // namespace test +} // namespace leveldb diff --git a/util/testharness.h b/util/testharness.h index 6f1a9c3..da4fe68 100644 --- a/util/testharness.h +++ b/util/testharness.h @@ -132,7 +132,7 @@ void TCONCAT(_Test_,name)::_Run() extern bool RegisterTest(const char* base, const char* name, void (*func)()); -} -} +} // namespace test +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_TESTHARNESS_H_ diff --git a/util/testutil.cc b/util/testutil.cc index 8d6cf3c..538d095 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -47,5 +47,5 @@ extern Slice CompressibleString(Random* rnd, double compressed_fraction, return Slice(*dst); } -} -} +} // namespace test +} // namespace leveldb diff --git a/util/testutil.h b/util/testutil.h index a150c1a..824e655 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -47,7 +47,7 @@ class ErrorEnv : public EnvWrapper { } }; -} -} +} // namespace test +} // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_TESTUTIL_H_ -- cgit v1.2.3 From ac271d8b01dd3a1b6d57b1a4a0e0d28e00f67780 Mon Sep 17 00:00:00 2001 From: "hans@chromium.org" Date: Mon, 14 Nov 2011 17:21:42 +0000 Subject: Pass system's CFLAGS, remove exit time destructor, sstable bug fix. - Pass system's values of CFLAGS, LDFLAGS. Don't override OPT if it's already set. Original patch by Alessio Treglia : http://code.google.com/p/leveldb/issues/detail?id=27#c6 - Remove 1 exit time destructor from leveldb. See http://crbug.com/101600 - Fix problem where sstable building code would pass an internal key to the user comparator. (Sync with upstream at 25436817.)
git-svn-id: http://leveldb.googlecode.com/svn/trunk@55 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 8 ++++---- db/db_test.cc | 48 ++++++++++++++++++++++++++++++++++++------------ db/dbformat.cc | 14 ++++++++------ util/comparator.cc | 7 +++++-- 4 files changed, 53 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 18386f2..8385d44 100644 --- a/Makefile +++ b/Makefile @@ -8,9 +8,9 @@ CC = g++ # Uncomment exactly one of the lines labelled (A), (B), and (C) below # to switch between compilation modes. -OPT = -O2 -DNDEBUG # (A) Production use (optimized mode) -# OPT = -g2 # (B) Debug mode, w/ full line-level debugging symbols -# OPT = -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols +OPT ?= -O2 -DNDEBUG # (A) Production use (optimized mode) +# OPT ?= -g2 # (B) Debug mode, w/ full line-level debugging symbols +# OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols #----------------------------------------------- # detect what platform we're building on @@ -38,7 +38,7 @@ endif CFLAGS = -c -I. -I./include $(PORT_CFLAGS) $(PLATFORM_CFLAGS) $(OPT) $(SNAPPY_CFLAGS) -LDFLAGS=$(PLATFORM_LDFLAGS) $(SNAPPY_LDFLAGS) $(GOOGLE_PERFTOOLS_LDFLAGS) +LDFLAGS += $(PLATFORM_LDFLAGS) $(SNAPPY_LDFLAGS) $(GOOGLE_PERFTOOLS_LDFLAGS) LIBOBJECTS = \ ./db/builder.o \ diff --git a/db/db_test.cc b/db/db_test.cc index f1cb949..5dc3b02 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1146,26 +1146,50 @@ TEST(DBTest, CustomComparator) { public: virtual const char* Name() const { return "test.NumberComparator"; } virtual int Compare(const Slice& a, const Slice& b) const { - return (strtol(a.ToString().c_str(), NULL, 0) - - strtol(b.ToString().c_str(), NULL, 0)); + return ToNumber(a) - ToNumber(b); + } + virtual void FindShortestSeparator(std::string* s, const Slice& l) const { + ToNumber(*s); // Check format + ToNumber(l); // Check format + } + virtual void FindShortSuccessor(std::string* key) const { + ToNumber(*key); // Check format + } + private: + static int ToNumber(const Slice& x) { + // Check that there are no extra characters. 
+ ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size()-1] == ']') + << EscapeString(x); + int val; + char ignored; + ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1) + << EscapeString(x); + return val; } - virtual void FindShortestSeparator(std::string* s, const Slice& l) const {} - virtual void FindShortSuccessor(std::string* key) const {} }; NumberComparator cmp; Options new_options; new_options.create_if_missing = true; new_options.comparator = &cmp; + new_options.write_buffer_size = 1000; // Compact more often DestroyAndReopen(&new_options); - ASSERT_OK(Put("10", "ten")); - ASSERT_OK(Put("0x14", "twenty")); + ASSERT_OK(Put("[10]", "ten")); + ASSERT_OK(Put("[0x14]", "twenty")); for (int i = 0; i < 2; i++) { - ASSERT_EQ("ten", Get("10")); - ASSERT_EQ("ten", Get("0xa")); - ASSERT_EQ("twenty", Get("20")); - ASSERT_EQ("twenty", Get("0x14")); - Compact("0", "9999"); - fprintf(stderr, "ss\n%s\n", DumpSSTableList().c_str()); + ASSERT_EQ("ten", Get("[10]")); + ASSERT_EQ("ten", Get("[0xa]")); + ASSERT_EQ("twenty", Get("[20]")); + ASSERT_EQ("twenty", Get("[0x14]")); + Compact("[0]", "[9999]"); + } + + for (int run = 0; run < 2; run++) { + for (int i = 0; i < 1000; i++) { + char buf[100]; + snprintf(buf, sizeof(buf), "[%d]", i*10); + ASSERT_OK(Put(buf, buf)); + } + Compact("[0]", "[1000000]"); } } diff --git a/db/dbformat.cc b/db/dbformat.cc index 4594a57..9168f99 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -73,9 +73,10 @@ void InternalKeyComparator::FindShortestSeparator( Slice user_limit = ExtractUserKey(limit); std::string tmp(user_start.data(), user_start.size()); user_comparator_->FindShortestSeparator(&tmp, user_limit); - if (user_comparator_->Compare(*start, tmp) < 0) { - // User key has become larger. Tack on the earliest possible - // number to the shortened user key. + if (tmp.size() < user_start.size() && + user_comparator_->Compare(user_start, tmp) < 0) { + // User key has become shorter physically, but larger logically. + // Tack on the earliest possible number to the shortened user key. PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); assert(this->Compare(*start, tmp) < 0); assert(this->Compare(tmp, limit) < 0); @@ -87,9 +88,10 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const { Slice user_key = ExtractUserKey(*key); std::string tmp(user_key.data(), user_key.size()); user_comparator_->FindShortSuccessor(&tmp); - if (user_comparator_->Compare(user_key, tmp) < 0) { - // User key has become larger. Tack on the earliest possible - // number to the shortened user key. + if (tmp.size() < user_key.size() && + user_comparator_->Compare(user_key, tmp) < 0) { + // User key has become shorter physically, but larger logically. + // Tack on the earliest possible number to the shortened user key. PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); assert(this->Compare(*key, tmp) < 0); key->swap(tmp); diff --git a/util/comparator.cc b/util/comparator.cc index cfb49ce..eed9d2f 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -64,10 +64,13 @@ class BytewiseComparatorImpl : public Comparator { } }; } // namespace -static const BytewiseComparatorImpl bytewise; + +// Intentionally not destroyed to prevent destructor racing +// with background threads. 
+static const Comparator* bytewise = new BytewiseComparatorImpl; const Comparator* BytewiseComparator() { - return &bytewise; + return bytewise; } } // namespace leveldb -- cgit v1.2.3 From b14d5e1b0796330467b3d1535589270b1a173cc6 Mon Sep 17 00:00:00 2001 From: "hans@chromium.org" Date: Wed, 30 Nov 2011 11:08:02 +0000 Subject: Makefile fixes for systems with $CXX other than g++. - Makefile: Use $(CXX) for compiling C++ files, don't override the environment's value of $CXX - build_detect_platform: use $CXX instead of g++. Based on bug report from Theo Schlossnagle: http://code.google.com/p/leveldb/issues/detail?id=46 (Sync with upstream at 25807040.) git-svn-id: http://leveldb.googlecode.com/svn/trunk@56 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 49 +++++++++++++++++++++++++------------------------ build_detect_platform | 10 +++++++--- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/Makefile b/Makefile index 8385d44..d337905 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,8 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. See the AUTHORS file for names of contributors. -CC = g++ +CXX ?= g++ +CC ?= gcc #----------------------------------------------- # Uncomment exactly one of the lines labelled (A), (B), and (C) below @@ -120,68 +121,68 @@ $(LIBRARY): $(LIBOBJECTS) $(AR) -rs $@ $(LIBOBJECTS) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) - $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + $(CXX) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) - $(CC) $(LDFLAGS) -lsqlite3 doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + $(CXX) $(LDFLAGS) -lsqlite3 doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) - $(CC) $(LDFLAGS) -lkyotocabinet doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + $(CXX) $(LDFLAGS) -lkyotocabinet doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@ arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ dbformat_test: db/dbformat_test.o $(LIBOBJECTS)
$(TESTHARNESS) - $(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(MEMENVLIBRARY) : helpers/memenv/memenv.o rm -f $@ $(AR) -rs $@ helpers/memenv/memenv.o memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) - $(CC) $(LDFLAGS) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ + $(CXX) $(LDFLAGS) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ ifeq ($(PLATFORM), IOS) # For iOS, create universal object files to be used on both the simulator and @@ -192,9 +193,9 @@ IOSVERSION=$(shell defaults read /Developer/Platforms/iPhoneOS.platform/version .cc.o: mkdir -p ios-x86/$(dir $@) - $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 $< -o ios-x86/$@ + $(SIMULATORROOT)/usr/bin/$(CXX) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) - $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ + $(DEVICEROOT)/usr/bin/$(CXX) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ .c.o: @@ -206,7 +207,7 @@ IOSVERSION=$(shell defaults read /Developer/Platforms/iPhoneOS.platform/version else .cc.o: - $(CC) $(CFLAGS) $< -o $@ + $(CXX) $(CFLAGS) $< -o $@ .c.o: $(CC) $(CFLAGS) $< -o $@ diff --git a/build_detect_platform b/build_detect_platform index d1804e0..5ab15ef 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -13,6 +13,10 @@ # Delete existing build_config.mk rm -f build_config.mk +if test -z "$CXX"; then + CXX=g++ +fi + # Detect OS case 
`uname -s` in Darwin) @@ -48,8 +52,8 @@ PORT_CFLAGS="-fno-builtin-memcmp" # Detect C++0x -- this determines whether we'll use port_noatomic.h # or port_posix.h by: # 1. Trying to compile with -std=c++0x and including <cstdatomic>. -# 2. If g++ returns error code, we know to use port_posix.h -g++ $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF +# 2. If $CXX returns error code, we know to use port_posix.h +$CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF #include <cstdatomic> int main() {} EOF @@ -61,7 +65,7 @@ fi # Test whether Snappy library is installed # http://code.google.com/p/snappy/ -g++ $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF +$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF #include <snappy.h> int main() {} EOF -- cgit v1.2.3 From e05bd5cade19e5de0f763f4f122eef9f35de3d9c Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Wed, 25 Jan 2012 23:07:20 +0000 Subject: fixed issues 66 (leaking files on disk error) and 68 (no sync of CURRENT file) git-svn-id: http://leveldb.googlecode.com/svn/trunk@57 62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_impl.cc | 20 ++++++------------ db/db_test.cc | 63 +++++++++++++++++++++++++++++++++++++++++++++++++------- db/filename.cc | 6 +++++- util/env.cc | 18 ++++++++++++++-- util/env_test.cc | 22 +++++++++++--------- 5 files changed, 95 insertions(+), 34 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index b4df80d..7b268ea 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -655,6 +655,8 @@ void DBImpl::BackgroundCompaction() { CompactionState* compact = new CompactionState(c); status = DoCompactionWork(compact); CleanupCompaction(compact); + c->ReleaseInputs(); + DeleteObsoleteFiles(); } delete c; @@ -672,6 +674,9 @@ void DBImpl::BackgroundCompaction() { if (is_manual) { ManualCompaction* m = manual_compaction_; + if (!status.ok()) { + m->done = true; + } if (!m->done) { // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. @@ -793,21 +798,8 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { compact->compaction->edit()->AddFile( level + 1, out.number, out.file_size, out.smallest, out.largest); - pending_outputs_.erase(out.number); } - compact->outputs.clear(); - - Status s = versions_->LogAndApply(compact->compaction->edit(), &mutex_); - if (s.ok()) { - compact->compaction->ReleaseInputs(); - DeleteObsoleteFiles(); - } else { - // Discard any files we may have created during this failed compaction - for (size_t i = 0; i < compact->outputs.size(); i++) { - env_->DeleteFile(TableFileName(dbname_, compact->outputs[i].number)); - } - } - return s; + return versions_->LogAndApply(compact->compaction->edit(), &mutex_); } Status DBImpl::DoCompactionWork(CompactionState* compact) { diff --git a/db/db_test.cc b/db/db_test.cc index 5dc3b02..8318885 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -28,8 +28,12 @@ class SpecialEnv : public EnvWrapper { // sstable Sync() calls are blocked while this pointer is non-NULL. port::AtomicPointer delay_sstable_sync_; + // Simulate no-space errors while this pointer is non-NULL.
+ port::AtomicPointer no_space_; + explicit SpecialEnv(Env* base) : EnvWrapper(base) { delay_sstable_sync_.Release_Store(NULL); + no_space_.Release_Store(NULL); } Status NewWritableFile(const std::string& f, WritableFile** r) { @@ -44,7 +48,14 @@ class SpecialEnv : public EnvWrapper { base_(base) { } ~SSTableFile() { delete base_; } - Status Append(const Slice& data) { return base_->Append(data); } + Status Append(const Slice& data) { + if (env_->no_space_.Acquire_Load() != NULL) { + // Drop writes on the floor + return Status::OK(); + } else { + return base_->Append(data); + } + } Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { @@ -239,6 +250,12 @@ class DBTest { return result; } + int CountFiles() { + std::vector<std::string> files; + env_->GetChildren(dbname_, &files); + return static_cast<int>(files.size()); + } + uint64_t Size(const Slice& start, const Slice& limit) { Range r(start, limit); uint64_t size; @@ -1266,6 +1283,37 @@ TEST(DBTest, DBOpen_Options) { db = NULL; } +// Check that number of files does not grow when we are out of space +TEST(DBTest, NoSpace) { + Options options; + options.env = env_; + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + Compact("a", "z"); + const int num_files = CountFiles(); + env_->no_space_.Release_Store(env_); // Force out-of-space errors + for (int i = 0; i < 10; i++) { + for (int level = 0; level < config::kNumLevels-1; level++) { + dbfull()->TEST_CompactRange(level, NULL, NULL); + } + } + env_->no_space_.Release_Store(NULL); + ASSERT_LT(CountFiles(), num_files + 5); +} + +TEST(DBTest, FilesDeletedAfterCompaction) { + ASSERT_OK(Put("foo", "v2")); + Compact("a", "z"); + const int num_files = CountFiles(); + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("foo", "v2")); + Compact("a", "z"); + } + ASSERT_EQ(CountFiles(), num_files); +} + // Multi-threaded test: namespace { @@ -1287,14 +1335,15 @@ struct MTThread { static void MTThreadBody(void* arg) { MTThread* t = reinterpret_cast<MTThread*>(arg); + int id = t->id; DB* db = t->state->test->db_; uintptr_t counter = 0; - fprintf(stderr, "... starting thread %d\n", t->id); - Random rnd(1000 + t->id); + fprintf(stderr, "... starting thread %d\n", id); + Random rnd(1000 + id); std::string value; char valbuf[1500]; while (t->state->stop.Acquire_Load() == NULL) { - t->state->counter[t->id].Release_Store(reinterpret_cast<void*>(counter)); + t->state->counter[id].Release_Store(reinterpret_cast<void*>(counter)); int key = rnd.Uniform(kNumKeys); char keybuf[20]; @@ -1304,7 +1353,7 @@ static void MTThreadBody(void* arg) { // Write values of the form <key, my id, counter>. // We add some padding for force compactions. snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d", - key, t->id, static_cast<int>(counter)); + key, id, static_cast<int>(counter)); ASSERT_OK(db->Put(WriteOptions(), Slice(keybuf), Slice(valbuf))); } else { // Read a value and verify that it matches the pattern written above. @@ -1325,8 +1374,8 @@ } counter++; } - t->state->thread_done[t->id].Release_Store(t); - fprintf(stderr, "... stopping thread %d after %d ops\n", t->id, int(counter)); + t->state->thread_done[id].Release_Store(t); + fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter)); } } // namespace diff --git a/db/filename.cc b/db/filename.cc index 24fd140..3c4d49f 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -11,6 +11,10 @@ namespace leveldb { +// A utility routine: write "data" to the named file and Sync() it.
+extern Status WriteStringToFileSync(Env* env, const Slice& data, + const std::string& fname); + static std::string MakeFileName(const std::string& name, uint64_t number, const char* suffix) { char buf[100]; @@ -122,7 +126,7 @@ Status SetCurrentFile(Env* env, const std::string& dbname, assert(contents.starts_with(dbname + "/")); contents.remove_prefix(dbname.size() + 1); std::string tmp = TempFileName(dbname, descriptor_number); - Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp); + Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp); if (s.ok()) { s = env->RenameFile(tmp, CurrentFileName(dbname)); } diff --git a/util/env.cc b/util/env.cc index 594811b..c2600e9 100644 --- a/util/env.cc +++ b/util/env.cc @@ -33,14 +33,18 @@ void Log(Logger* info_log, const char* format, ...) { } } -Status WriteStringToFile(Env* env, const Slice& data, - const std::string& fname) { +static Status DoWriteStringToFile(Env* env, const Slice& data, + const std::string& fname, + bool should_sync) { WritableFile* file; Status s = env->NewWritableFile(fname, &file); if (!s.ok()) { return s; } s = file->Append(data); + if (s.ok() && should_sync) { + s = file->Sync(); + } if (s.ok()) { s = file->Close(); } @@ -51,6 +55,16 @@ Status WriteStringToFile(Env* env, const Slice& data, return s; } +Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, false); +} + +Status WriteStringToFileSync(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, true); +} + Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { data->clear(); SequentialFile* file; diff --git a/util/env_test.cc b/util/env_test.cc index 3f8a8a2..b72cb44 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -22,29 +22,30 @@ class EnvPosixTest { }; static void SetBool(void* ptr) { - *(reinterpret_cast<bool*>(ptr)) = true; + reinterpret_cast<port::AtomicPointer*>(ptr)->NoBarrier_Store(ptr); } TEST(EnvPosixTest, RunImmediately) { - bool called = false; + port::AtomicPointer called (NULL); env_->Schedule(&SetBool, &called); Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_TRUE(called); + ASSERT_TRUE(called.NoBarrier_Load() != NULL); } TEST(EnvPosixTest, RunMany) { - int last_id = 0; + port::AtomicPointer last_id (NULL); struct CB { - int* last_id_ptr; // Pointer to shared slot - int id; // Order# for the execution of this callback + port::AtomicPointer* last_id_ptr; // Pointer to shared slot + uintptr_t id; // Order# for the execution of this callback - CB(int* p, int i) : last_id_ptr(p), id(i) { } + CB(port::AtomicPointer* p, int i) : last_id_ptr(p), id(i) { } static void Run(void* v) { CB* cb = reinterpret_cast<CB*>(v); - ASSERT_EQ(cb->id-1, *cb->last_id_ptr); - *cb->last_id_ptr = cb->id; + void* cur = cb->last_id_ptr->NoBarrier_Load(); + ASSERT_EQ(cb->id-1, reinterpret_cast<uintptr_t>(cur)); + cb->last_id_ptr->Release_Store(reinterpret_cast<void*>(cb->id)); } }; @@ -59,7 +60,8 @@ TEST(EnvPosixTest, RunMany) { env_->Schedule(&CB::Run, &cb4); Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_EQ(4, last_id); + void* cur = last_id.Acquire_Load(); + ASSERT_EQ(4, reinterpret_cast<uintptr_t>(cur)); } struct State { -- cgit v1.2.3 From 56addd362f70667d374aaa686f90eeae5c23d486 Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Fri, 3 Feb 2012 16:16:14 +0000 Subject: avoid very large compactions; fix build on Linux git-svn-id: http://leveldb.googlecode.com/svn/trunk@58 62dab493-f737-651d-591e-8d6aee1b9529 ---
build_detect_platform | 2 +- db/version_set.cc | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/build_detect_platform b/build_detect_platform index 5ab15ef..d8d9ba1 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -27,7 +27,7 @@ case `uname -s` in Linux) PLATFORM=OS_LINUX echo "PLATFORM_CFLAGS=-pthread -DOS_LINUX" >> build_config.mk - echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk + echo "PLATFORM_LDFLAGS=-pthread" >> build_config.mk ;; SunOS) PLATFORM=OS_SOLARIS diff --git a/db/version_set.cc b/db/version_set.cc index 7cf5197..1310aeb 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -26,6 +26,11 @@ static const int kTargetFileSize = 2 * 1048576; // stop building a single file in a level->level+1 compaction. static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize; +// Maximum number of bytes in all compacted files. We avoid expanding +// the lower level file set of a compaction if it would make the +// total compaction cover more than this many bytes. +static const int64_t kExpandedCompactionByteSizeLimit = 25 * kTargetFileSize; + static double MaxBytesForLevel(int level) { // Note: the result for level zero is not really used since we set // the level-0 compaction threshold based on number of files. @@ -1223,7 +1228,11 @@ void VersionSet::SetupOtherInputs(Compaction* c) { if (!c->inputs_[1].empty()) { std::vector<FileMetaData*> expanded0; current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0); - if (expanded0.size() > c->inputs_[0].size()) { + const int64_t inputs0_size = TotalFileSize(c->inputs_[0]); + const int64_t inputs1_size = TotalFileSize(c->inputs_[1]); + const int64_t expanded0_size = TotalFileSize(expanded0); + if (expanded0.size() > c->inputs_[0].size() && + inputs1_size + expanded0_size < kExpandedCompactionByteSizeLimit) { InternalKey new_start, new_limit; GetRange(expanded0, &new_start, &new_limit); std::vector<FileMetaData*> expanded1; @@ -1231,12 +1240,14 @@ void VersionSet::SetupOtherInputs(Compaction* c) { &expanded1); if (expanded1.size() == c->inputs_[1].size()) { Log(options_->info_log, - "Expanding@%d %d+%d to %d+%d\n", + "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n", level, int(c->inputs_[0].size()), int(c->inputs_[1].size()), + long(inputs0_size), long(inputs1_size), int(expanded0.size()), - int(expanded1.size())); + int(expanded1.size()), + long(expanded0_size), long(inputs1_size)); smallest = new_start; largest = new_limit; c->inputs_[0] = expanded0; -- cgit v1.2.3 From 3366031b7b6638f4c1d4c80a501b12e0c83a92e4 Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Mon, 5 Mar 2012 18:38:07 +0000 Subject: add .gitignore; support for building on a few BSD variants git-svn-id: http://leveldb.googlecode.com/svn/trunk@59 62dab493-f737-651d-591e-8d6aee1b9529 --- .gitignore | 5 +++++ build_detect_platform | 15 +++++++++++++++ port/port_posix.h | 14 +++++++++++--- 3 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..46769e0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +build_config.mk +*.a +*.o +*_test +db_bench diff --git a/build_detect_platform b/build_detect_platform index d8d9ba1..5f9e021 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -39,6 +39,21 @@ case `uname -s` in echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_FREEBSD" >> build_config.mk echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk ;; + NetBSD) + PLATFORM=OS_NETBSD + echo "PLATFORM_CFLAGS=-D_REENTRANT
-DOS_NETBSD" >> build_config.mk + echo "PLATFORM_LDFLAGS=-lpthread -lgcc_s" >> build_config.mk + ;; + OpenBSD) + PLATFORM=OS_OPENBSD + echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_OPENBSD" >> build_config.mk + echo "PLATFORM_LDFLAGS=-pthread" >> build_config.mk + ;; + DragonFly) + PLATFORM=OS_DRAGONFLYBSD + echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_DRAGONFLYBSD" >> build_config.mk + echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk + ;; *) echo "Unknown platform!" exit 1 diff --git a/port/port_posix.h b/port/port_posix.h index 9666391..485ad10 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -7,7 +7,7 @@ #ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ #define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ -#if defined(OS_MACOSX) || defined(OS_FREEBSD) +#if defined(OS_MACOSX) #include #elif defined(OS_SOLARIS) #include @@ -16,6 +16,10 @@ #else #define BIG_ENDIAN #endif +#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\ + defined(OS_DRAGONFLYBSD) + #include + #include #else #include #endif @@ -33,13 +37,17 @@ #define IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) #endif -#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) +#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\ + defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) +// Use fread/fwrite/fflush on platforms without _unlocked variants #define fread_unlocked fread #define fwrite_unlocked fwrite #define fflush_unlocked fflush #endif -#if defined(OS_MACOSX) || defined(OS_FREEBSD) +#if defined(OS_MACOSX) || defined(OS_FREEBSD) ||\ + defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) +// Use fsync() on platforms without fdatasync() #define fdatasync fsync #endif -- cgit v1.2.3 From 13daa9f29c999ee40a257ee0775abee2c78a0ad9 Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Fri, 9 Mar 2012 00:27:49 +0000 Subject: added group commit; drastically speeds up mult-threaded synchronous write workloads git-svn-id: http://leveldb.googlecode.com/svn/trunk@60 62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_impl.cc | 143 ++++++++++++++++++++++++++++++++++------------ db/db_impl.h | 18 +++--- db/write_batch.cc | 17 ++++-- db/write_batch_internal.h | 2 + db/write_batch_test.cc | 31 ++++++++++ 5 files changed, 162 insertions(+), 49 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 7b268ea..dde3711 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -35,6 +35,17 @@ namespace leveldb { +// Information kept for every waiting writer +struct DBImpl::Writer { + Status status; + WriteBatch* batch; + bool sync; + bool done; + port::CondVar cv; + + explicit Writer(port::Mutex* mu) : cv(mu) { } +}; + struct DBImpl::CompactionState { Compaction* const compaction; @@ -113,8 +124,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) logfile_(NULL), logfile_number_(0), log_(NULL), - logger_(NULL), - logger_cv_(&mutex_), + tmp_batch_(new WriteBatch), bg_compaction_scheduled_(false), manual_compaction_(NULL) { mem_->Ref(); @@ -144,6 +154,7 @@ DBImpl::~DBImpl() { delete versions_; if (mem_ != NULL) mem_->Unref(); if (imm_ != NULL) imm_->Unref(); + delete tmp_batch_; delete log_; delete logfile_; delete table_cache_; @@ -554,13 +565,11 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { } Status DBImpl::TEST_CompactMemTable() { - MutexLock l(&mutex_); - LoggerId self; - AcquireLoggingResponsibility(&self); - Status s = MakeRoomForWrite(true /* force compaction */); - ReleaseLoggingResponsibility(&self); + // NULL batch means just wait for 
earlier writes to be done + Status s = Write(WriteOptions(), NULL); if (s.ok()) { // Wait until the compaction completes + MutexLock l(&mutex_); while (imm_ != NULL && bg_error_.ok()) { bg_cv_.Wait(); } @@ -1094,38 +1103,35 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { return DB::Delete(options, key); } -// There is at most one thread that is the current logger. This call -// waits until preceding logger(s) have finished and becomes the -// current logger. -void DBImpl::AcquireLoggingResponsibility(LoggerId* self) { - while (logger_ != NULL) { - logger_cv_.Wait(); - } - logger_ = self; -} +Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { + Writer w(&mutex_); + w.batch = my_batch; + w.sync = options.sync; + w.done = false; -void DBImpl::ReleaseLoggingResponsibility(LoggerId* self) { - assert(logger_ == self); - logger_ = NULL; - logger_cv_.SignalAll(); -} - -Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { - Status status; MutexLock l(&mutex_); - LoggerId self; - AcquireLoggingResponsibility(&self); - status = MakeRoomForWrite(false); // May temporarily release lock and wait + writers_.push_back(&w); + while (!w.done && &w != writers_.front()) { + w.cv.Wait(); + } + if (w.done) { + return w.status; + } + + // May temporarily unlock and wait. + Status status = MakeRoomForWrite(my_batch == NULL); uint64_t last_sequence = versions_->LastSequence(); - if (status.ok()) { + Writer* last_writer = &w; + if (status.ok() && my_batch != NULL) { // NULL batch is for compactions + WriteBatch* updates = BuildBatchGroup(&last_writer); WriteBatchInternal::SetSequence(updates, last_sequence + 1); last_sequence += WriteBatchInternal::Count(updates); - // Add to log and apply to memtable. We can release the lock during - // this phase since the "logger_" flag protects against concurrent - // loggers and concurrent writes into mem_. + // Add to log and apply to memtable. We can release the lock + // during this phase since &w is currently responsible for logging + // and protects against concurrent loggers and concurrent writes + // into mem_. { - assert(logger_ == &self); mutex_.Unlock(); status = log_->AddRecord(WriteBatchInternal::Contents(updates)); if (status.ok() && options.sync) { @@ -1135,20 +1141,85 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { status = WriteBatchInternal::InsertInto(updates, mem_); } mutex_.Lock(); - assert(logger_ == &self); } + if (updates == tmp_batch_) tmp_batch_->Clear(); versions_->SetLastSequence(last_sequence); } - ReleaseLoggingResponsibility(&self); + + while (true) { + Writer* ready = writers_.front(); + writers_.pop_front(); + if (ready != &w) { + ready->status = status; + ready->done = true; + ready->cv.Signal(); + } + if (ready == last_writer) break; + } + + // Notify new head of write queue + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } + return status; } +// REQUIRES: Writer list must be non-empty +// REQUIRES: First writer must have a non-NULL batch +WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { + assert(!writers_.empty()); + Writer* first = writers_.front(); + WriteBatch* result = first->batch; + assert(result != NULL); + + size_t size = WriteBatchInternal::ByteSize(first->batch); + + // Allow the group to grow up to a maximum size, but if the + // original write is small, limit the growth so we do not slow + // down the small write too much. 
+ size_t max_size = 1 << 20; + if (size <= (128<<10)) { + max_size = size + (128<<10); + } + + *last_writer = first; + std::deque<Writer*>::iterator iter = writers_.begin(); + ++iter; // Advance past "first" + for (; iter != writers_.end(); ++iter) { + Writer* w = *iter; + if (w->sync && !first->sync) { + // Do not include a sync write into a batch handled by a non-sync write. + break; + } + + if (w->batch != NULL) { + size += WriteBatchInternal::ByteSize(w->batch); + if (size > max_size) { + // Do not make batch too big + break; + } + + // Append to *result + if (result == first->batch) { + // Switch to temporary batch instead of disturbing caller's batch + result = tmp_batch_; + assert(WriteBatchInternal::Count(result) == 0); + WriteBatchInternal::Append(result, first->batch); + } + WriteBatchInternal::Append(result, w->batch); + } + *last_writer = w; + } + return result; +} + // REQUIRES: mutex_ is held -// REQUIRES: this thread is the current logger +// REQUIRES: this thread is currently at the front of the writer queue Status DBImpl::MakeRoomForWrite(bool force) { mutex_.AssertHeld(); - assert(logger_ != NULL); + assert(!writers_.empty()); bool allow_delay = !force; Status s; while (true) { diff --git a/db/db_impl.h b/db/db_impl.h index fc40d1e..e665c0e 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -5,6 +5,7 @@ #ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_ #define STORAGE_LEVELDB_DB_DB_IMPL_H_ +#include <deque> #include <set> #include "db/dbformat.h" #include "db/log_writer.h" @@ -59,6 +60,8 @@ class DBImpl : public DB { private: friend class DB; + struct CompactionState; + struct Writer; Iterator* NewInternalIterator(const ReadOptions&, SequenceNumber* latest_snapshot); @@ -85,14 +88,8 @@ class DBImpl : public DB { Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base); - // Only thread is allowed to log at a time. - struct LoggerId { }; // Opaque identifier for logging thread - void AcquireLoggingResponsibility(LoggerId* self); - void ReleaseLoggingResponsibility(LoggerId* self); - Status MakeRoomForWrite(bool force /* compact even if there is room? */); - - struct CompactionState; + WriteBatch* BuildBatchGroup(Writer** last_writer); void MaybeScheduleCompaction(); static void BGWork(void* db); @@ -129,8 +126,11 @@ class DBImpl : public DB { WritableFile* logfile_; uint64_t logfile_number_; log::Writer* log_; - LoggerId* logger_; // NULL, or the id of the current logging thread - port::CondVar logger_cv_; // For threads waiting to log + + // Queue of writers. + std::deque<Writer*> writers_; + WriteBatch* tmp_batch_; + SnapshotList snapshots_; // Set of table files to protect from deletion because they are diff --git a/db/write_batch.cc b/db/write_batch.cc index a0e812f..33f4a42 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -23,6 +23,9 @@ namespace leveldb { +// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+static const size_t kHeader = 12; + WriteBatch::WriteBatch() { Clear(); } @@ -33,16 +36,16 @@ WriteBatch::Handler::~Handler() { } void WriteBatch::Clear() { rep_.clear(); - rep_.resize(12); + rep_.resize(kHeader); } Status WriteBatch::Iterate(Handler* handler) const { Slice input(rep_); - if (input.size() < 12) { + if (input.size() < kHeader) { return Status::Corruption("malformed WriteBatch (too small)"); } - input.remove_prefix(12); + input.remove_prefix(kHeader); Slice key, value; int found = 0; while (!input.empty()) { @@ -131,8 +134,14 @@ Status WriteBatchInternal::InsertInto(const WriteBatch* b, } void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { - assert(contents.size() >= 12); + assert(contents.size() >= kHeader); b->rep_.assign(contents.data(), contents.size()); } +void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) { + SetCount(dst, Count(dst) + Count(src)); + assert(src->rep_.size() >= kHeader); + dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader); +} + } // namespace leveldb diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 49aeb84..4423a7f 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -39,6 +39,8 @@ class WriteBatchInternal { static void SetContents(WriteBatch* batch, const Slice& contents); static Status InsertInto(const WriteBatch* batch, MemTable* memtable); + + static void Append(WriteBatch* dst, const WriteBatch* src); }; } // namespace leveldb diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 1ee6d7b..9064e3d 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -18,6 +18,7 @@ static std::string PrintContents(WriteBatch* b) { mem->Ref(); std::string state; Status s = WriteBatchInternal::InsertInto(b, mem); + int count = 0; Iterator* iter = mem->NewIterator(); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; @@ -29,11 +30,13 @@ static std::string PrintContents(WriteBatch* b) { state.append(", "); state.append(iter->value().ToString()); state.append(")"); + count++; break; case kTypeDeletion: state.append("Delete("); state.append(ikey.user_key.ToString()); state.append(")"); + count++; break; } state.append("@"); @@ -42,6 +45,8 @@ static std::string PrintContents(WriteBatch* b) { delete iter; if (!s.ok()) { state.append("ParseError()"); + } else if (count != WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); } mem->Unref(); return state; @@ -82,6 +87,32 @@ TEST(WriteBatchTest, Corruption) { PrintContents(&batch)); } +TEST(WriteBatchTest, Append) { + WriteBatch b1, b2; + WriteBatchInternal::SetSequence(&b1, 200); + WriteBatchInternal::SetSequence(&b2, 300); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("", + PrintContents(&b1)); + b2.Put("a", "va"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200", + PrintContents(&b1)); + b2.Clear(); + b2.Put("b", "vb"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200" + "Put(b, vb)@201", + PrintContents(&b1)); + b2.Delete("foo"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + "Delete(foo)@203", + PrintContents(&b1)); +} + } // namespace leveldb int main(int argc, char** argv) { -- cgit v1.2.3 From f168d0177b095ac7a608f6aafb9efc96976b6b3c Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Fri, 9 Mar 2012 15:53:05 +0000 Subject: fix LOCK file deletion to prevent crash on windows git-svn-id: http://leveldb.googlecode.com/svn/trunk@61 
62dab493-f737-651d-591e-8d6aee1b9529 --- db/db_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index dde3711..88d17e7 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1427,7 +1427,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) { FileType type; for (size_t i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type) && - filenames[i] != lockname) { // Lock file will be deleted at end + type != kDBLockFile) { // Lock file will be deleted at end Status del = env->DeleteFile(dbname + "/" + filenames[i]); if (result.ok() && !del.ok()) { result = del; -- cgit v1.2.3 From e83668fa64e932a64712c99398be0acfe75367af Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Thu, 15 Mar 2012 16:24:26 +0000 Subject: use mmap on 64-bit machines to speed-up reads; small build fixes git-svn-id: http://leveldb.googlecode.com/svn/trunk@62 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 36 ++++++++++++++++++------------------ table/block.cc | 9 ++++++--- table/block.h | 6 ++++-- table/format.cc | 19 +++++++++++++------ table/format.h | 5 ++++- table/table.cc | 11 +++++++---- table/table_test.cc | 2 +- util/env_posix.cc | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 8 files changed, 99 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index d337905..c284d4c 100644 --- a/Makefile +++ b/Makefile @@ -124,65 +124,65 @@ db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) $(LDFLAGS) -lsqlite3 doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + $(CXX) -lsqlite3 doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(LDFLAGS db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) $(LDFLAGS) -lkyotocabinet doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@ arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) 
db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ + $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) $(MEMENVLIBRARY) : helpers/memenv/memenv.o rm -f $@ $(AR) -rs $@ helpers/memenv/memenv.o memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) - $(CXX) $(LDFLAGS) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ + $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS) ifeq ($(PLATFORM), IOS) # For iOS, create universal object files to be used on both the simulator and diff --git a/table/block.cc b/table/block.cc index 40aa318..06eb6f8 100644 --- a/table/block.cc +++ b/table/block.cc @@ -19,9 +19,10 @@ inline uint32_t Block::NumRestarts() const { return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } -Block::Block(const char* data, size_t size) +Block::Block(const char* data, size_t size, bool take_ownership) : data_(data), - size_(size) { + size_(size), + owned_(take_ownership) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { @@ -35,7 +36,9 @@ Block::Block(const char* data, size_t size) } Block::~Block() { - delete[] data_; + if (owned_) { + delete[] data_; + } } // Helper routine: decode the next block entry starting at "p", diff --git a/table/block.h b/table/block.h index 9eb6f02..76088a4 100644 --- a/table/block.h +++ b/table/block.h @@ -16,8 +16,9 @@ class Comparator; class Block { public: // Initialize the block with the specified contents. - // Takes ownership of data[] and will delete[] it when done. - Block(const char* data, size_t size); + // Takes ownership of data[] and will delete[] it when done iff + // "take_ownership is true. 
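
(Aside: the optional-ownership constructor being added here lets one Block type wrap either a heap buffer it must free or a pointer into a longer-lived region such as an mmap'd file. The pattern in isolation, with an illustrative class that is not part of LevelDB:)

    #include <cstddef>

    class ByteView {
     public:
      ByteView(const char* data, size_t size, bool take_ownership)
          : data_(data), size_(size), owned_(take_ownership) { }
      ~ByteView() { if (owned_) delete[] data_; }  // free only what we own
     private:
      const char* data_;
      size_t size_;
      bool owned_;
      // No copying allowed (a shallow copy would double-delete).
      ByteView(const ByteView&);
      void operator=(const ByteView&);
    };
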
+ Block(const char* data, size_t size, bool take_ownership); ~Block(); @@ -30,6 +31,7 @@ class Block { const char* data_; size_t size_; uint32_t restart_offset_; // Offset in data_ of restart array + bool owned_; // Block owns data_[] // No copying allowed Block(const Block&); diff --git a/table/format.cc b/table/format.cc index 23728d8..25b85a2 100644 --- a/table/format.cc +++ b/table/format.cc @@ -66,8 +66,10 @@ Status Footer::DecodeFrom(Slice* input) { Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, - Block** block) { + Block** block, + bool* may_cache) { *block = NULL; + *may_cache = false; // Read the block contents as well as the type/crc footer. // See table_builder.cc for the code that built this structure. @@ -100,8 +102,14 @@ Status ReadBlock(RandomAccessFile* file, case kNoCompression: if (data != buf) { // File implementation gave us pointer to some other data. - // Copy into buf[]. - memcpy(buf, data, n + kBlockTrailerSize); + // Use it directly under the assumption that it will be live + // while the file is open. + delete[] buf; + *block = new Block(data, n, false /* do not take ownership */); + *may_cache = false; // Do not double-cache + } else { + *block = new Block(buf, n, true /* take ownership */); + *may_cache = true; } // Ok @@ -119,8 +127,8 @@ Status ReadBlock(RandomAccessFile* file, return Status::Corruption("corrupted compressed block contents"); } delete[] buf; - buf = ubuf; - n = ulength; + *block = new Block(ubuf, ulength, true /* take ownership */); + *may_cache = true; break; } default: @@ -128,7 +136,6 @@ Status ReadBlock(RandomAccessFile* file, return Status::Corruption("bad block type"); } - *block = new Block(buf, n); // Block takes ownership of buf[] return Status::OK(); } diff --git a/table/format.h b/table/format.h index 2a3e1ac..66a15da 100644 --- a/table/format.h +++ b/table/format.h @@ -86,10 +86,13 @@ static const size_t kBlockTrailerSize = 5; // Read the block identified by "handle" from "file". On success, // store a pointer to the heap-allocated result in *block and return // OK. On failure store NULL in *block and return non-OK. +// On success, stores true in *may_cache if the result may be +// cached, false if it must not be cached. extern Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, - Block** block); + Block** block, + bool* may_cache); // Implementation details follow. Clients should ignore, diff --git a/table/table.cc b/table/table.cc index 5f9238e..07dcffd 100644 --- a/table/table.cc +++ b/table/table.cc @@ -49,7 +49,9 @@ Status Table::Open(const Options& options, // Read the index block Block* index_block = NULL; if (s.ok()) { - s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block); + bool may_cache; // Ignored result + s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block, + &may_cache); } if (s.ok()) { @@ -105,6 +107,7 @@ Iterator* Table::BlockReader(void* arg, // can add more features in the future. 
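
(Aside: for orientation in the BlockReader code below, the block-cache key is a fixed 16 bytes: the table's cache id, then the block's offset in the file, each as a fixed64. A standalone sketch, with EncodeFixed64 reduced to its little-endian-host special case; leveldb's real helper in util/coding handles byte order explicitly:)

    #include <stdint.h>
    #include <string.h>

    static void EncodeFixed64(char* dst, uint64_t value) {
      memcpy(dst, &value, 8);   // little-endian host assumed
    }

    // 16-byte block cache key: [cache id][block offset]
    static void MakeBlockCacheKey(uint64_t cache_id, uint64_t offset,
                                  char key[16]) {
      EncodeFixed64(key, cache_id);
      EncodeFixed64(key + 8, offset);
    }
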
if (s.ok()) { + bool may_cache; if (block_cache != NULL) { char cache_key_buffer[16]; EncodeFixed64(cache_key_buffer, table->rep_->cache_id); @@ -114,14 +117,14 @@ Iterator* Table::BlockReader(void* arg, if (cache_handle != NULL) { block = reinterpret_cast<Block*>(block_cache->Value(cache_handle)); } else { - s = ReadBlock(table->rep_->file, options, handle, &block); - if (s.ok() && options.fill_cache) { + s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache); + if (s.ok() && may_cache && options.fill_cache) { cache_handle = block_cache->Insert( key, block, block->size(), &DeleteCachedBlock); } } } else { - s = ReadBlock(table->rep_->file, options, handle, &block); + s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache); } } diff --git a/table/table_test.cc b/table/table_test.cc index cd85b4b..0c8e676 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -205,7 +205,7 @@ class BlockConstructor: public Constructor { block_size_ = block_data.size(); char* block_data_copy = new char[block_size_]; memcpy(block_data_copy, block_data.data(), block_size_); - block_ = new Block(block_data_copy, block_size_); + block_ = new Block(block_data_copy, block_size_, true /* take ownership */); return Status::OK(); } virtual size_t NumBytes() const { return block_size_; } diff --git a/util/env_posix.cc b/util/env_posix.cc index cc73348..cb1f6fc 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -66,6 +66,7 @@ class PosixSequentialFile: public SequentialFile { } }; +// pread() based random-access class PosixRandomAccessFile: public RandomAccessFile { private: std::string filename_; @@ -89,6 +90,32 @@ class PosixRandomAccessFile: public RandomAccessFile { } }; +// mmap() based random-access +class PosixMmapReadableFile: public RandomAccessFile { + private: + std::string filename_; + void* mmapped_region_; + size_t length_; + + public: + // base[0,length-1] contains the mmapped contents of the file. + PosixMmapReadableFile(const std::string& fname, void* base, size_t length) + : filename_(fname), mmapped_region_(base), length_(length) { } + virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + if (offset + n > length_) { + *result = Slice(); + s = IOError(filename_, EINVAL); + } else { + *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n); + } + return s; + } +}; + // We preallocate up to an extra megabyte and use memcpy to append new // data to the file. This is safe since we either properly close the // file before reading from it, or for log files, the reading code @@ -297,13 +324,28 @@ class PosixEnv : public Env { virtual Status NewRandomAccessFile(const std::string& fname, RandomAccessFile** result) { + *result = NULL; + Status s; int fd = open(fname.c_str(), O_RDONLY); if (fd < 0) { - *result = NULL; - return IOError(fname, errno); + s = IOError(fname, errno); + } else if (sizeof(void*) >= 8) { + // Use mmap when virtual address-space is plentiful.
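
(Aside: the branch that follows maps the whole file and then closes the descriptor, since an established mapping stays valid without it. A reduced sketch of that path; it sizes the file with fstat where the real code asks the Env for the size, and it skips the zero-length special case:)

    #include <stddef.h>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    // Map an entire file read-only; returns NULL on failure.
    static void* MapWholeFile(const char* fname, size_t* length) {
      int fd = open(fname, O_RDONLY);
      if (fd < 0) return NULL;
      struct stat st;
      if (fstat(fd, &st) != 0) { close(fd); return NULL; }
      *length = st.st_size;
      void* base = mmap(NULL, *length, PROT_READ, MAP_SHARED, fd, 0);
      close(fd);  // the mapping survives the close
      return (base == MAP_FAILED) ? NULL : base;
    }
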
+ uint64_t size; + s = GetFileSize(fname, &size); + if (s.ok()) { + void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (base != MAP_FAILED) { + *result = new PosixMmapReadableFile(fname, base, size); + } else { + s = IOError(fname, errno); + } + } + close(fd); + } else { + *result = new PosixRandomAccessFile(fname, fd); } - *result = new PosixRandomAccessFile(fname, fd); - return Status::OK(); + return s; } virtual Status NewWritableFile(const std::string& fname, -- cgit v1.2.3 From d44dcce3d3b9024df4e4391ed690ac138595d641 Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Wed, 21 Mar 2012 17:32:19 +0000 Subject: Build fixes and cleanups git-svn-id: http://leveldb.googlecode.com/svn/trunk@63 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 79 +++++---------------------- build_detect_platform | 146 ++++++++++++++++++++++++++++++++++---------------- port/port_android.h | 2 +- 3 files changed, 114 insertions(+), 113 deletions(-) diff --git a/Makefile b/Makefile index c284d4c..2858c6e 100644 --- a/Makefile +++ b/Makefile @@ -16,67 +16,16 @@ OPT ?= -O2 -DNDEBUG # (A) Production use (optimized mode) # detect what platform we're building on $(shell sh ./build_detect_platform) -# this file is generated by build_detect_platform to set build flags +# this file is generated by build_detect_platform to set build flags and sources include build_config.mk -# If Snappy is installed, add compilation and linker flags -# (see http://code.google.com/p/snappy/) -ifeq ($(SNAPPY), 1) -SNAPPY_CFLAGS=-DSNAPPY -SNAPPY_LDFLAGS=-lsnappy -else -SNAPPY_CFLAGS= -SNAPPY_LDFLAGS= -endif +CFLAGS += -c -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) +CXXFLAGS += -c -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -# If Google Perf Tools are installed, add compilation and linker flags -# (see http://code.google.com/p/google-perftools/) -ifeq ($(GOOGLE_PERFTOOLS), 1) -GOOGLE_PERFTOOLS_LDFLAGS=-ltcmalloc -else -GOOGLE_PERFTOOLS_LDFLAGS= -endif +LDFLAGS += $(PLATFORM_LDFLAGS) -CFLAGS = -c -I. 
-I./include $(PORT_CFLAGS) $(PLATFORM_CFLAGS) $(OPT) $(SNAPPY_CFLAGS) - -LDFLAGS += $(PLATFORM_LDFLAGS) $(SNAPPY_LDFLAGS) $(GOOGLE_PERFTOOLS_LDFLAGS) - -LIBOBJECTS = \ - ./db/builder.o \ - ./db/c.o \ - ./db/db_impl.o \ - ./db/db_iter.o \ - ./db/filename.o \ - ./db/dbformat.o \ - ./db/log_reader.o \ - ./db/log_writer.o \ - ./db/memtable.o \ - ./db/repair.o \ - ./db/table_cache.o \ - ./db/version_edit.o \ - ./db/version_set.o \ - ./db/write_batch.o \ - ./port/port_posix.o \ - ./table/block.o \ - ./table/block_builder.o \ - ./table/format.o \ - ./table/iterator.o \ - ./table/merger.o \ - ./table/table.o \ - ./table/table_builder.o \ - ./table/two_level_iterator.o \ - ./util/arena.o \ - ./util/cache.o \ - ./util/coding.o \ - ./util/comparator.o \ - ./util/crc32c.o \ - ./util/env.o \ - ./util/env_posix.o \ - ./util/hash.o \ - ./util/histogram.o \ - ./util/logging.o \ - ./util/options.o \ - ./util/status.o +LIBOBJECTS = $(SOURCES:.cc=.o) +MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o) TESTUTIL = ./util/testutil.o TESTHARNESS = ./util/testharness.o $(TESTUTIL) @@ -121,13 +70,13 @@ $(LIBRARY): $(LIBOBJECTS) $(AR) -rs $@ $(LIBOBJECTS) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(LDFLAGS) db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) -lsqlite3 doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(LDFLAGS + $(CXX) doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(LDFLAGS) -lsqlite3 db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) $(LDFLAGS) -lkyotocabinet doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@ + $(CXX) doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(LDFLAGS) -lkyotocabinet arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) @@ -177,9 +126,9 @@ version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) -$(MEMENVLIBRARY) : helpers/memenv/memenv.o +$(MEMENVLIBRARY) : $(MEMENVOBJECTS) rm -f $@ - $(AR) -rs $@ helpers/memenv/memenv.o + $(AR) -rs $@ $(MEMENVOBJECTS) memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS) @@ -193,9 +142,9 @@ IOSVERSION=$(shell defaults read /Developer/Platforms/iPhoneOS.platform/version .cc.o: mkdir -p ios-x86/$(dir $@) - $(SIMULATORROOT)/usr/bin/$(CXX) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 $< -o ios-x86/$@ + $(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) - $(DEVICEROOT)/usr/bin/$(CXX) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ + $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ .c.o: @@ -207,7 +156,7 @@ IOSVERSION=$(shell defaults read /Developer/Platforms/iPhoneOS.platform/version else .cc.o: - $(CXX) $(CFLAGS) $< -o $@ + $(CXX) $(CXXFLAGS) $< -o $@ .c.o: $(CC) $(CFLAGS) $< -o $@ diff --git a/build_detect_platform 
b/build_detect_platform index 5f9e021..df85264 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -1,93 +1,145 @@ #!/bin/sh - +# # Detects OS we're compiling on and generates build_config.mk, # which in turn gets read while processing Makefile. - +# # build_config.mk will set the following variables: -# - PORT_CFLAGS will either set: -# -DLEVELDB_PLATFORM_POSIX if cstatomic is present -# -DLEVELDB_PLATFORM_NOATOMIC if it is not -# - PLATFORM_CFLAGS with compiler flags for the platform -# - PLATFORM_LDFLAGS with linker flags for the platform +# PLATFORM_LDFLAGS Linker flags +# PLATFORM_CCFLAGS C compiler flags +# PLATFORM_CXXFLAGS C++ compiler flags. Will contain: +# -DLEVELDB_PLATFORM_POSIX if cstdatomic is present +# -DLEVELDB_PLATFORM_NOATOMIC if it is not + +SCRIPT_DIR=`dirname $0` # Delete existing build_config.mk rm -f build_config.mk +touch build_config.mk if test -z "$CXX"; then CXX=g++ fi # Detect OS -case `uname -s` in +if test -z "$TARGET_OS"; then + TARGET_OS=`uname -s` +fi + +COMMON_FLAGS= +PLATFORM_CCFLAGS= +PLATFORM_CXXFLAGS= +PLATFORM_LDFLAGS= + +# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp +case "$TARGET_OS" in Darwin) PLATFORM=OS_MACOSX - echo "PLATFORM_CFLAGS=-DOS_MACOSX" >> build_config.mk - echo "PLATFORM_LDFLAGS=" >> build_config.mk + COMMON_FLAGS="-fno-builtin-memcmp -DOS_MACOSX" + PORT_FILE=port/port_posix.cc ;; Linux) PLATFORM=OS_LINUX - echo "PLATFORM_CFLAGS=-pthread -DOS_LINUX" >> build_config.mk - echo "PLATFORM_LDFLAGS=-pthread" >> build_config.mk + COMMON_FLAGS="-fno-builtin-memcmp -pthread -DOS_LINUX" + PLATFORM_LDFLAGS="-pthread" + PORT_FILE=port/port_posix.cc ;; SunOS) PLATFORM=OS_SOLARIS - echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_SOLARIS" >> build_config.mk - echo "PLATFORM_LDFLAGS=-lpthread -lrt" >> build_config.mk + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS" + PLATFORM_LDFLAGS="-lpthread -lrt" + PORT_FILE=port/port_posix.cc ;; FreeBSD) PLATFORM=OS_FREEBSD - echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_FREEBSD" >> build_config.mk - echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD" + PLATFORM_LDFLAGS="-lpthread" + PORT_FILE=port/port_posix.cc ;; NetBSD) PLATFORM=OS_NETBSD - echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_NETBSD" >> build_config.mk - echo "PLATFORM_LDFLAGS=-lpthread -lgcc_s" >> build_config.mk + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" + PLATFORM_LDFLAGS="-lpthread -lgcc_s" + PORT_FILE=port/port_posix.cc ;; OpenBSD) PLATFORM=OS_OPENBSD - echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_OPENBSD" >> build_config.mk - echo "PLATFORM_LDFLAGS=-pthread" >> build_config.mk + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD" + PLATFORM_LDFLAGS="-pthread" + PORT_FILE=port/port_posix.cc ;; DragonFly) PLATFORM=OS_DRAGONFLYBSD - echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_DRAGONFLYBSD" >> build_config.mk - echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD" + PLATFORM_LDFLAGS="-lpthread" + PORT_FILE=port/port_posix.cc + ;; + OS_ANDROID_CROSSCOMPILE) + PLATFORM="$TARGET_OS" + COMMON_FLAGS="" + PLATFORM_LDFLAGS="" + PORT_FILE=port/port_android.cc ;; *) echo "Unknown platform!" exit 1 esac -echo "PLATFORM=$PLATFORM" >> build_config.mk +# We want to make a list of all cc files within util, db, table, and helpers +# except for the test and benchmark files. 
By default, find will output a list +# of all files matching either rule, so we need to append -print to make the +# prune take effect. +DIRS="$SCRIPT_DIR/util $SCRIPT_DIR/db $SCRIPT_DIR/table" +set -f # temporarily disable globbing so that our patterns aren't expanded +PRUNE_TEST="-name *test*.cc -prune" +PRUNE_BENCH="-name *_bench.cc -prune" +PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "` +set +f # re-enable globbing -# On GCC, use libc's memcmp, not GCC's memcmp -PORT_CFLAGS="-fno-builtin-memcmp" +# The sources consist of the portable files, plus the platform-specific port +# file. +echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> build_config.mk +echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> build_config.mk -# Detect C++0x -- this determines whether we'll use port_noatomic.h -# or port_posix.h by: -# 1. Rrying to compile with -std=c++0x and including <cstdatomic>. -# 2. If $CXX returns error code, we know to use port_posix.h -$CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF - #include <cstdatomic> - int main() {} -EOF -if [ "$?" = 0 ]; then - PORT_CFLAGS="$PORT_CFLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT -std=c++0x" +if [ "$PLATFORM" = "OS_ANDROID_CROSSCOMPILE" ]; then + # Cross-compiling; do not try any compilation tests. + true else - PORT_CFLAGS="$PORT_CFLAGS -DLEVELDB_PLATFORM_POSIX" -fi + # If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h. + $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <cstdatomic> + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT" + PLATFORM_CXXFLAGS="-std=c++0x" + else + COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX" + fi -# Test whether Snappy library is installed -# http://code.google.com/p/snappy/ -$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF - #include <snappy.h> - int main() {} + # Test whether Snappy library is installed + # http://code.google.com/p/snappy/ + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <snappy.h> + int main() {} EOF -if [ "$?" = 0 ]; then - echo "SNAPPY=1" >> build_config.mk -else - echo "SNAPPY=0" >> build_config.mk + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy" + fi + + # Test whether tcmalloc is available + $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF + int main() {} +EOF + if [ "$?" = 0 ]; then + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc" + fi +fi -echo "PORT_CFLAGS=$PORT_CFLAGS" >> build_config.mk +PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" +PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" + +echo "PLATFORM=$PLATFORM" >> build_config.mk +echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> build_config.mk +echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> build_config.mk +echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> build_config.mk diff --git a/port/port_android.h b/port/port_android.h index 64cdcbf..92f0090 100644 --- a/port/port_android.h +++ b/port/port_android.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From bf10812927b9a05a7b671eb32504eaa5972725d7 Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Fri, 30 Mar 2012 20:18:06 +0000 Subject: build shared libraries; updated version to 1.3; add Status accessors git-svn-id: http://leveldb.googlecode.com/svn/trunk@64 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 44 ++++++++++++++++++++++++++++++------------ build_detect_platform | 11 +++++++++++ db/skiplist.h | 3 ++- include/leveldb/db.h | 3 ++- include/leveldb/status.h | 6 ++++++ 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 2858c6e..354654d 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,10 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. See the AUTHORS file for names of contributors. +# Inherit some settings from environment variables, if available CXX ?= g++ CC ?= gcc +INSTALL_PATH ?= $(CURDIR) #----------------------------------------------- # Uncomment exactly one of the lines labelled (A), (B), and (C) below @@ -19,8 +21,8 @@ $(shell sh ./build_detect_platform) # this file is generated by build_detect_platform to set build flags and sources include build_config.mk -CFLAGS += -c -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) -CXXFLAGS += -c -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) +CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) +CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) LDFLAGS += $(PLATFORM_LDFLAGS) @@ -55,15 +57,33 @@ BENCHMARKS = db_bench_sqlite3 db_bench_tree_db LIBRARY = libleveldb.a MEMENVLIBRARY = libmemenv.a -all: $(LIBRARY) +default: all + +# Should we build shared libraries? +ifneq ($(PLATFORM_SHARED_EXT),) +# Update db.h if you change these.
+SHARED_MAJOR = 1 +SHARED_MINOR = 3 +SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) +SHARED2 = $(SHARED1).$(SHARED_MAJOR) +SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) +SHARED = $(SHARED1) $(SHARED2) $(SHARED3) +$(SHARED3): + $(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(INSTALL_PATH)/$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3) +$(SHARED2): $(SHARED3) + ln -fs $(SHARED3) $(SHARED2) +$(SHARED1): $(SHARED3) + ln -fs $(SHARED3) $(SHARED1) +endif + +all: $(SHARED) $(LIBRARY) -check: $(PROGRAMS) $(TESTS) +check: all $(PROGRAMS) $(TESTS) for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done clean: - -rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(MEMENVLIBRARY) */*.o */*/*.o ios-x86/*/*.o ios-arm/*/*.o + -rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) */*.o */*/*.o ios-x86/*/*.o ios-arm/*/*.o build_config.mk -rm -rf ios-x86/* ios-arm/* - -rm build_config.mk $(LIBRARY): $(LIBOBJECTS) rm -f $@ @@ -142,22 +162,22 @@ IOSVERSION=$(shell defaults read /Developer/Platforms/iPhoneOS.platform/version .cc.o: mkdir -p ios-x86/$(dir $@) - $(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 $< -o ios-x86/$@ + $(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) - $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ + $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ .c.o: mkdir -p ios-x86/$(dir $@) - $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 $< -o ios-x86/$@ + $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) - $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 $< -o ios-arm/$@ + $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ else .cc.o: - $(CXX) $(CXXFLAGS) $< -o $@ + $(CXX) $(CXXFLAGS) -c $< -o $@ .c.o: - $(CC) $(CFLAGS) $< -o $@ + $(CC) $(CFLAGS) -c $< -o $@ endif diff --git a/build_detect_platform b/build_detect_platform index df85264..64fcaef 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -5,6 +5,9 @@ # # build_config.mk will set the following variables: # PLATFORM_LDFLAGS Linker flags +# PLATFORM_SHARED_EXT Extension for shared libraries +# PLATFORM_SHARED_LDFLAGS Flags for building shared library +# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library # PLATFORM_CCFLAGS C compiler flags # PLATFORM_CXXFLAGS C++ compiler flags. 
Will contain: # -DLEVELDB_PLATFORM_POSIX if cstdatomic is present @@ -29,12 +32,17 @@ COMMON_FLAGS= PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS= PLATFORM_LDFLAGS= +PLATFORM_SHARED_EXT="so" +PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," +PLATFORM_SHARED_CFLAGS="-fPIC" # On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp case "$TARGET_OS" in Darwin) PLATFORM=OS_MACOSX COMMON_FLAGS="-fno-builtin-memcmp -DOS_MACOSX" + PLATFORM_SHARED_EXT=dylib + PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " PORT_FILE=port/port_posix.cc ;; Linux) @@ -143,3 +151,6 @@ echo "PLATFORM=$PLATFORM" >> build_config.mk echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> build_config.mk echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> build_config.mk echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> build_config.mk +echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> build_config.mk +echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> build_config.mk +echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> build_config.mk diff --git a/db/skiplist.h b/db/skiplist.h index 0481575..af85be6 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -105,7 +105,8 @@ class SkipList { port::AtomicPointer max_height_; // Height of the entire list inline int GetMaxHeight() const { - return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()); + return static_cast<int>( + reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load())); } // Read/written only by Insert(). diff --git a/include/leveldb/db.h b/include/leveldb/db.h index c1182b7..c7d5167 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -12,8 +12,9 @@ namespace leveldb { +// Update Makefile if you change these static const int kMajorVersion = 1; -static const int kMinorVersion = 2; +static const int kMinorVersion = 3; struct Options; struct ReadOptions; diff --git a/include/leveldb/status.h b/include/leveldb/status.h index 3355fac..11dbd4b 100644 --- a/include/leveldb/status.h +++ b/include/leveldb/status.h @@ -54,6 +54,12 @@ class Status { // Returns true iff the status indicates a NotFound error. bool IsNotFound() const { return code() == kNotFound; } + // Returns true iff the status indicates a Corruption error. + bool IsCorruption() const { return code() == kCorruption; } + + // Returns true iff the status indicates an IOError. + bool IsIOError() const { return code() == kIOError; } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success.
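
(Aside: a usage sketch for the two predicates added above; IsCorruption and IsIOError join the existing IsNotFound, so callers can branch on error kind instead of parsing ToString() output:)

    #include "leveldb/status.h"

    void ReactTo(const leveldb::Status& s) {
      if (s.ok()) return;
      if (s.IsNotFound()) {
        // absent key: often not an error at all
      } else if (s.IsCorruption()) {
        // persistent damage: consider the repair path
      } else if (s.IsIOError()) {
        // possibly transient: a retry may succeed
      }
    }
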
std::string ToString() const; -- cgit v1.2.3 From 99a7585544fc162a5f8dd39a6add00776a981efe Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Tue, 17 Apr 2012 15:48:25 +0000 Subject: Added bloom filter support git-svn-id: http://leveldb.googlecode.com/svn/trunk@65 62dab493-f737-651d-591e-8d6aee1b9529 --- .gitignore | 3 + Makefile | 14 +- build_detect_platform | 38 +- db/c.cc | 110 ++++ db/c_test.c | 77 +++ db/db_bench.cc | 99 +++- db/db_impl.cc | 12 +- db/db_impl.h | 2 + db/db_test.cc | 1050 +++++++++++++++++++++++---------------- db/dbformat.cc | 20 + db/dbformat.h | 12 + db/repair.cc | 4 +- db/table_cache.cc | 58 ++- db/table_cache.h | 11 + db/version_set.cc | 83 ++-- doc/index.html | 63 +++ doc/table_format.txt | 41 ++ include/leveldb/c.h | 29 ++ include/leveldb/db.h | 2 +- include/leveldb/filter_policy.h | 70 +++ include/leveldb/options.h | 8 + include/leveldb/table.h | 15 + include/leveldb/table_builder.h | 1 + port/port_android.h | 3 + table/block.cc | 9 +- table/block.h | 5 +- table/filter_block.cc | 111 +++++ table/filter_block.h | 68 +++ table/filter_block_test.cc | 128 +++++ table/format.cc | 23 +- table/format.h | 16 +- table/table.cc | 116 ++++- table/table_builder.cc | 55 +- table/table_test.cc | 32 +- util/bloom.cc | 95 ++++ util/bloom_test.cc | 159 ++++++ util/filter_policy.cc | 11 + util/options.cc | 3 +- 38 files changed, 2078 insertions(+), 578 deletions(-) create mode 100644 include/leveldb/filter_policy.h create mode 100644 table/filter_block.cc create mode 100644 table/filter_block.h create mode 100644 table/filter_block_test.cc create mode 100644 util/bloom.cc create mode 100644 util/bloom_test.cc create mode 100644 util/filter_policy.cc diff --git a/.gitignore b/.gitignore index 46769e0..f030430 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ build_config.mk *.a *.o +*.dylib* +*.so +*.so.* *_test db_bench diff --git a/Makefile b/Makefile index 354654d..b961ba1 100644 --- a/Makefile +++ b/Makefile @@ -17,8 +17,8 @@ OPT ?= -O2 -DNDEBUG # (A) Production use (optimized mode) #----------------------------------------------- # detect what platform we're building on -$(shell sh ./build_detect_platform) -# this file is generated by build_detect_platform to set build flags and sources +$(shell ./build_detect_platform build_config.mk) +# this file is generated by the previous line to set build flags and sources include build_config.mk CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) @@ -34,6 +34,7 @@ TESTHARNESS = ./util/testharness.o $(TESTUTIL) TESTS = \ arena_test \ + bloom_test \ c_test \ cache_test \ coding_test \ @@ -43,6 +44,7 @@ TESTS = \ dbformat_test \ env_test \ filename_test \ + filter_block_test \ log_test \ memenv_test \ skiplist_test \ @@ -63,7 +65,7 @@ default: all ifneq ($(PLATFORM_SHARED_EXT),) # Update db.h if you change these. 
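
(Aside: the constants this comment points at live in include/leveldb/db.h; after this commit's version bump they should read 1 and 4, and SHARED_MINOR below must track them. A quick check, runnable against the built library:)

    #include <stdio.h>
    #include "leveldb/db.h"

    int main() {
      // Must agree with SHARED_MAJOR/SHARED_MINOR in the Makefile.
      printf("leveldb %d.%d\n", leveldb::kMajorVersion, leveldb::kMinorVersion);
      return 0;
    }
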
SHARED_MAJOR = 1 -SHARED_MINOR = 3 +SHARED_MINOR = 4 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) @@ -101,6 +103,9 @@ db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) +bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) + c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) @@ -128,6 +133,9 @@ env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) +filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) + log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) diff --git a/build_detect_platform b/build_detect_platform index 64fcaef..b71bf02 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -1,9 +1,9 @@ #!/bin/sh # -# Detects OS we're compiling on and generates build_config.mk, -# which in turn gets read while processing Makefile. +# Detects OS we're compiling on and outputs a file specified by the first +# argument, which in turn gets read while processing Makefile. # -# build_config.mk will set the following variables: +# The output will set the following variables: # PLATFORM_LDFLAGS Linker flags # PLATFORM_SHARED_EXT Extension for shared libraries # PLATFORM_SHARED_LDFLAGS Flags for building shared library @@ -13,11 +13,15 @@ # -DLEVELDB_PLATFORM_POSIX if cstdatomic is present # -DLEVELDB_PLATFORM_NOATOMIC if it is not -SCRIPT_DIR=`dirname $0` +OUTPUT=$1 +if test -z "$OUTPUT"; then + echo "usage: $0 <output-filename>" + exit 1 +fi -# Delete existing build_config.mk -rm -f build_config.mk -touch build_config.mk +# Delete existing output, if it exists +rm -f $OUTPUT +touch $OUTPUT if test -z "$CXX"; then CXX=g++ fi @@ -96,7 +100,7 @@ esac # except for the test and benchmark files. By default, find will output a list # of all files matching either rule, so we need to append -print to make the # prune take effect. -DIRS="$SCRIPT_DIR/util $SCRIPT_DIR/db $SCRIPT_DIR/table" +DIRS="util db table" set -f # temporarily disable globbing so that our patterns aren't expanded PRUNE_TEST="-name *test*.cc -prune" PRUNE_BENCH="-name *_bench.cc -prune" @@ -105,8 +109,8 @@ set +f # re-enable globbing # The sources consist of the portable files, plus the platform-specific port # file. -echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> build_config.mk -echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> build_config.mk +echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT +echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT if [ "$PLATFORM" = "OS_ANDROID_CROSSCOMPILE" ]; then # Cross-compiling; do not try any compilation tests.
@@ -147,10 +151,10 @@ fi PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" -echo "PLATFORM=$PLATFORM" >> build_config.mk -echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> build_config.mk -echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> build_config.mk -echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> build_config.mk -echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> build_config.mk -echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> build_config.mk -echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> build_config.mk +echo "PLATFORM=$PLATFORM" >> $OUTPUT +echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT +echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT +echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT +echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT +echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT +echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT diff --git a/db/c.cc b/db/c.cc index 038e5c0..2dde400 100644 --- a/db/c.cc +++ b/db/c.cc @@ -10,6 +10,7 @@ #include "leveldb/comparator.h" #include "leveldb/db.h" #include "leveldb/env.h" +#include "leveldb/filter_policy.h" #include "leveldb/iterator.h" #include "leveldb/options.h" #include "leveldb/status.h" @@ -21,8 +22,10 @@ using leveldb::CompressionType; using leveldb::DB; using leveldb::Env; using leveldb::FileLock; +using leveldb::FilterPolicy; using leveldb::Iterator; using leveldb::Logger; +using leveldb::NewBloomFilterPolicy; using leveldb::NewLRUCache; using leveldb::Options; using leveldb::RandomAccessFile; @@ -78,6 +81,47 @@ struct leveldb_comparator_t : public Comparator { virtual void FindShortSuccessor(std::string* key) const { } }; +struct leveldb_filterpolicy_t : public FilterPolicy { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + char* (*create_)( + void*, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length); + unsigned char (*key_match_)( + void*, + const char* key, size_t length, + const char* filter, size_t filter_length); + + virtual ~leveldb_filterpolicy_t() { + (*destructor_)(state_); + } + + virtual const char* Name() const { + return (*name_)(state_); + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + std::vector<const char*> key_pointers(n); + std::vector<size_t> key_sizes(n); + for (int i = 0; i < n; i++) { + key_pointers[i] = keys[i].data(); + key_sizes[i] = keys[i].size(); + } + size_t len; + char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len); + dst->append(filter, len); + free(filter); + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + return (*key_match_)(state_, key.data(), key.size(), + filter.data(), filter.size()); + } +}; + struct leveldb_env_t { Env* rep; bool is_default; @@ -218,6 +262,17 @@ void leveldb_approximate_sizes( delete[] ranges; } +void leveldb_compact_range( + leveldb_t* db, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len) { + Slice a, b; + db->rep->CompactRange( + // Pass NULL Slice if corresponding "const char*" is NULL + (start_key ? (a = Slice(start_key, start_key_len), &a) : NULL), + (limit_key ?
(b = Slice(limit_key, limit_key_len), &b) : NULL)); +} + void leveldb_destroy_db( const leveldb_options_t* options, const char* name, @@ -340,6 +395,12 @@ void leveldb_options_set_comparator( opt->rep.comparator = cmp; } +void leveldb_options_set_filter_policy( + leveldb_options_t* opt, + leveldb_filterpolicy_t* policy) { + opt->rep.filter_policy = policy; +} + void leveldb_options_set_create_if_missing( leveldb_options_t* opt, unsigned char v) { opt->rep.create_if_missing = v; @@ -407,6 +468,55 @@ void leveldb_comparator_destroy(leveldb_comparator_t* cmp) { delete cmp; } +leveldb_filterpolicy_t* leveldb_filterpolicy_create( + void* state, + void (*destructor)(void*), + char* (*create_filter)( + void*, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length), + unsigned char (*key_may_match)( + void*, + const char* key, size_t length, + const char* filter, size_t filter_length), + const char* (*name)(void*)) { + leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t; + result->state_ = state; + result->destructor_ = destructor; + result->create_ = create_filter; + result->key_match_ = key_may_match; + result->name_ = name; + return result; +} + +void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) { + delete filter; +} + +leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) { + // Make a leveldb_filterpolicy_t, but override all of its methods so + // they delegate to a NewBloomFilterPolicy() instead of user + // supplied C functions. + struct Wrapper : public leveldb_filterpolicy_t { + const FilterPolicy* rep_; + ~Wrapper() { delete rep_; } + const char* Name() const { return rep_->Name(); } + void CreateFilter(const Slice* keys, int n, std::string* dst) const { + return rep_->CreateFilter(keys, n, dst); + } + bool KeyMayMatch(const Slice& key, const Slice& filter) const { + return rep_->KeyMayMatch(key, filter); + } + static void DoNothing(void*) { } + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = NewBloomFilterPolicy(bits_per_key); + wrapper->state_ = NULL; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + leveldb_readoptions_t* leveldb_readoptions_create() { return new leveldb_readoptions_t; } diff --git a/db/c_test.c b/db/c_test.c index 9fef325..12b4424 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -122,6 +122,31 @@ static const char* CmpName(void* arg) { return "foo"; } +// Custom filter policy +static unsigned char fake_filter_result = 1; +static void FilterDestroy(void* arg) { } +static const char* FilterName(void* arg) { + return "TestFilter"; +} +static char* FilterCreate( + void* arg, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length) { + *filter_length = 4; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} +unsigned char FilterKeyMatch( + void* arg, + const char* key, size_t length, + const char* filter, size_t filter_length) { + CheckCondition(filter_length == 4); + CheckCondition(memcmp(filter, "fake", 4) == 0); + return fake_filter_result; +} + int main(int argc, char** argv) { leveldb_t* db; leveldb_comparator_t* cmp; @@ -131,6 +156,7 @@ int main(int argc, char** argv) { leveldb_readoptions_t* roptions; leveldb_writeoptions_t* woptions; char* err = NULL; + int run = -1; snprintf(dbname, sizeof(dbname), "/tmp/leveldb_c_test-%d", ((int) geteuid())); @@ -180,6 +206,14 @@ int main(int argc, char** argv) { CheckNoError(err); CheckGet(db, roptions, "foo", "hello"); + 
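
(Aside: a usage sketch for the C API pieces above; every function named here appears in this commit's c.h/c.cc. A bloom policy is created once, attached through the options, and destroyed by the caller after nothing references it any more:)

    #include <stddef.h>
    #include "leveldb/c.h"

    static void ExampleBloomSetup(void) {
      leveldb_options_t* opts = leveldb_options_create();
      leveldb_filterpolicy_t* bloom = leveldb_filterpolicy_create_bloom(10);
      leveldb_options_set_filter_policy(opts, bloom);
      /* ... leveldb_open(), reads and writes, leveldb_close() ... */
      leveldb_options_set_filter_policy(opts, NULL);
      leveldb_filterpolicy_destroy(bloom);
      leveldb_options_destroy(opts);
    }
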
StartPhase("compactall"); + leveldb_compact_range(db, NULL, 0, NULL, 0); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("compactrange"); + leveldb_compact_range(db, "a", 1, "z", 1); + CheckGet(db, roptions, "foo", "hello"); + StartPhase("writebatch"); { leveldb_writebatch_t* wb = leveldb_writebatch_create(); @@ -279,6 +313,49 @@ int main(int argc, char** argv) { CheckGet(db, roptions, "foo", NULL); CheckGet(db, roptions, "bar", NULL); CheckGet(db, roptions, "box", "c"); + leveldb_options_set_create_if_missing(options, 1); + leveldb_options_set_error_if_exists(options, 1); + } + + StartPhase("filter"); + for (run = 0; run < 2; run++) { + // First run uses custom filter, second run uses bloom filter + CheckNoError(err); + leveldb_filterpolicy_t* policy; + if (run == 0) { + policy = leveldb_filterpolicy_create( + NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName); + } else { + policy = leveldb_filterpolicy_create_bloom(10); + } + + // Create new database + leveldb_close(db); + leveldb_destroy_db(options, dbname, &err); + leveldb_options_set_filter_policy(options, policy); + db = leveldb_open(options, dbname, &err); + CheckNoError(err); + leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + CheckNoError(err); + leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err); + CheckNoError(err); + leveldb_compact_range(db, NULL, 0, NULL, 0); + + fake_filter_result = 1; + CheckGet(db, roptions, "foo", "foovalue"); + CheckGet(db, roptions, "bar", "barvalue"); + if (phase == 0) { + // Must not find value when custom filter returns false + fake_filter_result = 0; + CheckGet(db, roptions, "foo", NULL); + CheckGet(db, roptions, "bar", NULL); + fake_filter_result = 1; + + CheckGet(db, roptions, "foo", "foovalue"); + CheckGet(db, roptions, "bar", "barvalue"); + } + leveldb_options_set_filter_policy(options, NULL); + leveldb_filterpolicy_destroy(policy); } StartPhase("cleanup"); diff --git a/db/db_bench.cc b/db/db_bench.cc index bbfd618..b0c3995 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -25,15 +25,20 @@ // overwrite -- overwrite N values in random key order in async mode // fillsync -- write N/100 values in random key order in sync mode // fill100K -- write N/1000 100K values in random order in async mode +// deleteseq -- delete N keys in sequential order +// deleterandom -- delete N keys in random order // readseq -- read N times sequentially // readreverse -- read N times in reverse order // readrandom -- read N times in random order +// readmissing -- read N missing keys in random order // readhot -- read N times in random order from 1% section of DB +// seekrandom -- N random seeks // crc32c -- repeated crc32c of 4K of data // acquireload -- load N*1000 times // Meta operations: // compact -- Compact the entire DB // stats -- Print DB stats +// sstables -- Print sstable info // heapprofile -- Dump a heap profile (if supported by this port) static const char* FLAGS_benchmarks = "fillseq," @@ -85,6 +90,10 @@ static int FLAGS_cache_size = -1; // Maximum number of files to keep open at the same time (use default if == 0) static int FLAGS_open_files = 0; +// Bloom filter bits per key. +// Negative means use default settings. +static int FLAGS_bloom_bits = -1; + // If true, do not destroy the existing database. If you set this // flag and also specify a benchmark that wants a fresh database, that // benchmark will fail. 
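
(Aside: background for the new --bloom_bits flag. With an optimal hash count, a bloom filter's false-positive rate is roughly (1 - e^(-kn/m))^k, and at the optimum k = (m/n)ln 2 this collapses to about 0.6185^(m/n), so the common choice of 10 bits per key gives on the order of 1% false positives. A quick check:)

    #include <math.h>
    #include <stdio.h>

    // Approximate false-positive rate at bits_per_key = m/n,
    // assuming an optimal number of hash functions.
    static double BloomFalsePositiveRate(double bits_per_key) {
      return pow(0.6185, bits_per_key);
    }

    int main() {
      printf("%.4f\n", BloomFalsePositiveRate(10));  // ~0.0082
      return 0;
    }
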
@@ -293,6 +302,7 @@ struct ThreadState { class Benchmark { private: Cache* cache_; + const FilterPolicy* filter_policy_; DB* db_; int num_; int value_size_; @@ -378,6 +388,9 @@ class Benchmark { public: Benchmark() : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), + filter_policy_(FLAGS_bloom_bits >= 0 + ? NewBloomFilterPolicy(FLAGS_bloom_bits) + : NULL), db_(NULL), num_(FLAGS_num), value_size_(FLAGS_value_size), @@ -399,6 +412,7 @@ class Benchmark { ~Benchmark() { delete db_; delete cache_; + delete filter_policy_; } void Run() { @@ -457,11 +471,19 @@ class Benchmark { method = &Benchmark::ReadReverse; } else if (name == Slice("readrandom")) { method = &Benchmark::ReadRandom; + } else if (name == Slice("readmissing")) { + method = &Benchmark::ReadMissing; + } else if (name == Slice("seekrandom")) { + method = &Benchmark::SeekRandom; } else if (name == Slice("readhot")) { method = &Benchmark::ReadHot; } else if (name == Slice("readrandomsmall")) { reads_ /= 1000; method = &Benchmark::ReadRandom; + } else if (name == Slice("deleteseq")) { + method = &Benchmark::DeleteSeq; + } else if (name == Slice("deleterandom")) { + method = &Benchmark::DeleteRandom; } else if (name == Slice("readwhilewriting")) { num_threads++; // Add extra thread for writing method = &Benchmark::ReadWhileWriting; @@ -478,7 +500,9 @@ class Benchmark { } else if (name == Slice("heapprofile")) { HeapProfile(); } else if (name == Slice("stats")) { - PrintStats(); + PrintStats("leveldb.stats"); + } else if (name == Slice("sstables")) { + PrintStats("leveldb.sstables"); } else { if (name != Slice()) { // No error message for empty name fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); @@ -669,6 +693,7 @@ class Benchmark { options.create_if_missing = !FLAGS_use_existing_db; options.block_cache = cache_; options.write_buffer_size = FLAGS_write_buffer_size; + options.filter_policy = filter_policy_; Status s = DB::Open(options, FLAGS_db, &db_); if (!s.ok()) { fprintf(stderr, "open error: %s\n", s.ToString().c_str()); @@ -743,10 +768,28 @@ class Benchmark { void ReadRandom(ThreadState* thread) { ReadOptions options; std::string value; + int found = 0; for (int i = 0; i < reads_; i++) { char key[100]; const int k = thread->rand.Next() % FLAGS_num; snprintf(key, sizeof(key), "%016d", k); + if (db_->Get(options, key, &value).ok()) { + found++; + } + thread->stats.FinishedSingleOp(); + } + char msg[100]; + snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_); + thread->stats.AddMessage(msg); + } + + void ReadMissing(ThreadState* thread) { + ReadOptions options; + std::string value; + for (int i = 0; i < reads_; i++) { + char key[100]; + const int k = thread->rand.Next() % FLAGS_num; + snprintf(key, sizeof(key), "%016d.", k); db_->Get(options, key, &value); thread->stats.FinishedSingleOp(); } @@ -765,6 +808,54 @@ class Benchmark { } } + void SeekRandom(ThreadState* thread) { + ReadOptions options; + std::string value; + int found = 0; + for (int i = 0; i < reads_; i++) { + Iterator* iter = db_->NewIterator(options); + char key[100]; + const int k = thread->rand.Next() % FLAGS_num; + snprintf(key, sizeof(key), "%016d", k); + iter->Seek(key); + if (iter->Valid() && iter->key() == key) found++; + delete iter; + thread->stats.FinishedSingleOp(); + } + char msg[100]; + snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_); + thread->stats.AddMessage(msg); + } + + void DoDelete(ThreadState* thread, bool seq) { + RandomGenerator gen; + WriteBatch batch; + Status s; + for (int i = 0; i < 
num_; i += entries_per_batch_) { + batch.Clear(); + for (int j = 0; j < entries_per_batch_; j++) { + const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num); + char key[100]; + snprintf(key, sizeof(key), "%016d", k); + batch.Delete(key); + thread->stats.FinishedSingleOp(); + } + s = db_->Write(write_options_, &batch); + if (!s.ok()) { + fprintf(stderr, "del error: %s\n", s.ToString().c_str()); + exit(1); + } + } + } + + void DeleteSeq(ThreadState* thread) { + DoDelete(thread, true); + } + + void DeleteRandom(ThreadState* thread) { + DoDelete(thread, false); + } + void ReadWhileWriting(ThreadState* thread) { if (thread->tid > 0) { ReadRandom(thread); @@ -799,9 +890,9 @@ class Benchmark { db_->CompactRange(NULL, NULL); } - void PrintStats() { + void PrintStats(const char* key) { std::string stats; - if (!db_->GetProperty("leveldb.stats", &stats)) { + if (!db_->GetProperty(key, &stats)) { stats = "(failed)"; } fprintf(stdout, "\n%s\n", stats.c_str()); @@ -861,6 +952,8 @@ int main(int argc, char** argv) { FLAGS_write_buffer_size = n; } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { FLAGS_cache_size = n; + } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) { + FLAGS_bloom_bits = n; } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) { FLAGS_open_files = n; } else if (strncmp(argv[i], "--db=", 5) == 0) { diff --git a/db/db_impl.cc b/db/db_impl.cc index 88d17e7..c9c9023 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -87,12 +87,14 @@ static void ClipToRange(T* ptr, V minvalue, V maxvalue) { } Options SanitizeOptions(const std::string& dbname, const InternalKeyComparator* icmp, + const InternalFilterPolicy* ipolicy, const Options& src) { Options result = src; result.comparator = icmp; - ClipToRange(&result.max_open_files, 20, 50000); - ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); - ClipToRange(&result.block_size, 1<<10, 4<<20); + result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL; + ClipToRange(&result.max_open_files, 20, 50000); + ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); + ClipToRange(&result.block_size, 1<<10, 4<<20); if (result.info_log == NULL) { // Open a log file in the same directory as the db src.env->CreateDir(dbname); // In case it does not exist @@ -112,7 +114,9 @@ Options SanitizeOptions(const std::string& dbname, DBImpl::DBImpl(const Options& options, const std::string& dbname) : env_(options.env), internal_comparator_(options.comparator), - options_(SanitizeOptions(dbname, &internal_comparator_, options)), + internal_filter_policy_(options.filter_policy), + options_(SanitizeOptions( + dbname, &internal_comparator_, &internal_filter_policy_, options)), owns_info_log_(options_.info_log != options.info_log), owns_cache_(options_.block_cache != options.block_cache), dbname_(dbname), diff --git a/db/db_impl.h b/db/db_impl.h index e665c0e..2f8b523 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -105,6 +105,7 @@ class DBImpl : public DB { // Constant after construction Env* const env_; const InternalKeyComparator internal_comparator_; + const InternalFilterPolicy internal_filter_policy_; const Options options_; // options_.comparator == &internal_comparator_ bool owns_info_log_; bool owns_cache_; @@ -185,6 +186,7 @@ class DBImpl : public DB { // it is not equal to src.info_log. 
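
(Aside: the clamps SanitizeOptions applies in the hunk above, max_open_files to [20, 50000], write_buffer_size to [64KB, 1GB], block_size to [1KB, 4MB], all go through the small ClipToRange helper. Its shape restated standalone; the body is a plausible reading of the helper in db_impl.cc:)

    template <class T, class V>
    static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
      if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
      if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
    }
    // e.g. ClipToRange(&result.max_open_files, 20, 50000);
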
extern Options SanitizeOptions(const std::string& db, const InternalKeyComparator* icmp, + const InternalFilterPolicy* ipolicy, const Options& src); } // namespace leveldb diff --git a/db/db_test.cc b/db/db_test.cc index 8318885..ee10807 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3,12 +3,15 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "leveldb/db.h" +#include "leveldb/filter_policy.h" #include "db/db_impl.h" #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "leveldb/cache.h" #include "leveldb/env.h" #include "leveldb/table.h" +#include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/testharness.h" @@ -22,6 +25,28 @@ static std::string RandomString(Random* rnd, int len) { return r; } +namespace { +class AtomicCounter { + private: + port::Mutex mu_; + int count_; + public: + AtomicCounter() : count_(0) { } + void Increment() { + MutexLock l(&mu_); + count_++; + } + int Read() { + MutexLock l(&mu_); + return count_; + } + void Reset() { + MutexLock l(&mu_); + count_ = 0; + } +}; +} + // Special Env used to delay background operations class SpecialEnv : public EnvWrapper { public: @@ -31,9 +56,13 @@ class SpecialEnv : public EnvWrapper { // Simulate no-space errors while this pointer is non-NULL. port::AtomicPointer no_space_; + bool count_random_reads_; + AtomicCounter random_read_counter_; + explicit SpecialEnv(Env* base) : EnvWrapper(base) { delay_sstable_sync_.Release_Store(NULL); no_space_.Release_Store(NULL); + count_random_reads_ = false; } Status NewWritableFile(const std::string& f, WritableFile** r) { @@ -74,9 +103,44 @@ class SpecialEnv : public EnvWrapper { } return s; } + + Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { + class CountingFile : public RandomAccessFile { + private: + RandomAccessFile* target_; + AtomicCounter* counter_; + public: + CountingFile(RandomAccessFile* target, AtomicCounter* counter) + : target_(target), counter_(counter) { + } + virtual ~CountingFile() { delete target_; } + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + counter_->Increment(); + return target_->Read(offset, n, result, scratch); + } + }; + + Status s = target()->NewRandomAccessFile(f, r); + if (s.ok() && count_random_reads_) { + *r = new CountingFile(*r, &random_read_counter_); + } + return s; + } }; class DBTest { + private: + const FilterPolicy* filter_policy_; + + // Sequence of option configurations to try + enum OptionConfig { + kDefault, + kFilter, + kEnd + }; + int option_config_; + public: std::string dbname_; SpecialEnv* env_; @@ -84,7 +148,9 @@ class DBTest { Options last_options_; - DBTest() : env_(new SpecialEnv(Env::Default())) { + DBTest() : option_config_(kDefault), + env_(new SpecialEnv(Env::Default())) { + filter_policy_ = NewBloomFilterPolicy(10); dbname_ = test::TmpDir() + "/db_test"; DestroyDB(dbname_, Options()); db_ = NULL; @@ -95,6 +161,32 @@ class DBTest { delete db_; DestroyDB(dbname_, Options()); delete env_; + delete filter_policy_; + } + + // Switch to a fresh database with the next option configuration to + // test. Return false if there are no more configurations to test. + bool ChangeOptions() { + if (option_config_ == kEnd) { + return false; + } else { + option_config_++; + DestroyAndReopen(); + return true; + } + } + + // Return the current option configuration. 
+ Options CurrentOptions() { + Options options; + switch (option_config_) { + case kFilter: + options.filter_policy = filter_policy_; + break; + default: + break; + } + return options; } DBImpl* dbfull() { @@ -105,6 +197,11 @@ class DBTest { ASSERT_OK(TryReopen(options)); } + void Close() { + delete db_; + db_ = NULL; + } + void DestroyAndReopen(Options* options = NULL) { delete db_; db_ = NULL; @@ -119,6 +216,7 @@ class DBTest { if (options != NULL) { opts = *options; } else { + opts = CurrentOptions(); opts.create_if_missing = true; } last_options_ = opts; @@ -189,8 +287,7 @@ class DBTest { if (!ParseInternalKey(iter->key(), &ikey)) { result += "CORRUPTED"; } else { - if (last_options_.comparator->Compare( - ikey.user_key, user_key) != 0) { + if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) { break; } if (!first) { @@ -314,135 +411,155 @@ class DBTest { }; TEST(DBTest, Empty) { - ASSERT_TRUE(db_ != NULL); - ASSERT_EQ("NOT_FOUND", Get("foo")); + do { + ASSERT_TRUE(db_ != NULL); + ASSERT_EQ("NOT_FOUND", Get("foo")); + } while (ChangeOptions()); } TEST(DBTest, ReadWrite) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - ASSERT_EQ("v3", Get("foo")); - ASSERT_EQ("v2", Get("bar")); + do { + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + } while (ChangeOptions()); } TEST(DBTest, PutDeleteGet) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - ASSERT_OK(db_->Delete(WriteOptions(), "foo")); - ASSERT_EQ("NOT_FOUND", Get("foo")); + do { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + ASSERT_OK(db_->Delete(WriteOptions(), "foo")); + ASSERT_EQ("NOT_FOUND", Get("foo")); + } while (ChangeOptions()); } TEST(DBTest, GetFromImmutableLayer) { - Options options; - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - Reopen(&options); + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); - env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls - Put("k1", std::string(100000, 'x')); // Fill memtable - Put("k2", std::string(100000, 'y')); // Trigger compaction - ASSERT_EQ("v1", Get("foo")); - env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put("k1", std::string(100000, 'x')); // Fill memtable + Put("k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls + } while (ChangeOptions()); } TEST(DBTest, GetFromVersions) { - ASSERT_OK(Put("foo", "v1")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("v1", Get("foo")); + do { + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("v1", Get("foo")); + } while (ChangeOptions()); } TEST(DBTest, GetSnapshot) { - // Try with both a short key and a long key - for (int i = 0; i < 2; i++) { - std::string key = (i == 0) ? 
std::string("foo") : std::string(200, 'x'); - ASSERT_OK(Put(key, "v1")); - const Snapshot* s1 = db_->GetSnapshot(); - ASSERT_OK(Put(key, "v2")); - ASSERT_EQ("v2", Get(key)); - ASSERT_EQ("v1", Get(key, s1)); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("v2", Get(key)); - ASSERT_EQ("v1", Get(key, s1)); - db_->ReleaseSnapshot(s1); - } + do { + // Try with both a short key and a long key + for (int i = 0; i < 2; i++) { + std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); + ASSERT_OK(Put(key, "v1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Put(key, "v2")); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + db_->ReleaseSnapshot(s1); + } + } while (ChangeOptions()); } TEST(DBTest, GetLevel0Ordering) { - // Check that we process level-0 files in correct order. The code - // below generates two level-0 files where the earlier one comes - // before the later one in the level-0 file list since the earlier - // one has a smaller "smallest" key. - ASSERT_OK(Put("bar", "b")); - ASSERT_OK(Put("foo", "v1")); - dbfull()->TEST_CompactMemTable(); - ASSERT_OK(Put("foo", "v2")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("v2", Get("foo")); + do { + // Check that we process level-0 files in correct order. The code + // below generates two level-0 files where the earlier one comes + // before the later one in the level-0 file list since the earlier + // one has a smaller "smallest" key. + ASSERT_OK(Put("bar", "b")); + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_CompactMemTable(); + ASSERT_OK(Put("foo", "v2")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("v2", Get("foo")); + } while (ChangeOptions()); } TEST(DBTest, GetOrderedByLevels) { - ASSERT_OK(Put("foo", "v1")); - Compact("a", "z"); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("v2", Get("foo")); + do { + ASSERT_OK(Put("foo", "v1")); + Compact("a", "z"); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("v2", Get("foo")); + } while (ChangeOptions()); } TEST(DBTest, GetPicksCorrectFile) { - // Arrange to have multiple files in a non-level-0 level. - ASSERT_OK(Put("a", "va")); - Compact("a", "b"); - ASSERT_OK(Put("x", "vx")); - Compact("x", "y"); - ASSERT_OK(Put("f", "vf")); - Compact("f", "g"); - ASSERT_EQ("va", Get("a")); - ASSERT_EQ("vf", Get("f")); - ASSERT_EQ("vx", Get("x")); + do { + // Arrange to have multiple files in a non-level-0 level. + ASSERT_OK(Put("a", "va")); + Compact("a", "b"); + ASSERT_OK(Put("x", "vx")); + Compact("x", "y"); + ASSERT_OK(Put("f", "vf")); + Compact("f", "g"); + ASSERT_EQ("va", Get("a")); + ASSERT_EQ("vf", Get("f")); + ASSERT_EQ("vx", Get("x")); + } while (ChangeOptions()); } TEST(DBTest, GetEncountersEmptyLevel) { - // Arrange for the following to happen: - // * sstable A in level 0 - // * nothing in level 1 - // * sstable B in level 2 - // Then do enough Get() calls to arrange for an automatic compaction - // of sstable A. A bug would cause the compaction to be marked as - // occuring at level 1 (instead of the correct level 0). 
- - // Step 1: First place sstables in levels 0 and 2 - int compaction_count = 0; - while (NumTableFilesAtLevel(0) == 0 || - NumTableFilesAtLevel(2) == 0) { - ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2"; - compaction_count++; - Put("a", "begin"); - Put("z", "end"); - dbfull()->TEST_CompactMemTable(); - } - - // Step 2: clear level 1 if necessary. - dbfull()->TEST_CompactRange(1, NULL, NULL); - ASSERT_EQ(NumTableFilesAtLevel(0), 1); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); + do { + // Arrange for the following to happen: + // * sstable A in level 0 + // * nothing in level 1 + // * sstable B in level 2 + // Then do enough Get() calls to arrange for an automatic compaction + // of sstable A. A bug would cause the compaction to be marked as + // occuring at level 1 (instead of the correct level 0). + + // Step 1: First place sstables in levels 0 and 2 + int compaction_count = 0; + while (NumTableFilesAtLevel(0) == 0 || + NumTableFilesAtLevel(2) == 0) { + ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2"; + compaction_count++; + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_CompactMemTable(); + } - // Step 3: read until level 0 compaction disappears. - int read_count = 0; - while (NumTableFilesAtLevel(0) > 0) { - ASSERT_LE(read_count, 10000) << "did not trigger level 0 compaction"; - read_count++; - ASSERT_EQ("NOT_FOUND", Get("missing")); - } + // Step 2: clear level 1 if necessary. + dbfull()->TEST_CompactRange(1, NULL, NULL); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); + + // Step 3: read until level 0 compaction disappears. + int read_count = 0; + while (NumTableFilesAtLevel(0) > 0) { + ASSERT_LE(read_count, 10000) << "did not trigger level 0 compaction"; + read_count++; + ASSERT_EQ("NOT_FOUND", Get("missing")); + } + } while (ChangeOptions()); } TEST(DBTest, IterEmpty) { @@ -620,69 +737,77 @@ TEST(DBTest, IterSmallAndLargeMix) { } TEST(DBTest, IterMultiWithDelete) { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", "vb")); - ASSERT_OK(Put("c", "vc")); - ASSERT_OK(Delete("b")); - ASSERT_EQ("NOT_FOUND", Get("b")); + do { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", "vb")); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Delete("b")); + ASSERT_EQ("NOT_FOUND", Get("b")); - Iterator* iter = db_->NewIterator(ReadOptions()); - iter->Seek("c"); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - delete iter; + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->Seek("c"); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + delete iter; + } while (ChangeOptions()); } TEST(DBTest, Recover) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("baz", "v5")); + do { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("baz", "v5")); - Reopen(); - ASSERT_EQ("v1", Get("foo")); + Reopen(); + ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v5", Get("baz")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v5", Get("baz")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); - Reopen(); - ASSERT_EQ("v3", Get("foo")); - ASSERT_OK(Put("foo", "v4")); - ASSERT_EQ("v4", Get("foo")); - ASSERT_EQ("v2", Get("bar")); - ASSERT_EQ("v5", Get("baz")); + Reopen(); + ASSERT_EQ("v3", Get("foo")); + ASSERT_OK(Put("foo", "v4")); + ASSERT_EQ("v4", Get("foo")); + ASSERT_EQ("v2", 
Get("bar")); + ASSERT_EQ("v5", Get("baz")); + } while (ChangeOptions()); } TEST(DBTest, RecoveryWithEmptyLog) { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("foo", "v2")); - Reopen(); - Reopen(); - ASSERT_OK(Put("foo", "v3")); - Reopen(); - ASSERT_EQ("v3", Get("foo")); + do { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("foo", "v2")); + Reopen(); + Reopen(); + ASSERT_OK(Put("foo", "v3")); + Reopen(); + ASSERT_EQ("v3", Get("foo")); + } while (ChangeOptions()); } // Check that writes done during a memtable compaction are recovered // if the database is shutdown during the memtable compaction. TEST(DBTest, RecoverDuringMemtableCompaction) { - Options options; - options.env = env_; - options.write_buffer_size = 1000000; - Reopen(&options); + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 1000000; + Reopen(&options); - // Trigger a long memtable compaction and reopen the database during it - ASSERT_OK(Put("foo", "v1")); // Goes to 1st log file - ASSERT_OK(Put("big1", std::string(10000000, 'x'))); // Fills memtable - ASSERT_OK(Put("big2", std::string(1000, 'y'))); // Triggers compaction - ASSERT_OK(Put("bar", "v2")); // Goes to new log file + // Trigger a long memtable compaction and reopen the database during it + ASSERT_OK(Put("foo", "v1")); // Goes to 1st log file + ASSERT_OK(Put("big1", std::string(10000000, 'x'))); // Fills memtable + ASSERT_OK(Put("big2", std::string(1000, 'y'))); // Triggers compaction + ASSERT_OK(Put("bar", "v2")); // Goes to new log file - Reopen(&options); - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v2", Get("bar")); - ASSERT_EQ(std::string(10000000, 'x'), Get("big1")); - ASSERT_EQ(std::string(1000, 'y'), Get("big2")); + Reopen(&options); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ(std::string(10000000, 'x'), Get("big1")); + ASSERT_EQ(std::string(1000, 'y'), Get("big2")); + } while (ChangeOptions()); } static std::string Key(int i) { @@ -692,7 +817,7 @@ static std::string Key(int i) { } TEST(DBTest, MinorCompactionsHappen) { - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 10000; Reopen(&options); @@ -718,7 +843,7 @@ TEST(DBTest, MinorCompactionsHappen) { TEST(DBTest, RecoverWithLargeLog) { { - Options options; + Options options = CurrentOptions(); Reopen(&options); ASSERT_OK(Put("big1", std::string(200000, '1'))); ASSERT_OK(Put("big2", std::string(200000, '2'))); @@ -729,7 +854,7 @@ TEST(DBTest, RecoverWithLargeLog) { // Make sure that if we re-open with a small write buffer size that // we flush table files in the middle of a large log file. 
- Options options; + Options options = CurrentOptions(); options.write_buffer_size = 100000; Reopen(&options); ASSERT_EQ(NumTableFilesAtLevel(0), 3); @@ -741,7 +866,7 @@ TEST(DBTest, RecoverWithLargeLog) { } TEST(DBTest, CompactionsGenerateMultipleFiles) { - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 100000000; // Large write buffer Reopen(&options); @@ -767,7 +892,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { } TEST(DBTest, RepeatedWritesToSameKey) { - Options options; + Options options = CurrentOptions(); options.env = env_; options.write_buffer_size = 100000; // Small write buffer Reopen(&options); @@ -786,7 +911,7 @@ TEST(DBTest, RepeatedWritesToSameKey) { } TEST(DBTest, SparseMerge) { - Options options; + Options options = CurrentOptions(); options.compression = kNoCompression; Reopen(&options); @@ -837,87 +962,91 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) { } TEST(DBTest, ApproximateSizes) { - Options options; - options.write_buffer_size = 100000000; // Large write buffer - options.compression = kNoCompression; - DestroyAndReopen(); + do { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + DestroyAndReopen(); - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - Reopen(&options); - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + Reopen(&options); + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - const int N = 80; - Random rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); - } + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + const int N = 80; + static const int S1 = 100000; + static const int S2 = 105000; // Allow some expansion from metadata + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, S1))); + } - // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); + // 0 because GetApproximateSizes() does not account for memtable space + ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); - for (int compact_start = 0; compact_start < N; compact_start += 10) { - for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); - ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), - 100000 * (i+1), 100000 * (i+1) + 10000)); - ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), - 100000 * 10, 100000 * 10 + 10000)); + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i)); + ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1))); + ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10)); + } + ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50)); + ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50)); + + std::string cstart_str = Key(compact_start); + std::string cend_str = Key(compact_start + 9); + Slice cstart = cstart_str; + Slice cend = cend_str; + dbfull()->TEST_CompactRange(0, &cstart, &cend); } - ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 
5010000)); - ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); - - std::string cstart_str = Key(compact_start); - std::string cend_str = Key(compact_start + 9); - Slice cstart = cstart_str; - Slice cend = cend_str; - dbfull()->TEST_CompactRange(0, &cstart, &cend); - } - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 0); - } + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + } + } while (ChangeOptions()); } TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { - Options options; - options.compression = kNoCompression; - Reopen(); - - Random rnd(301); - std::string big1 = RandomString(&rnd, 100000); - ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(2), big1)); - ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(4), big1)); - ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); - ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); - - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); - - ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); - ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); - ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); - ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000)); - - ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); + do { + Options options = CurrentOptions(); + options.compression = kNoCompression; + Reopen(); - dbfull()->TEST_CompactRange(0, NULL, NULL); - } + Random rnd(301); + std::string big1 = RandomString(&rnd, 100000); + ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(2), big1)); + ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(4), big1)); + ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); + ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); + ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); + ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000)); + + ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); + + dbfull()->TEST_CompactRange(0, NULL, NULL); + } + } while (ChangeOptions()); } TEST(DBTest, IteratorPinsRef) { @@ -943,59 +1072,63 @@ TEST(DBTest, IteratorPinsRef) { } TEST(DBTest, Snapshot) { - Put("foo", "v1"); - const Snapshot* s1 = db_->GetSnapshot(); - Put("foo", "v2"); - const Snapshot* s2 = db_->GetSnapshot(); - Put("foo", "v3"); - const Snapshot* s3 = db_->GetSnapshot(); - - Put("foo", "v4"); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v3", 
Get("foo", s3)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s3); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s1); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s2); - ASSERT_EQ("v4", Get("foo")); -} + do { + Put("foo", "v1"); + const Snapshot* s1 = db_->GetSnapshot(); + Put("foo", "v2"); + const Snapshot* s2 = db_->GetSnapshot(); + Put("foo", "v3"); + const Snapshot* s3 = db_->GetSnapshot(); + + Put("foo", "v4"); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v3", Get("foo", s3)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s3); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); -TEST(DBTest, HiddenValuesAreRemoved) { - Random rnd(301); - FillLevels("a", "z"); + db_->ReleaseSnapshot(s1); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); - std::string big = RandomString(&rnd, 50000); - Put("foo", big); - Put("pastfoo", "v"); - const Snapshot* snapshot = db_->GetSnapshot(); - Put("foo", "tiny"); - Put("pastfoo2", "v2"); // Advance sequence number one more + db_->ReleaseSnapshot(s2); + ASSERT_EQ("v4", Get("foo")); + } while (ChangeOptions()); +} - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_GT(NumTableFilesAtLevel(0), 0); - - ASSERT_EQ(big, Get("foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); - db_->ReleaseSnapshot(snapshot); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); - Slice x("x"); - dbfull()->TEST_CompactRange(0, NULL, &x); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GE(NumTableFilesAtLevel(1), 1); - dbfull()->TEST_CompactRange(1, NULL, &x); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); +TEST(DBTest, HiddenValuesAreRemoved) { + do { + Random rnd(301); + FillLevels("a", "z"); + + std::string big = RandomString(&rnd, 50000); + Put("foo", big); + Put("pastfoo", "v"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put("foo", "tiny"); + Put("pastfoo2", "v2"); // Advance sequence number one more + + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + ASSERT_GT(NumTableFilesAtLevel(0), 0); + + ASSERT_EQ(big, Get("foo", snapshot)); + ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); + Slice x("x"); + dbfull()->TEST_CompactRange(0, NULL, &x); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GE(NumTableFilesAtLevel(1), 1); + dbfull()->TEST_CompactRange(1, NULL, &x); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); + ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); + } while (ChangeOptions()); } TEST(DBTest, DeletionMarkers1) { @@ -1054,85 +1187,87 @@ TEST(DBTest, DeletionMarkers2) { } TEST(DBTest, OverlapInLevel0) { - ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config"; + do { + ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config"; - // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. 
- ASSERT_OK(Put("100", "v100")); - ASSERT_OK(Put("999", "v999")); - dbfull()->TEST_CompactMemTable(); - ASSERT_OK(Delete("100")); - ASSERT_OK(Delete("999")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("0,1,1", FilesPerLevel()); - - // Make files spanning the following ranges in level-0: - // files[0] 200 .. 900 - // files[1] 300 .. 500 - // Note that files are sorted by smallest key. - ASSERT_OK(Put("300", "v300")); - ASSERT_OK(Put("500", "v500")); - dbfull()->TEST_CompactMemTable(); - ASSERT_OK(Put("200", "v200")); - ASSERT_OK(Put("600", "v600")); - ASSERT_OK(Put("900", "v900")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("2,1,1", FilesPerLevel()); + // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. + ASSERT_OK(Put("100", "v100")); + ASSERT_OK(Put("999", "v999")); + dbfull()->TEST_CompactMemTable(); + ASSERT_OK(Delete("100")); + ASSERT_OK(Delete("999")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("0,1,1", FilesPerLevel()); + + // Make files spanning the following ranges in level-0: + // files[0] 200 .. 900 + // files[1] 300 .. 500 + // Note that files are sorted by smallest key. + ASSERT_OK(Put("300", "v300")); + ASSERT_OK(Put("500", "v500")); + dbfull()->TEST_CompactMemTable(); + ASSERT_OK(Put("200", "v200")); + ASSERT_OK(Put("600", "v600")); + ASSERT_OK(Put("900", "v900")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("2,1,1", FilesPerLevel()); - // Compact away the placeholder files we created initially - dbfull()->TEST_CompactRange(1, NULL, NULL); - dbfull()->TEST_CompactRange(2, NULL, NULL); - ASSERT_EQ("2", FilesPerLevel()); + // Compact away the placeholder files we created initially + dbfull()->TEST_CompactRange(1, NULL, NULL); + dbfull()->TEST_CompactRange(2, NULL, NULL); + ASSERT_EQ("2", FilesPerLevel()); - // Do a memtable compaction. Before bug-fix, the compaction would - // not detect the overlap with level-0 files and would incorrectly place - // the deletion in a deeper level. - ASSERT_OK(Delete("600")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("3", FilesPerLevel()); - ASSERT_EQ("NOT_FOUND", Get("600")); + // Do a memtable compaction. Before bug-fix, the compaction would + // not detect the overlap with level-0 files and would incorrectly place + // the deletion in a deeper level. 
+ ASSERT_OK(Delete("600")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("3", FilesPerLevel()); + ASSERT_EQ("NOT_FOUND", Get("600")); + } while (ChangeOptions()); } TEST(DBTest, L0_CompactionBug_Issue44_a) { - Reopen(); - ASSERT_OK(Put("b", "v")); - Reopen(); - ASSERT_OK(Delete("b")); - ASSERT_OK(Delete("a")); - Reopen(); - ASSERT_OK(Delete("a")); - Reopen(); - ASSERT_OK(Put("a", "v")); - Reopen(); - Reopen(); - ASSERT_EQ("(a->v)", Contents()); - env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - ASSERT_EQ("(a->v)", Contents()); + Reopen(); + ASSERT_OK(Put("b", "v")); + Reopen(); + ASSERT_OK(Delete("b")); + ASSERT_OK(Delete("a")); + Reopen(); + ASSERT_OK(Delete("a")); + Reopen(); + ASSERT_OK(Put("a", "v")); + Reopen(); + Reopen(); + ASSERT_EQ("(a->v)", Contents()); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(a->v)", Contents()); } TEST(DBTest, L0_CompactionBug_Issue44_b) { - Reopen(); - Put("",""); - Reopen(); - Delete("e"); - Put("",""); - Reopen(); - Put("c", "cv"); - Reopen(); - Put("",""); - Reopen(); - Put("",""); - env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - Reopen(); - Put("d","dv"); - Reopen(); - Put("",""); - Reopen(); - Delete("d"); - Delete("b"); - Reopen(); - ASSERT_EQ("(->)(c->cv)", Contents()); - env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - ASSERT_EQ("(->)(c->cv)", Contents()); + Reopen(); + Put("",""); + Reopen(); + Delete("e"); + Put("",""); + Reopen(); + Put("c", "cv"); + Reopen(); + Put("",""); + Reopen(); + Put("",""); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + Reopen(); + Put("d","dv"); + Reopen(); + Put("",""); + Reopen(); + Delete("d"); + Delete("b"); + Reopen(); + ASSERT_EQ("(->)(c->cv)", Contents()); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(->)(c->cv)", Contents()); } TEST(DBTest, ComparatorCheck) { @@ -1150,7 +1285,7 @@ TEST(DBTest, ComparatorCheck) { } }; NewComparator cmp; - Options new_options; + Options new_options = CurrentOptions(); new_options.comparator = &cmp; Status s = TryReopen(&new_options); ASSERT_TRUE(!s.ok()); @@ -1185,9 +1320,10 @@ TEST(DBTest, CustomComparator) { } }; NumberComparator cmp; - Options new_options; + Options new_options = CurrentOptions(); new_options.create_if_missing = true; new_options.comparator = &cmp; + new_options.filter_policy = NULL; // Cannot use bloom filters new_options.write_buffer_size = 1000; // Compact more often DestroyAndReopen(&new_options); ASSERT_OK(Put("[10]", "ten")); @@ -1197,6 +1333,8 @@ TEST(DBTest, CustomComparator) { ASSERT_EQ("ten", Get("[0xa]")); ASSERT_EQ("twenty", Get("[20]")); ASSERT_EQ("twenty", Get("[0x14]")); + ASSERT_EQ("NOT_FOUND", Get("[15]")); + ASSERT_EQ("NOT_FOUND", Get("[0xf]")); Compact("[0]", "[9999]"); } @@ -1285,7 +1423,7 @@ TEST(DBTest, DBOpen_Options) { // Check that number of files does not grow when we are out of space TEST(DBTest, NoSpace) { - Options options; + Options options = CurrentOptions(); options.env = env_; Reopen(&options); @@ -1314,6 +1452,53 @@ TEST(DBTest, FilesDeletedAfterCompaction) { ASSERT_EQ(CountFiles(), num_files); } +TEST(DBTest, BloomFilter) { + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.block_cache = NewLRUCache(0); // Prevent cache hits + options.filter_policy = NewBloomFilterPolicy(10); + Reopen(&options); + + // Populate multiple layers + const int N = 10000; + for (int i = 0; i < N; i++) { + 
ASSERT_OK(Put(Key(i), Key(i))); + } + Compact("a", "z"); + for (int i = 0; i < N; i += 100) { + ASSERT_OK(Put(Key(i), Key(i))); + } + dbfull()->TEST_CompactMemTable(); + + // Prevent auto compactions triggered by seeks + env_->delay_sstable_sync_.Release_Store(env_); + + // Lookup present keys. Should rarely read from small sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i), Get(Key(i))); + } + int reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d present => %d reads\n", N, reads); + ASSERT_GE(reads, N); + ASSERT_LE(reads, N + 2*N/100); + + // Lookup present keys. Should rarely read from either sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing")); + } + reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d missing => %d reads\n", N, reads); + ASSERT_LE(reads, 3*N/100); + + env_->delay_sstable_sync_.Release_Store(NULL); + Close(); + delete options.block_cache; + delete options.filter_policy; +} + // Multi-threaded test: namespace { @@ -1381,33 +1566,35 @@ static void MTThreadBody(void* arg) { } // namespace TEST(DBTest, MultiThreaded) { - // Initialize state - MTState mt; - mt.test = this; - mt.stop.Release_Store(0); - for (int id = 0; id < kNumThreads; id++) { - mt.counter[id].Release_Store(0); - mt.thread_done[id].Release_Store(0); - } - - // Start threads - MTThread thread[kNumThreads]; - for (int id = 0; id < kNumThreads; id++) { - thread[id].state = &mt; - thread[id].id = id; - env_->StartThread(MTThreadBody, &thread[id]); - } - - // Let them run for a while - env_->SleepForMicroseconds(kTestSeconds * 1000000); - - // Stop the threads and wait for them to finish - mt.stop.Release_Store(&mt); - for (int id = 0; id < kNumThreads; id++) { - while (mt.thread_done[id].Acquire_Load() == NULL) { - env_->SleepForMicroseconds(100000); + do { + // Initialize state + MTState mt; + mt.test = this; + mt.stop.Release_Store(0); + for (int id = 0; id < kNumThreads; id++) { + mt.counter[id].Release_Store(0); + mt.thread_done[id].Release_Store(0); } - } + + // Start threads + MTThread thread[kNumThreads]; + for (int id = 0; id < kNumThreads; id++) { + thread[id].state = &mt; + thread[id].id = id; + env_->StartThread(MTThreadBody, &thread[id]); + } + + // Let them run for a while + env_->SleepForMicroseconds(kTestSeconds * 1000000); + + // Stop the threads and wait for them to finish + mt.stop.Release_Store(&mt); + for (int id = 0; id < kNumThreads; id++) { + while (mt.thread_done[id].Acquire_Load() == NULL) { + env_->SleepForMicroseconds(100000); + } + } + } while (ChangeOptions()); } namespace { @@ -1573,70 +1760,73 @@ static bool CompareIterators(int step, TEST(DBTest, Randomized) { Random rnd(test::RandomSeed()); - ModelDB model(last_options_); - const int N = 10000; - const Snapshot* model_snap = NULL; - const Snapshot* db_snap = NULL; - std::string k, v; - for (int step = 0; step < N; step++) { - if (step % 100 == 0) { - fprintf(stderr, "Step %d of %d\n", step, N); - } - int p = rnd.Uniform(100); - if (p < 45) { // Put - k = RandomKey(&rnd); - v = RandomString(&rnd, - rnd.OneIn(20) - ? 
100 + rnd.Uniform(100) - : rnd.Uniform(8)); - ASSERT_OK(model.Put(WriteOptions(), k, v)); - ASSERT_OK(db_->Put(WriteOptions(), k, v)); - - } else if (p < 90) { // Delete - k = RandomKey(&rnd); - ASSERT_OK(model.Delete(WriteOptions(), k)); - ASSERT_OK(db_->Delete(WriteOptions(), k)); - - - } else { // Multi-element batch - WriteBatch b; - const int num = rnd.Uniform(8); - for (int i = 0; i < num; i++) { - if (i == 0 || !rnd.OneIn(10)) { - k = RandomKey(&rnd); - } else { - // Periodically re-use the same key from the previous iter, so - // we have multiple entries in the write batch for the same key - } - if (rnd.OneIn(2)) { - v = RandomString(&rnd, rnd.Uniform(10)); - b.Put(k, v); - } else { - b.Delete(k); + do { + ModelDB model(CurrentOptions()); + const int N = 10000; + const Snapshot* model_snap = NULL; + const Snapshot* db_snap = NULL; + std::string k, v; + for (int step = 0; step < N; step++) { + if (step % 100 == 0) { + fprintf(stderr, "Step %d of %d\n", step, N); + } + // TODO(sanjay): Test Get() works + int p = rnd.Uniform(100); + if (p < 45) { // Put + k = RandomKey(&rnd); + v = RandomString(&rnd, + rnd.OneIn(20) + ? 100 + rnd.Uniform(100) + : rnd.Uniform(8)); + ASSERT_OK(model.Put(WriteOptions(), k, v)); + ASSERT_OK(db_->Put(WriteOptions(), k, v)); + + } else if (p < 90) { // Delete + k = RandomKey(&rnd); + ASSERT_OK(model.Delete(WriteOptions(), k)); + ASSERT_OK(db_->Delete(WriteOptions(), k)); + + + } else { // Multi-element batch + WriteBatch b; + const int num = rnd.Uniform(8); + for (int i = 0; i < num; i++) { + if (i == 0 || !rnd.OneIn(10)) { + k = RandomKey(&rnd); + } else { + // Periodically re-use the same key from the previous iter, so + // we have multiple entries in the write batch for the same key + } + if (rnd.OneIn(2)) { + v = RandomString(&rnd, rnd.Uniform(10)); + b.Put(k, v); + } else { + b.Delete(k); + } } + ASSERT_OK(model.Write(WriteOptions(), &b)); + ASSERT_OK(db_->Write(WriteOptions(), &b)); } - ASSERT_OK(model.Write(WriteOptions(), &b)); - ASSERT_OK(db_->Write(WriteOptions(), &b)); - } - if ((step % 100) == 0) { - ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); - ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); - // Save a snapshot from each DB this time that we'll use next - // time we compare things, to make sure the current state is - // preserved with the snapshot - if (model_snap != NULL) model.ReleaseSnapshot(model_snap); - if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); + if ((step % 100) == 0) { + ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + // Save a snapshot from each DB this time that we'll use next + // time we compare things, to make sure the current state is + // preserved with the snapshot + if (model_snap != NULL) model.ReleaseSnapshot(model_snap); + if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); - Reopen(); - ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); + Reopen(); + ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); - model_snap = model.GetSnapshot(); - db_snap = db_->GetSnapshot(); + model_snap = model.GetSnapshot(); + db_snap = db_->GetSnapshot(); + } } - } - if (model_snap != NULL) model.ReleaseSnapshot(model_snap); - if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); + if (model_snap != NULL) model.ReleaseSnapshot(model_snap); + if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); + } while (ChangeOptions()); } std::string MakeKey(unsigned int num) { diff --git 
a/db/dbformat.cc b/db/dbformat.cc
index 9168f99..28e11b3 100644
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@@ -98,6 +98,26 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
   }
 }
 
+const char* InternalFilterPolicy::Name() const {
+  return user_policy_->Name();
+}
+
+void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
+                                        std::string* dst) const {
+  // We rely on the fact that the code in table.cc does not mind us
+  // adjusting keys[].
+  Slice* mkey = const_cast<Slice*>(keys);
+  for (int i = 0; i < n; i++) {
+    mkey[i] = ExtractUserKey(keys[i]);
+    // TODO(sanjay): Suppress dups?
+  }
+  user_policy_->CreateFilter(keys, n, dst);
+}
+
+bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
+  return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
+}
+
 LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
   size_t usize = user_key.size();
   size_t needed = usize + 13;  // A conservative estimate
diff --git a/db/dbformat.h b/db/dbformat.h
index 044717d..f7f64da 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -8,6 +8,7 @@
 #include <stdio.h>
 #include "leveldb/comparator.h"
 #include "leveldb/db.h"
+#include "leveldb/filter_policy.h"
 #include "leveldb/slice.h"
 #include "leveldb/table_builder.h"
 #include "util/coding.h"
@@ -123,6 +124,17 @@ class InternalKeyComparator : public Comparator {
   int Compare(const InternalKey& a, const InternalKey& b) const;
 };
 
+// Filter policy wrapper that converts from internal keys to user keys
+class InternalFilterPolicy : public FilterPolicy {
+ private:
+  const FilterPolicy* const user_policy_;
+ public:
+  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
+  virtual const char* Name() const;
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
+};
+
 // Modules in this directory should keep internal keys wrapped inside
 // the following class instead of plain strings so that we do not
 // incorrectly use string comparisons instead of an InternalKeyComparator.
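The unwrapping above relies on a property worth spelling out: an internal key
is the user key followed by an 8-byte trailer packing the sequence number and
value type, so extracting the user key only requires dropping the trailer. A
minimal sketch of that layout (ours, not the library's code; std::string
stands in for Slice to keep it self-contained):

    // Internal-key layout assumed by InternalFilterPolicy: user key
    // bytes followed by a fixed64 trailer of (sequence << 8) | type.
    #include <cassert>
    #include <string>

    static std::string MakeInternalKey(const std::string& user_key,
                                       unsigned long long seq, int type) {
      std::string result = user_key;
      unsigned long long packed = (seq << 8) | (unsigned long long)type;
      for (int i = 0; i < 8; i++) {   // append as little-endian fixed64
        result.push_back((char)(packed >> (8 * i)));
      }
      return result;
    }

    static std::string ExtractUserKeyCopy(const std::string& internal_key) {
      assert(internal_key.size() >= 8);
      return internal_key.substr(0, internal_key.size() - 8);
    }

    int main() {
      std::string ikey = MakeInternalKey("foo", 42, 1 /* kTypeValue */);
      assert(ExtractUserKeyCopy(ikey) == "foo");
      return 0;
    }

This is also why the TODO about suppressing duplicates exists: two internal
keys that differ only in their trailers map to the same user key after
unwrapping.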
diff --git a/db/repair.cc b/db/repair.cc index 511c66b..022d52f 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -48,7 +48,8 @@ class Repairer { : dbname_(dbname), env_(options.env), icmp_(options.comparator), - options_(SanitizeOptions(dbname, &icmp_, options)), + ipolicy_(options.filter_policy), + options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), owns_info_log_(options_.info_log != options.info_log), owns_cache_(options_.block_cache != options.block_cache), next_file_number_(1) { @@ -99,6 +100,7 @@ class Repairer { std::string const dbname_; Env* const env_; InternalKeyComparator const icmp_; + InternalFilterPolicy const ipolicy_; Options const options_; bool owns_info_log_; bool owns_cache_; diff --git a/db/table_cache.cc b/db/table_cache.cc index cae79bd..497db27 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -42,23 +42,18 @@ TableCache::~TableCache() { delete cache_; } -Iterator* TableCache::NewIterator(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - Table** tableptr) { - if (tableptr != NULL) { - *tableptr = NULL; - } - +Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, + Cache::Handle** handle) { + Status s; char buf[sizeof(file_number)]; EncodeFixed64(buf, file_number); Slice key(buf, sizeof(buf)); - Cache::Handle* handle = cache_->Lookup(key); - if (handle == NULL) { + *handle = cache_->Lookup(key); + if (*handle == NULL) { std::string fname = TableFileName(dbname_, file_number); RandomAccessFile* file = NULL; Table* table = NULL; - Status s = env_->NewRandomAccessFile(fname, &file); + s = env_->NewRandomAccessFile(fname, &file); if (s.ok()) { s = Table::Open(*options_, file, file_size, &table); } @@ -68,13 +63,28 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, delete file; // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. 
-      return NewErrorIterator(s);
+    } else {
+      TableAndFile* tf = new TableAndFile;
+      tf->file = file;
+      tf->table = table;
+      *handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+    }
+  }
+  return s;
+}
 
-  TableAndFile* tf = new TableAndFile;
-  tf->file = file;
-  tf->table = table;
-  handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  uint64_t file_number,
+                                  uint64_t file_size,
+                                  Table** tableptr) {
+  if (tableptr != NULL) {
+    *tableptr = NULL;
+  }
+
+  Cache::Handle* handle = NULL;
+  Status s = FindTable(file_number, file_size, &handle);
+  if (!s.ok()) {
+    return NewErrorIterator(s);
   }
 
   Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
@@ -86,6 +96,22 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
   return result;
 }
 
+Status TableCache::Get(const ReadOptions& options,
+                       uint64_t file_number,
+                       uint64_t file_size,
+                       const Slice& k,
+                       void* arg,
+                       void (*saver)(void*, const Slice&, const Slice&)) {
+  Cache::Handle* handle = NULL;
+  Status s = FindTable(file_number, file_size, &handle);
+  if (s.ok()) {
+    Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
+    s = t->InternalGet(options, k, arg, saver);
+    cache_->Release(handle);
+  }
+  return s;
+}
+
 void TableCache::Evict(uint64_t file_number) {
   char buf[sizeof(file_number)];
   EncodeFixed64(buf, file_number);
diff --git a/db/table_cache.h b/db/table_cache.h
index 0f3c73b..8cf4aaf 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -35,6 +35,15 @@ class TableCache {
                         uint64_t file_size,
                         Table** tableptr = NULL);
 
+  // If a seek to internal key "k" in specified file finds an entry,
+  // call (*handle_result)(arg, found_key, found_value).
+  Status Get(const ReadOptions& options,
+             uint64_t file_number,
+             uint64_t file_size,
+             const Slice& k,
+             void* arg,
+             void (*handle_result)(void*, const Slice&, const Slice&));
+
   // Evict any entry for the specified file number
   void Evict(uint64_t file_number);
 
@@ -43,6 +52,8 @@ class TableCache {
   const std::string dbname_;
   const Options* options_;
   Cache* cache_;
+
+  Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
 };
 
 }  // namespace leveldb
diff --git a/db/version_set.cc b/db/version_set.cc
index 1310aeb..1f48419 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -255,35 +255,34 @@ void Version::AddIterators(const ReadOptions& options,
   }
 }
 
-// If "*iter" points at a value or deletion for user_key, store
-// either the value, or a NotFound error and return true.
-// Else return false.
-static bool GetValue(const Comparator* cmp,
-                     Iterator* iter, const Slice& user_key,
-                     std::string* value,
-                     Status* s) {
-  if (!iter->Valid()) {
-    return false;
-  }
+// Callback from TableCache::Get()
+namespace {
+enum SaverState {
+  kNotFound,
+  kFound,
+  kDeleted,
+  kCorrupt,
+};
+struct Saver {
+  SaverState state;
+  const Comparator* ucmp;
+  Slice user_key;
+  std::string* value;
+};
+}
+static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
   ParsedInternalKey parsed_key;
-  if (!ParseInternalKey(iter->key(), &parsed_key)) {
-    *s = Status::Corruption("corrupted key for ", user_key);
-    return true;
-  }
-  if (cmp->Compare(parsed_key.user_key, user_key) != 0) {
-    return false;
-  }
-  switch (parsed_key.type) {
-    case kTypeDeletion:
-      *s = Status::NotFound(Slice());  // Use an empty error message for speed
-      break;
-    case kTypeValue: {
-      Slice v = iter->value();
-      value->assign(v.data(), v.size());
-      break;
+  if (!ParseInternalKey(ikey, &parsed_key)) {
+    s->state = kCorrupt;
+  } else {
+    if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
+      s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
+      if (s->state == kFound) {
+        s->value->assign(v.data(), v.size());
+      }
     }
   }
-  return true;
 }
 
 static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
@@ -361,21 +360,27 @@ Status Version::Get(const ReadOptions& options,
       last_file_read = f;
       last_file_read_level = level;
 
-      Iterator* iter = vset_->table_cache_->NewIterator(
-          options,
-          f->number,
-          f->file_size);
-      iter->Seek(ikey);
-      const bool done = GetValue(ucmp, iter, user_key, value, &s);
-      if (!iter->status().ok()) {
-        s = iter->status();
-        delete iter;
+      Saver saver;
+      saver.state = kNotFound;
+      saver.ucmp = ucmp;
+      saver.user_key = user_key;
+      saver.value = value;
+      s = vset_->table_cache_->Get(options, f->number, f->file_size,
+                                   ikey, &saver, SaveValue);
+      if (!s.ok()) {
         return s;
-      } else {
-        delete iter;
-        if (done) {
+      }
+      switch (saver.state) {
+        case kNotFound:
+          break;      // Keep searching in other files
+        case kFound:
+          return s;
+        case kDeleted:
+          s = Status::NotFound(Slice());  // Use empty error message for speed
+          return s;
+        case kCorrupt:
+          s = Status::Corruption("corrupted key for ", user_key);
           return s;
-        }
       }
     }
   }
diff --git a/doc/index.html b/doc/index.html
index 472f7cd..521d2ba 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -400,6 +400,69 @@ We might want to prefix filename keys with one letter (say '/') and
the file_block_key keys with a different letter (say '0') so that scans
over just the metadata do not force us to fetch and cache bulky file
contents.

      +

      Filters

      +

      +Because of the way leveldb data is organized on disk, +a single Get() call may involve multiple reads from disk. +The optional FilterPolicy mechanism can be used to reduce +the number of disk reads substantially. +

      +   leveldb::Options options;
+   options.filter_policy = NewBloomFilterPolicy(10);
      +   leveldb::DB* db;
      +   leveldb::DB::Open(options, "/tmp/testdb", &db);
      +   ... use the database ...
      +   delete db;
      +   delete options.filter_policy;
      +
+The preceding code associates a Bloom filter based filtering policy
+with the database.  Bloom filter based filtering relies on keeping
+some number of bits of data in memory per key (in this case 10 bits
+per key since that is the argument we passed to NewBloomFilterPolicy).
+This filter will reduce the number of unnecessary disk reads needed
+for Get() calls by a factor of approximately 100.  Increasing the bits
+per key will lead to a larger reduction at the cost of more memory
+usage.  We recommend that applications whose working set does not fit
+in memory and that do a lot of random reads set a filter policy.
+
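To make the factor-of-100 claim concrete: with a near-optimal number of
probes (about bits_per_key * ln 2), a Bloom filter's false-positive rate is
roughly 0.6185^bits_per_key. A quick back-of-the-envelope check (our sketch,
not leveldb code):

    // Rough Bloom filter false-positive rate as a function of bits
    // per key; at 10 bits per key this is about 1%, which is where
    // the ~100x reduction in unnecessary reads comes from.
    #include <cmath>
    #include <cstdio>

    int main() {
      for (int bits = 5; bits <= 20; bits += 5) {
        double fp = std::pow(0.6185, bits);
        std::printf("%2d bits/key -> ~%.2f%% false positives\n",
                    bits, fp * 100.0);
      }
      return 0;
    }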

+If you are using a custom comparator, you should ensure that the filter
+policy you are using is compatible with your comparator.  For example,
+consider a comparator that ignores trailing spaces when comparing keys.
+NewBloomFilterPolicy must not be used with such a comparator.
+Instead, the application should provide a custom filter policy that
+also ignores trailing spaces.  For example:
+

      +  class CustomFilterPolicy : public leveldb::FilterPolicy {
      +   private:
      +    FilterPolicy* builtin_policy_;
      +   public:
+    CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) { }
      +    ~CustomFilterPolicy() { delete builtin_policy_; }
      +
      +    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
      +
      +    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
      +      // Use builtin bloom filter code after removing trailing spaces
      +      std::vector<Slice> trimmed(n);
      +      for (int i = 0; i < n; i++) {
      +        trimmed[i] = RemoveTrailingSpaces(keys[i]);
      +      }
+      builtin_policy_->CreateFilter(&trimmed[0], n, dst);
      +    }
      +
      +    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
      +      // Use builtin bloom filter code after removing trailing spaces
      +      return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
      +    }
      +  };
      +
      +

+Advanced applications may provide a filter policy that does not use
+a Bloom filter but uses some other mechanism for summarizing a set
+of keys.  See leveldb/filter_policy.h for details.
+
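As one hypothetical illustration (ours, not part of leveldb) of such a
mechanism, the policy below summarizes a set of keys by the set of first
bytes occurring in them. It is far weaker than a Bloom filter, but it shows
the only hard requirement: KeyMayMatch() may return false positives, never
false negatives.

    #include <string>

    #include "leveldb/filter_policy.h"
    #include "leveldb/slice.h"

    // Records which leading bytes occur among the keys (a 256-bit
    // bitmap); lookups whose first byte was never seen are rejected.
    class FirstByteFilterPolicy : public leveldb::FilterPolicy {
     public:
      virtual const char* Name() const { return "FirstByteFilterPolicy"; }

      virtual void CreateFilter(const leveldb::Slice* keys, int n,
                                std::string* dst) const {
        std::string bitmap(32, '\0');   // 256 bits, one per byte value
        for (int i = 0; i < n; i++) {
          if (!keys[i].empty()) {
            unsigned char c = static_cast<unsigned char>(keys[i][0]);
            bitmap[c / 8] |= (1 << (c % 8));
          }
        }
        dst->append(bitmap);   // append, per the CreateFilter() contract
      }

      virtual bool KeyMayMatch(const leveldb::Slice& key,
                               const leveldb::Slice& filter) const {
        if (filter.size() != 32 || key.empty()) return true;  // be safe
        unsigned char c = static_cast<unsigned char>(key[0]);
        return (filter[c / 8] & (1 << (c % 8))) != 0;
      }
    };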

      Checksums

      leveldb associates checksums with all data it stores in the file system. diff --git a/doc/table_format.txt b/doc/table_format.txt index ad5aa4b..d0f3065 100644 --- a/doc/table_format.txt +++ b/doc/table_format.txt @@ -47,6 +47,47 @@ the BlockHandle of the metaindex and index blocks as well as a magic number. // (40==2*BlockHandle::kMaxEncodedLength) magic: fixed64; // == 0xdb4775248b80fb57 +"filter" Meta Block +------------------- + +If a "FilterPolicy" was specified when the database was opened, a +filter block is stored in each table. The "metaindex" block contains +an entry that maps from "filter." to the BlockHandle for the filter +block where "" is the string returned by the filter policy's +"Name()" method. + +The filter block stores a sequence of filters, where filter i contains +the output of FilterPolicy::CreateFilter() on all keys that are stored +in a block whose file offset falls within the range + + [ i*base ... (i+1)*base-1 ] + +Currently, "base" is 2KB. So for example, if blocks X and Y start in +the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be +converted to a filter by calling FilterPolicy::CreateFilter(), and the +resulting filter will be stored as the first filter in the filter +block. + +The filter block is formatted as follows: + + [filter 0] + [filter 1] + [filter 2] + ... + [filter N-1] + + [offset of filter 0] : 4 bytes + [offset of filter 1] : 4 bytes + [offset of filter 2] : 4 bytes + ... + [offset of filter N-1] : 4 bytes + + [offset of beginning of offset array] : 4 bytes + lg(base) : 1 byte + +The offset array at the end of the filter block allows efficient +mapping from a data block offset to the corresponding filter. + "stats" Meta Block ------------------ diff --git a/include/leveldb/c.h b/include/leveldb/c.h index 0be993d..70e3cc6 100644 --- a/include/leveldb/c.h +++ b/include/leveldb/c.h @@ -55,6 +55,7 @@ typedef struct leveldb_cache_t leveldb_cache_t; typedef struct leveldb_comparator_t leveldb_comparator_t; typedef struct leveldb_env_t leveldb_env_t; typedef struct leveldb_filelock_t leveldb_filelock_t; +typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t; typedef struct leveldb_iterator_t leveldb_iterator_t; typedef struct leveldb_logger_t leveldb_logger_t; typedef struct leveldb_options_t leveldb_options_t; @@ -127,6 +128,11 @@ extern void leveldb_approximate_sizes( const char* const* range_limit_key, const size_t* range_limit_key_len, uint64_t* sizes); +extern void leveldb_compact_range( + leveldb_t* db, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len); + /* Management operations */ extern void leveldb_destroy_db( @@ -177,6 +183,9 @@ extern void leveldb_options_destroy(leveldb_options_t*); extern void leveldb_options_set_comparator( leveldb_options_t*, leveldb_comparator_t*); +extern void leveldb_options_set_filter_policy( + leveldb_options_t*, + leveldb_filterpolicy_t*); extern void leveldb_options_set_create_if_missing( leveldb_options_t*, unsigned char); extern void leveldb_options_set_error_if_exists( @@ -209,6 +218,26 @@ extern leveldb_comparator_t* leveldb_comparator_create( const char* (*name)(void*)); extern void leveldb_comparator_destroy(leveldb_comparator_t*); +/* Filter policy */ + +extern leveldb_filterpolicy_t* leveldb_filterpolicy_create( + void* state, + void (*destructor)(void*), + char* (*create_filter)( + void*, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length), + unsigned char (*key_may_match)( + void*, + 
const char* key, size_t length, + const char* filter, size_t filter_length), + const char* (*name)(void*)); +extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*); + +extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom( + int bits_per_key); + /* Read options */ extern leveldb_readoptions_t* leveldb_readoptions_create(); diff --git a/include/leveldb/db.h b/include/leveldb/db.h index c7d5167..481aad6 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -14,7 +14,7 @@ namespace leveldb { // Update Makefile if you change these static const int kMajorVersion = 1; -static const int kMinorVersion = 3; +static const int kMinorVersion = 4; struct Options; struct ReadOptions; diff --git a/include/leveldb/filter_policy.h b/include/leveldb/filter_policy.h new file mode 100644 index 0000000..1fba080 --- /dev/null +++ b/include/leveldb/filter_policy.h @@ -0,0 +1,70 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A database can be configured with a custom FilterPolicy object. +// This object is responsible for creating a small filter from a set +// of keys. These filters are stored in leveldb and are consulted +// automatically by leveldb to decide whether or not to read some +// information from disk. In many cases, a filter can cut down the +// number of disk seeks form a handful to a single disk seek per +// DB::Get() call. +// +// Most people will want to use the builtin bloom filter support (see +// NewBloomFilterPolicy() below). + +#ifndef STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_ +#define STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_ + +#include + +namespace leveldb { + +class Slice; + +class FilterPolicy { + public: + virtual ~FilterPolicy(); + + // Return the name of this policy. Note that if the filter encoding + // changes in an incompatible way, the name returned by this method + // must be changed. Otherwise, old incompatible filters may be + // passed to methods of this type. + virtual const char* Name() const = 0; + + // keys[0,n-1] contains a list of keys (potentially with duplicates) + // that are ordered according to the user supplied comparator. + // Append a filter that summarizes keys[0,n-1] to *dst. + // + // Warning: do not change the initial contents of *dst. Instead, + // append the newly constructed filter to *dst. + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) + const = 0; + + // "filter" contains the data appended by a preceding call to + // CreateFilter() on this class. This method must return true if + // the key was in the list of keys passed to CreateFilter(). + // This method may return true or false if the key was not on the + // list, but it should aim to return false with a high probability. + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; +}; + +// Return a new filter policy that uses a bloom filter with approximately +// the specified number of bits per key. A good value for bits_per_key +// is 10, which yields a filter with ~ 1% false positive rate. +// +// Callers must delete the result after any database that is using the +// result has been closed. +// +// Note: if you are using a custom comparator that ignores some parts +// of the keys being compared, you must not use NewBloomFilterPolicy() +// and must provide your own FilterPolicy that also ignores the +// corresponding parts of the keys. 
For example, if the comparator +// ignores trailing spaces, it would be incorrect to use a +// FilterPolicy (like NewBloomFilterPolicy) that does not ignore +// trailing spaces in keys. +extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); + +} + +#endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_ diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 79111a0..fdda718 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -12,6 +12,7 @@ namespace leveldb { class Cache; class Comparator; class Env; +class FilterPolicy; class Logger; class Snapshot; @@ -127,6 +128,13 @@ struct Options { // efficiently detect that and will switch to uncompressed mode. CompressionType compression; + // If non-NULL, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + // + // Default: NULL + const FilterPolicy* filter_policy; + // Create an Options object with default values for all fields. Options(); }; diff --git a/include/leveldb/table.h b/include/leveldb/table.h index 0cbdd40..a9746c3 100644 --- a/include/leveldb/table.h +++ b/include/leveldb/table.h @@ -12,9 +12,11 @@ namespace leveldb { class Block; class BlockHandle; +class Footer; struct Options; class RandomAccessFile; struct ReadOptions; +class TableCache; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from @@ -60,6 +62,19 @@ class Table { explicit Table(Rep* rep) { rep_ = rep; } static Iterator* BlockReader(void*, const ReadOptions&, const Slice&); + // Calls (*handle_result)(arg, ...) with the entry found after a call + // to Seek(key). May not make such a call if filter policy says + // that key is not present. 
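Before the remaining Table plumbing below, a minimal sketch of how an application opts in to the new filter support through the public headers added in this patch; the database path is illustrative and error handling is elided:

    #include <cassert>
    #include <string>

    #include "leveldb/db.h"
    #include "leveldb/filter_policy.h"

    int main() {
      leveldb::Options options;
      options.create_if_missing = true;
      // ~10 bits per key yields a ~1% false positive rate (see above).
      options.filter_policy = leveldb::NewBloomFilterPolicy(10);

      leveldb::DB* db;
      leveldb::Status s = leveldb::DB::Open(options, "/tmp/filterdb", &db);
      assert(s.ok());
      assert(db->Put(leveldb::WriteOptions(), "key1", "value1").ok());

      // For absent keys, the filter usually lets Get() answer NOT_FOUND
      // without reading any data block from disk.
      std::string value;
      s = db->Get(leveldb::ReadOptions(), "no-such-key", &value);
      assert(s.IsNotFound());

      delete db;
      delete options.filter_policy;  // must outlive the DB (see header comment)
      return 0;
    }
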
+ friend class TableCache; + Status InternalGet( + const ReadOptions&, const Slice& key, + void* arg, + void (*handle_result)(void* arg, const Slice& k, const Slice& v)); + + + void ReadMeta(const Footer& footer); + void ReadFilter(const Slice& filter_handle_value); + // No copying allowed Table(const Table&); void operator=(const Table&);
diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h index 9ac0868..5fd1dc7 100644 --- a/include/leveldb/table_builder.h +++ b/include/leveldb/table_builder.h @@ -77,6 +77,7 @@ class TableBuilder { private: bool ok() const { return status().ok(); } void WriteBlock(BlockBuilder* block, BlockHandle* handle); + void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle); struct Rep; Rep* rep_;
diff --git a/port/port_android.h b/port/port_android.h index 92f0090..b733388 100644 --- a/port/port_android.h +++ b/port/port_android.h @@ -78,6 +78,9 @@ class CondVar {
diff --git a/table/block.cc b/table/block.cc --- a/table/block.cc +++ b/table/block.cc #include "leveldb/comparator.h" +#include "table/format.h" #include "util/coding.h" #include "util/logging.h" @@ -19,10 +20,10 @@ inline uint32_t Block::NumRestarts() const { return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } -Block::Block(const char* data, size_t size, bool take_ownership) - : data_(data), - size_(size), - owned_(take_ownership) { +Block::Block(const BlockContents& contents) + : data_(contents.data.data()), + size_(contents.data.size()), + owned_(contents.heap_allocated) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else {
diff --git a/table/block.h b/table/block.h index 76088a4..2493eb9 100644 --- a/table/block.h +++ b/table/block.h @@ -11,14 +11,13 @@ namespace leveldb { +struct BlockContents; class Comparator; class Block { public: // Initialize the block with the specified contents. - // Takes ownership of data[] and will delete[] it when done iff - // "take_ownership is true. - Block(const char* data, size_t size, bool take_ownership); + explicit Block(const BlockContents& contents); ~Block();
diff --git a/table/filter_block.cc b/table/filter_block.cc new file mode 100644 index 0000000..203e15c --- /dev/null +++ b/table/filter_block.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/filter_block.h" + +#include "leveldb/filter_policy.h" +#include "util/coding.h" + +namespace leveldb { + +// See doc/table_format.txt for an explanation of the filter block format.
+ +// Generate new filter every 2KB of data +static const size_t kFilterBaseLg = 11; +static const size_t kFilterBase = 1 << kFilterBaseLg; + +FilterBlockBuilder::FilterBlockBuilder(const FilterPolicy* policy) + : policy_(policy) { +} + +void FilterBlockBuilder::StartBlock(uint64_t block_offset) { + uint64_t filter_index = (block_offset / kFilterBase); + assert(filter_index >= filter_offsets_.size()); + while (filter_index > filter_offsets_.size()) { + GenerateFilter(); + } +} + +void FilterBlockBuilder::AddKey(const Slice& key) { + Slice k = key; + start_.push_back(keys_.size()); + keys_.append(k.data(), k.size()); +} + +Slice FilterBlockBuilder::Finish() { + if (!start_.empty()) { + GenerateFilter(); + } + + // Append array of per-filter offsets + const uint32_t array_offset = result_.size(); + for (size_t i = 0; i < filter_offsets_.size(); i++) { + PutFixed32(&result_, filter_offsets_[i]); + } + + PutFixed32(&result_, array_offset); + result_.push_back(kFilterBaseLg); // Save encoding parameter in result + return Slice(result_); +} + +void FilterBlockBuilder::GenerateFilter() { + const size_t num_keys = start_.size(); + if (num_keys == 0) { + // Fast path if there are no keys for this filter + filter_offsets_.push_back(result_.size()); + return; + } + + // Make list of keys from flattened key structure + start_.push_back(keys_.size()); // Simplify length computation + tmp_keys_.resize(num_keys); + for (size_t i = 0; i < num_keys; i++) { + const char* base = keys_.data() + start_[i]; + size_t length = start_[i+1] - start_[i]; + tmp_keys_[i] = Slice(base, length); + } + + // Generate filter for current set of keys and append to result_. + filter_offsets_.push_back(result_.size()); + policy_->CreateFilter(&tmp_keys_[0], num_keys, &result_); + + tmp_keys_.clear(); + keys_.clear(); + start_.clear(); +} + +FilterBlockReader::FilterBlockReader(const FilterPolicy* policy, + const Slice& contents) + : policy_(policy), + data_(NULL), + offset_(NULL), + num_(0), + base_lg_(0) { + size_t n = contents.size(); + if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array + base_lg_ = contents[n-1]; + uint32_t last_word = DecodeFixed32(contents.data() + n - 5); + if (last_word > n - 5) return; + data_ = contents.data(); + offset_ = data_ + last_word; + num_ = (n - 5 - last_word) / 4; +} + +bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) { + uint64_t index = block_offset >> base_lg_; + if (index < num_) { + uint32_t start = DecodeFixed32(offset_ + index*4); + uint32_t limit = DecodeFixed32(offset_ + index*4 + 4); + if (start <= limit && limit <= (offset_ - data_)) { + Slice filter = Slice(data_ + start, limit - start); + return policy_->KeyMayMatch(key, filter); + } else if (start == limit) { + // Empty filters do not match any keys + return false; + } + } + return true; // Errors are treated as potential matches +} + +} diff --git a/table/filter_block.h b/table/filter_block.h new file mode 100644 index 0000000..c67d010 --- /dev/null +++ b/table/filter_block.h @@ -0,0 +1,68 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. 
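The reader's offset-to-filter mapping above is pure arithmetic; a small standalone sketch of the computation (kFilterBaseLg is the constant from filter_block.cc, the offsets are illustrative):

    #include <cassert>
    #include <stdint.h>

    int main() {
      const int kFilterBaseLg = 11;   // base == 2KB, as in filter_block.cc
      // A data block starting at offset 3100 falls in [2KB, 4KB-1], so it
      // is covered by filter 1 (the MultiChunk test below relies on this).
      assert((uint64_t(3100) >> kFilterBaseLg) == 1);
      // Blocks starting in [0, 2KB-1] all map to filter 0.
      assert((uint64_t(0) >> kFilterBaseLg) == 0);
      assert((uint64_t(2047) >> kFilterBaseLg) == 0);
      return 0;
    }
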
+ +#ifndef STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_ +#define STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_ + +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <vector> +#include "leveldb/slice.h" +#include "util/hash.h" + +namespace leveldb { + +class FilterPolicy; + +// A FilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// +// The sequence of calls to FilterBlockBuilder must match the regexp: +// (StartBlock AddKey*)* Finish +class FilterBlockBuilder { + public: + explicit FilterBlockBuilder(const FilterPolicy*); + + void StartBlock(uint64_t block_offset); + void AddKey(const Slice& key); + Slice Finish(); + + private: + void GenerateFilter(); + + const FilterPolicy* policy_; + std::string keys_; // Flattened key contents + std::vector<size_t> start_; // Starting index in keys_ of each key + std::string result_; // Filter data computed so far + std::vector<Slice> tmp_keys_; // policy_->CreateFilter() argument + std::vector<uint32_t> filter_offsets_; + + // No copying allowed + FilterBlockBuilder(const FilterBlockBuilder&); + void operator=(const FilterBlockBuilder&); +}; + +class FilterBlockReader { + public: + // REQUIRES: "contents" and *policy must stay live while *this is live. + FilterBlockReader(const FilterPolicy* policy, const Slice& contents); + bool KeyMayMatch(uint64_t block_offset, const Slice& key); + + private: + const FilterPolicy* policy_; + const char* data_; // Pointer to filter data (at block-start) + const char* offset_; // Pointer to beginning of offset array (at block-end) + size_t num_; // Number of entries in offset array + size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) +}; + +} + +#endif // STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_
diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc new file mode 100644 index 0000000..3a2a07c --- /dev/null +++ b/table/filter_block_test.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
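The test below exercises exactly this call sequence; for orientation, a compressed sketch of one builder/reader round trip under the "(StartBlock AddKey*)* Finish" contract ("policy" can be any FilterPolicy, e.g. the TestHashFilter defined below):

    #include <cassert>
    #include "leveldb/filter_policy.h"
    #include "table/filter_block.h"

    void RoundTrip(const leveldb::FilterPolicy* policy) {
      leveldb::FilterBlockBuilder builder(policy);
      builder.StartBlock(0);       // data block at file offset 0
      builder.AddKey("foo");
      builder.AddKey("bar");
      builder.StartBlock(3000);    // next data block at offset 3000
      builder.AddKey("box");       // lands in filter 1 (3000 >> 11 == 1)
      leveldb::Slice block = builder.Finish();

      // "block" points into the builder, which must stay live (see header).
      leveldb::FilterBlockReader reader(policy, block);
      assert(reader.KeyMayMatch(0, "foo"));      // filters never give false negatives
      assert(reader.KeyMayMatch(3000, "box"));
    }
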
+ +#include "table/filter_block.h" + +#include "leveldb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class FilterBlockTest { + public: + TestHashFilter policy_; +}; + +TEST(FilterBlockTest, EmptyBuilder) { + FilterBlockBuilder builder(&policy_); + Slice block = builder.Finish(); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); + FilterBlockReader reader(&policy_, block); + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); +} + +TEST(FilterBlockTest, SingleChunk) { + FilterBlockBuilder builder(&policy_); + builder.StartBlock(100); + builder.AddKey("foo"); + builder.AddKey("bar"); + builder.AddKey("box"); + builder.StartBlock(200); + builder.AddKey("box"); + builder.StartBlock(300); + builder.AddKey("hello"); + Slice block = builder.Finish(); + FilterBlockReader reader(&policy_, block); + ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); + ASSERT_TRUE(reader.KeyMayMatch(100, "box")); + ASSERT_TRUE(reader.KeyMayMatch(100, "hello")); + ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(100, "missing")); + ASSERT_TRUE(! reader.KeyMayMatch(100, "other")); +} + +TEST(FilterBlockTest, MultiChunk) { + FilterBlockBuilder builder(&policy_); + + // First filter + builder.StartBlock(0); + builder.AddKey("foo"); + builder.StartBlock(2000); + builder.AddKey("bar"); + + // Second filter + builder.StartBlock(3100); + builder.AddKey("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.AddKey("box"); + builder.AddKey("hello"); + + Slice block = builder.Finish(); + FilterBlockReader reader(&policy_, block); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(2000, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(0, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(0, "hello")); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch(3100, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello")); + + // Check third filter (empty) + ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello")); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch(9000, "box")); + ASSERT_TRUE(reader.KeyMayMatch(9000, "hello")); + ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo")); + ASSERT_TRUE(! 
reader.KeyMayMatch(9000, "bar")); +} + +} // namespace leveldb + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/table/format.cc b/table/format.cc index 25b85a2..cda1dec 100644 --- a/table/format.cc +++ b/table/format.cc @@ -66,10 +66,10 @@ Status Footer::DecodeFrom(Slice* input) { Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, - Block** block, - bool* may_cache) { - *block = NULL; - *may_cache = false; + BlockContents* result) { + result->data = Slice(); + result->cachable = false; + result->heap_allocated = false; // Read the block contents as well as the type/crc footer. // See table_builder.cc for the code that built this structure. @@ -105,11 +105,13 @@ Status ReadBlock(RandomAccessFile* file, // Use it directly under the assumption that it will be live // while the file is open. delete[] buf; - *block = new Block(data, n, false /* do not take ownership */); - *may_cache = false; // Do not double-cache + result->data = Slice(data, n); + result->heap_allocated = false; + result->cachable = false; // Do not double-cache } else { - *block = new Block(buf, n, true /* take ownership */); - *may_cache = true; + result->data = Slice(buf, n); + result->heap_allocated = true; + result->cachable = true; } // Ok @@ -127,8 +129,9 @@ Status ReadBlock(RandomAccessFile* file, return Status::Corruption("corrupted compressed block contents"); } delete[] buf; - *block = new Block(ubuf, ulength, true /* take ownership */); - *may_cache = true; + result->data = Slice(ubuf, ulength); + result->heap_allocated = true; + result->cachable = true; break; } default: diff --git a/table/format.h b/table/format.h index 66a15da..6c0b80c 100644 --- a/table/format.h +++ b/table/format.h @@ -83,16 +83,18 @@ static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull; // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; -// Read the block identified by "handle" from "file". On success, -// store a pointer to the heap-allocated result in *block and return -// OK. On failure store NULL in *block and return non-OK. -// On success, stores true in *may_cache if the result may be -// cached, false if it must not be cached. +struct BlockContents { + Slice data; // Actual contents of data + bool cachable; // True iff data can be cached + bool heap_allocated; // True iff caller should delete[] data.data() +}; + +// Read the block identified by "handle" from "file". On failure +// return non-OK. On success fill *result and return OK. extern Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, - Block** block, - bool* may_cache); + BlockContents* result); // Implementation details follow. 
Clients should ignore, diff --git a/table/table.cc b/table/table.cc index 07dcffd..dbd6d3a 100644 --- a/table/table.cc +++ b/table/table.cc @@ -5,8 +5,12 @@ #include "leveldb/table.h" #include "leveldb/cache.h" +#include "leveldb/comparator.h" #include "leveldb/env.h" +#include "leveldb/filter_policy.h" +#include "leveldb/options.h" #include "table/block.h" +#include "table/filter_block.h" #include "table/format.h" #include "table/two_level_iterator.h" #include "util/coding.h" @@ -15,6 +19,8 @@ namespace leveldb { struct Table::Rep { ~Rep() { + delete filter; + delete [] filter_data; delete index_block; } @@ -22,6 +28,8 @@ struct Table::Rep { Status status; RandomAccessFile* file; uint64_t cache_id; + FilterBlockReader* filter; + const char* filter_data; BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer Block* index_block; @@ -47,11 +55,13 @@ Status Table::Open(const Options& options, if (!s.ok()) return s; // Read the index block + BlockContents contents; Block* index_block = NULL; if (s.ok()) { - bool may_cache; // Ignored result - s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block, - &may_cache); + s = ReadBlock(file, ReadOptions(), footer.index_handle(), &contents); + if (s.ok()) { + index_block = new Block(contents); + } } if (s.ok()) { @@ -63,7 +73,10 @@ Status Table::Open(const Options& options, rep->metaindex_handle = footer.metaindex_handle(); rep->index_block = index_block; rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0); + rep->filter_data = NULL; + rep->filter = NULL; *table = new Table(rep); + (*table)->ReadMeta(footer); } else { if (index_block) delete index_block; } @@ -71,6 +84,52 @@ Status Table::Open(const Options& options, return s; } +void Table::ReadMeta(const Footer& footer) { + if (rep_->options.filter_policy == NULL) { + return; // Do not need any metadata + } + + // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates + // it is an empty block. + ReadOptions opt; + BlockContents contents; + if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) { + // Do not propagate errors since meta info is not needed for operation + return; + } + Block* meta = new Block(contents); + + Iterator* iter = meta->NewIterator(BytewiseComparator()); + std::string key = "filter."; + key.append(rep_->options.filter_policy->Name()); + iter->Seek(key); + if (iter->Valid() && iter->key() == Slice(key)) { + ReadFilter(iter->value()); + } + delete iter; + delete meta; +} + +void Table::ReadFilter(const Slice& filter_handle_value) { + Slice v = filter_handle_value; + BlockHandle filter_handle; + if (!filter_handle.DecodeFrom(&v).ok()) { + return; + } + + // We might want to unify with ReadBlock() if we start + // requiring checksum verification in Table::Open. + ReadOptions opt; + BlockContents block; + if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) { + return; + } + if (block.heap_allocated) { + rep_->filter_data = block.data.data(); // Will need to delete later + } + rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data); +} + Table::~Table() { delete rep_; } @@ -107,7 +166,7 @@ Iterator* Table::BlockReader(void* arg, // can add more features in the future. 
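The BlockContents refactor above changes every ReadBlock call site; the new contract in miniature, mirroring the table.cc usage whose diff continues below ("file" and "handle" come from the surrounding Table code):

    #include "leveldb/env.h"
    #include "leveldb/options.h"
    #include "leveldb/status.h"
    #include "table/block.h"
    #include "table/format.h"

    leveldb::Block* ReadOneBlock(leveldb::RandomAccessFile* file,
                                 const leveldb::BlockHandle& handle) {
      leveldb::BlockContents contents;
      leveldb::Status s =
          leveldb::ReadBlock(file, leveldb::ReadOptions(), handle, &contents);
      if (!s.ok()) return NULL;
      // Block now takes a BlockContents, which also records whether the
      // bytes are heap-allocated and whether they may be cached.
      return new leveldb::Block(contents);
    }
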
if (s.ok()) { - bool may_cache; + BlockContents contents; if (block_cache != NULL) { char cache_key_buffer[16]; EncodeFixed64(cache_key_buffer, table->rep_->cache_id); @@ -117,14 +176,20 @@ Iterator* Table::BlockReader(void* arg, if (cache_handle != NULL) { block = reinterpret_cast<Block*>(block_cache->Value(cache_handle)); } else { - s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache); - if (s.ok() && may_cache && options.fill_cache) { - cache_handle = block_cache->Insert( - key, block, block->size(), &DeleteCachedBlock); + s = ReadBlock(table->rep_->file, options, handle, &contents); + if (s.ok()) { + block = new Block(contents); + if (contents.cachable && options.fill_cache) { + cache_handle = block_cache->Insert( + key, block, block->size(), &DeleteCachedBlock); + } } } } else { - s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache); + s = ReadBlock(table->rep_->file, options, handle, &contents); + if (s.ok()) { + block = new Block(contents); + } } } @@ -148,6 +213,39 @@ Iterator* Table::NewIterator(const ReadOptions& options) const { &Table::BlockReader, const_cast<Table*>(this), options); } +Status Table::InternalGet(const ReadOptions& options, const Slice& k, + void* arg, + void (*saver)(void*, const Slice&, const Slice&)) { + Status s; + Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); + iiter->Seek(k); + if (iiter->Valid()) { + Slice handle_value = iiter->value(); + FilterBlockReader* filter = rep_->filter; + BlockHandle handle; + if (filter != NULL && + handle.DecodeFrom(&handle_value).ok() && + !filter->KeyMayMatch(handle.offset(), k)) { + // Not found + } else { + Slice handle = iiter->value(); + Iterator* block_iter = BlockReader(this, options, iiter->value()); + block_iter->Seek(k); + if (block_iter->Valid()) { + (*saver)(arg, block_iter->key(), block_iter->value()); + } + s = block_iter->status(); + delete block_iter; + } + } + if (s.ok()) { + s = iiter->status(); + } + delete iiter; + return s; +} + + uint64_t Table::ApproximateOffsetOf(const Slice& key) const { Iterator* index_iter = rep_->index_block->NewIterator(rep_->options.comparator);
diff --git a/table/table_builder.cc b/table/table_builder.cc index 682ce5b..62002c8 100644 --- a/table/table_builder.cc +++ b/table/table_builder.cc @@ -5,14 +5,15 @@ #include "leveldb/table_builder.h" #include <assert.h> -#include <stdio.h> #include "leveldb/comparator.h" #include "leveldb/env.h" +#include "leveldb/filter_policy.h" +#include "leveldb/options.h" #include "table/block_builder.h" +#include "table/filter_block.h" #include "table/format.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/logging.h" namespace leveldb { @@ -27,6 +28,7 @@ struct TableBuilder::Rep { std::string last_key; int64_t num_entries; bool closed; // Either Finish() or Abandon() has been called. + FilterBlockBuilder* filter_block; // We do not emit the index entry for a block until we have seen the // first key for the next data block. This allows us to use shorter @@ -51,6 +53,8 @@ struct TableBuilder::Rep { index_block(&index_block_options), num_entries(0), closed(false), + filter_block(opt.filter_policy == NULL ?
NULL + : new FilterBlockBuilder(opt.filter_policy)), pending_index_entry(false) { index_block_options.block_restart_interval = 1; } @@ -58,10 +62,14 @@ struct TableBuilder::Rep { TableBuilder::TableBuilder(const Options& options, WritableFile* file) : rep_(new Rep(options, file)) { + if (rep_->filter_block != NULL) { + rep_->filter_block->StartBlock(0); + } } TableBuilder::~TableBuilder() { assert(rep_->closed); // Catch errors where caller forgot to call Finish() + delete rep_->filter_block; delete rep_; } @@ -98,6 +106,10 @@ void TableBuilder::Add(const Slice& key, const Slice& value) { r->pending_index_entry = false; } + if (r->filter_block != NULL) { + r->filter_block->AddKey(key); + } + r->last_key.assign(key.data(), key.size()); r->num_entries++; r->data_block.Add(key, value); @@ -119,6 +131,9 @@ void TableBuilder::Flush() { r->pending_index_entry = true; r->status = r->file->Flush(); } + if (r->filter_block != NULL) { + r->filter_block->StartBlock(r->offset); + } } void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { @@ -152,6 +167,15 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { break; } } + WriteRawBlock(block_contents, type, handle); + r->compressed_output.clear(); + block->Reset(); +} + +void TableBuilder::WriteRawBlock(const Slice& block_contents, + CompressionType type, + BlockHandle* handle) { + Rep* r = rep_; handle->set_offset(r->offset); handle->set_size(block_contents.size()); r->status = r->file->Append(block_contents); @@ -166,8 +190,6 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { r->offset += block_contents.size() + kBlockTrailerSize; } } - r->compressed_output.clear(); - block->Reset(); } Status TableBuilder::status() const { @@ -179,13 +201,32 @@ Status TableBuilder::Finish() { Flush(); assert(!r->closed); r->closed = true; - BlockHandle metaindex_block_handle; - BlockHandle index_block_handle; + + BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle; + + // Write filter block + if (ok() && r->filter_block != NULL) { + WriteRawBlock(r->filter_block->Finish(), kNoCompression, + &filter_block_handle); + } + + // Write metaindex block if (ok()) { BlockBuilder meta_index_block(&r->options); + if (r->filter_block != NULL) { + // Add mapping from "filter.Name" to location of filter data + std::string key = "filter."; + key.append(r->options.filter_policy->Name()); + std::string handle_encoding; + filter_block_handle.EncodeTo(&handle_encoding); + meta_index_block.Add(key, handle_encoding); + } + // TODO(postrelease): Add stats and other meta blocks WriteBlock(&meta_index_block, &metaindex_block_handle); } + + // Write index block if (ok()) { if (r->pending_index_entry) { r->options.comparator->FindShortSuccessor(&r->last_key); @@ -196,6 +237,8 @@ Status TableBuilder::Finish() { } WriteBlock(&r->index_block, &index_block_handle); } + + // Write footer if (ok()) { Footer footer; footer.set_metaindex_handle(metaindex_block_handle); diff --git a/table/table_test.cc b/table/table_test.cc index 0c8e676..57cea25 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -168,8 +168,6 @@ class Constructor { // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; - virtual size_t NumBytes() const = 0; - virtual Iterator* NewIterator() const = 0; virtual const KVMap& data() { return data_; } @@ -185,7 +183,6 @@ class BlockConstructor: public Constructor { explicit BlockConstructor(const Comparator* cmp) 
: Constructor(cmp), comparator_(cmp), - block_size_(-1), block_(NULL) { } ~BlockConstructor() { delete block_; @@ -201,22 +198,21 @@ class BlockConstructor: public Constructor { builder.Add(it->first, it->second); } // Open the block - Slice block_data = builder.Finish(); - block_size_ = block_data.size(); - char* block_data_copy = new char[block_size_]; - memcpy(block_data_copy, block_data.data(), block_size_); - block_ = new Block(block_data_copy, block_size_, true /* take ownership */); + data_ = builder.Finish().ToString(); + BlockContents contents; + contents.data = data_; + contents.cachable = false; + contents.heap_allocated = false; + block_ = new Block(contents); return Status::OK(); } - virtual size_t NumBytes() const { return block_size_; } - virtual Iterator* NewIterator() const { return block_->NewIterator(comparator_); } private: const Comparator* comparator_; - int block_size_; + std::string data_; Block* block_; BlockConstructor(); @@ -253,7 +249,6 @@ class TableConstructor: public Constructor { table_options.comparator = options.comparator; return Table::Open(table_options, source_, sink.contents().size(), &table_); } - virtual size_t NumBytes() const { return source_->Size(); } virtual Iterator* NewIterator() const { return table_->NewIterator(ReadOptions()); @@ -342,10 +337,6 @@ class MemTableConstructor: public Constructor { } return Status::OK(); } - virtual size_t NumBytes() const { - return memtable_->ApproximateMemoryUsage(); - } - virtual Iterator* NewIterator() const { return new KeyConvertingIterator(memtable_->NewIterator()); } @@ -379,13 +370,6 @@ class DBConstructor: public Constructor { } return Status::OK(); } - virtual size_t NumBytes() const { - Range r("", "\xff\xff"); - uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); - return size; - } - virtual Iterator* NewIterator() const { return db_->NewIterator(ReadOptions()); } @@ -809,7 +793,7 @@ TEST(TableTest, ApproximateOffsetOfPlain) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); } diff --git a/util/bloom.cc b/util/bloom.cc new file mode 100644 index 0000000..d7941cd --- /dev/null +++ b/util/bloom.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "leveldb/filter_policy.h" + +#include "leveldb/slice.h" +#include "util/hash.h" + +namespace leveldb { + +namespace { +static uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} + +class BloomFilterPolicy : public FilterPolicy { + private: + size_t bits_per_key_; + size_t k_; + + public: + explicit BloomFilterPolicy(int bits_per_key) + : bits_per_key_(bits_per_key) { + // We intentionally round down to reduce probing cost a little bit + k_ = static_cast(bits_per_key * 0.69); // 0.69 =~ ln(2) + if (k_ < 1) k_ = 1; + if (k_ > 30) k_ = 30; + } + + virtual const char* Name() const { + return "leveldb.BuiltinBloomFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + // Compute bloom filter size (in both bits and bytes) + size_t bits = n * bits_per_key_; + + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. + if (bits < 64) bits = 64; + + size_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast(k_)); // Remember # of probes in filter + char* array = &(*dst)[init_size]; + for (size_t i = 0; i < n; i++) { + // Use double-hashing to generate a sequence of hash values. + // See analysis in [Kirsch,Mitzenmacher 2006]. + uint32_t h = BloomHash(keys[i]); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < k_; j++) { + const uint32_t bitpos = h % bits; + array[bitpos/8] |= (1 << (bitpos % 8)); + h += delta; + } + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { + const size_t len = bloom_filter.size(); + if (len < 2) return false; + + const char* array = bloom_filter.data(); + const size_t bits = (len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + const size_t k = array[len-1]; + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + + uint32_t h = BloomHash(key); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < k; j++) { + const uint32_t bitpos = h % bits; + if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; + h += delta; + } + return true; + } +}; +} + +const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) { + return new BloomFilterPolicy(bits_per_key); +} + +} // namespace leveldb diff --git a/util/bloom_test.cc b/util/bloom_test.cc new file mode 100644 index 0000000..4a6ea1b --- /dev/null +++ b/util/bloom_test.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "leveldb/filter_policy.h" + +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +static const int kVerbose = 1; + +static Slice Key(int i, char* buffer) { + memcpy(buffer, &i, sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +class BloomTest { + private: + const FilterPolicy* policy_; + std::string filter_; + std::vector keys_; + + public: + BloomTest() : policy_(NewBloomFilterPolicy(10)) { } + + ~BloomTest() { + delete policy_; + } + + void Reset() { + keys_.clear(); + filter_.clear(); + } + + void Add(const Slice& s) { + keys_.push_back(s.ToString()); + } + + void Build() { + std::vector key_slices; + for (size_t i = 0; i < keys_.size(); i++) { + key_slices.push_back(Slice(keys_[i])); + } + filter_.clear(); + policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_); + keys_.clear(); + if (kVerbose >= 2) DumpFilter(); + } + + size_t FilterSize() const { + return filter_.size(); + } + + void DumpFilter() { + fprintf(stderr, "F("); + for (size_t i = 0; i+1 < filter_.size(); i++) { + const unsigned int c = static_cast(filter_[i]); + for (int j = 0; j < 8; j++) { + fprintf(stderr, "%c", (c & (1 <KeyMayMatch(s, filter_); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST(BloomTest, EmptyFilter) { + ASSERT_TRUE(! Matches("hello")); + ASSERT_TRUE(! Matches("world")); +} + +TEST(BloomTest, Small) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(! Matches("x")); + ASSERT_TRUE(! Matches("foo")); +} + +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + +TEST(BloomTest, VaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (length * 10 / 8) + 40) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often + else good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + +// Different bits-per-byte + +} // namespace leveldb + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/filter_policy.cc b/util/filter_policy.cc new file mode 100644 index 0000000..7b045c8 --- /dev/null +++ b/util/filter_policy.cc @@ -0,0 +1,11 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "leveldb/filter_policy.h" + +namespace leveldb { + +FilterPolicy::~FilterPolicy() { } + +} // namespace leveldb diff --git a/util/options.cc b/util/options.cc index bb97838..76af5b9 100644 --- a/util/options.cc +++ b/util/options.cc @@ -21,7 +21,8 @@ Options::Options() block_cache(NULL), block_size(4096), block_restart_interval(16), - compression(kSnappyCompression) { + compression(kSnappyCompression), + filter_policy(NULL) { } -- cgit v1.2.3 From 158f767acaed4c39cbb3ee8128fe896e155ec40c Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Wed, 30 May 2012 16:53:11 +0000 Subject: Remove static initializer; fix endian-ness detection; fix build on various platforms; improve android port speed git-svn-id: http://leveldb.googlecode.com/svn/trunk@66 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 27 ++++--- build_detect_platform | 37 +++++++--- db/c_test.c | 11 ++- db/db_bench.cc | 10 ++- db/db_impl.cc | 23 ++++-- db/db_impl.h | 2 +- db/db_test.cc | 63 ++++++++++++++--- db/version_set.cc | 4 +- db/version_set.h | 9 +++ doc/bench/db_bench_sqlite3.cc | 28 +++++++- doc/bench/db_bench_tree_db.cc | 30 ++++++-- port/atomic_pointer.h | 18 +++-- port/port.h | 2 - port/port_android.cc | 64 ----------------- port/port_android.h | 159 ------------------------------------------ port/port_example.h | 10 +++ port/port_posix.cc | 4 ++ port/port_posix.h | 33 ++++++--- util/coding_test.cc | 23 ++++++ util/comparator.cc | 11 ++- 20 files changed, 283 insertions(+), 285 deletions(-) delete mode 100644 port/port_android.cc delete mode 100644 port/port_android.h diff --git a/Makefile b/Makefile index b961ba1..5cd9d37 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,6 @@ # found in the LICENSE file. See the AUTHORS file for names of contributors. # Inherit some settings from environment variables, if available -CXX ?= g++ -CC ?= gcc INSTALL_PATH ?= $(CURDIR) #----------------------------------------------- @@ -63,6 +61,13 @@ default: all # Should we build shared libraries? ifneq ($(PLATFORM_SHARED_EXT),) + +ifneq ($(PLATFORM_SHARED_VERSIONED),true) +SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) +SHARED2 = $(SHARED1) +SHARED3 = $(SHARED1) +SHARED = $(SHARED1) +else # Update db.h if you change these. SHARED_MAJOR = 1 SHARED_MINOR = 4 @@ -70,14 +75,17 @@ SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) SHARED = $(SHARED1) $(SHARED2) $(SHARED3) -$(SHARED3): - $(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(INSTALL_PATH)/$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3) -$(SHARED2): $(SHARED3) - ln -fs $(SHARED3) $(SHARED2) $(SHARED1): $(SHARED3) ln -fs $(SHARED3) $(SHARED1) +$(SHARED2): $(SHARED3) + ln -fs $(SHARED3) $(SHARED2) endif +$(SHARED3): + $(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3) + +endif # PLATFORM_SHARED_EXT + all: $(SHARED) $(LIBRARY) check: all $(PROGRAMS) $(TESTS) @@ -164,9 +172,10 @@ memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHAR ifeq ($(PLATFORM), IOS) # For iOS, create universal object files to be used on both the simulator and # a device. 
-SIMULATORROOT=/Developer/Platforms/iPhoneSimulator.platform/Developer -DEVICEROOT=/Developer/Platforms/iPhoneOS.platform/Developer -IOSVERSION=$(shell defaults read /Developer/Platforms/iPhoneOS.platform/version CFBundleShortVersionString) +PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms +SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer +DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer +IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString) .cc.o: mkdir -p ios-x86/$(dir $@)
diff --git a/build_detect_platform b/build_detect_platform index b71bf02..959a7d6 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -4,18 +4,27 @@ # argument, which in turn gets read while processing Makefile. # # The output will set the following variables: +# CC C Compiler path +# CXX C++ Compiler path # PLATFORM_LDFLAGS Linker flags # PLATFORM_SHARED_EXT Extension for shared libraries # PLATFORM_SHARED_LDFLAGS Flags for building shared library # PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library # PLATFORM_CCFLAGS C compiler flags # PLATFORM_CXXFLAGS C++ compiler flags. Will contain: -# -DLEVELDB_PLATFORM_POSIX if cstdatomic is present -# -DLEVELDB_PLATFORM_NOATOMIC if it is not +# PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned +# shared libraries, empty otherwise. +# +# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following: +# +# -DLEVELDB_CSTDATOMIC_PRESENT if <cstdatomic> is present +# -DLEVELDB_PLATFORM_POSIX for Posix-based platforms +# -DSNAPPY if the Snappy library is present +# OUTPUT=$1 if test -z "$OUTPUT"; then - echo "usage: $0 <output-filename>" + echo "usage: $0 <output-filename>" >&2 exit 1 fi @@ -23,6 +32,10 @@ fi rm -f $OUTPUT touch $OUTPUT +if test -z "$CC"; then + CC=cc +fi + if test -z "$CXX"; then CXX=g++ fi @@ -33,12 +46,14 @@ if test -z "$TARGET_OS"; then fi COMMON_FLAGS= +CROSS_COMPILE= PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS= PLATFORM_LDFLAGS= PLATFORM_SHARED_EXT="so" PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" +PLATFORM_SHARED_VERSIONED=true # On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp case "$TARGET_OS" in @@ -86,13 +101,14 @@ case "$TARGET_OS" in PORT_FILE=port/port_posix.cc ;; OS_ANDROID_CROSSCOMPILE) - PLATFORM="$TARGET_OS" - COMMON_FLAGS="" - PLATFORM_LDFLAGS="" - PORT_FILE=port/port_android.cc + PLATFORM=OS_ANDROID + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX" + PLATFORM_LDFLAGS="" # All pthread features are in the Android C library + PORT_FILE=port/port_posix.cc + CROSS_COMPILE=true ;; *) - echo "Unknown platform!" + echo "Unknown platform!" >&2 exit 1 esac @@ -112,7 +128,7 @@ set +f # re-enable globbing echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT -if [ "$PLATFORM" = "OS_ANDROID_CROSSCOMPILE" ]; then +if [ "$CROSS_COMPILE" = "true" ]; then # Cross-compiling; do not try any compilation tests.
true else @@ -151,6 +167,8 @@ fi PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" +echo "CC=$CC" >> $OUTPUT +echo "CXX=$CXX" >> $OUTPUT echo "PLATFORM=$PLATFORM" >> $OUTPUT echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT @@ -158,3 +176,4 @@ echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT +echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT diff --git a/db/c_test.c b/db/c_test.c index 12b4424..9792447 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -19,6 +19,13 @@ static void StartPhase(const char* name) { phase = name; } +static const char* GetTempDir(void) { + const char* ret = getenv("TEST_TMPDIR"); + if (ret == NULL || ret[0] == '\0') + ret = "/tmp"; + return ret; +} + #define CheckNoError(err) \ if ((err) != NULL) { \ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \ @@ -158,7 +165,9 @@ int main(int argc, char** argv) { char* err = NULL; int run = -1; - snprintf(dbname, sizeof(dbname), "/tmp/leveldb_c_test-%d", + snprintf(dbname, sizeof(dbname), + "%s/leveldb_c_test-%d", + GetTempDir(), ((int) geteuid())); StartPhase("create_objects"); diff --git a/db/db_bench.cc b/db/db_bench.cc index b0c3995..21d3e25 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -100,7 +100,7 @@ static int FLAGS_bloom_bits = -1; static bool FLAGS_use_existing_db = false; // Use the db with the following name. -static const char* FLAGS_db = "/tmp/dbbench"; +static const char* FLAGS_db = NULL; namespace leveldb { @@ -925,6 +925,7 @@ class Benchmark { int main(int argc, char** argv) { FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; FLAGS_open_files = leveldb::Options().max_open_files; + std::string default_db_path; for (int i = 1; i < argc; i++) { double d; @@ -964,6 +965,13 @@ int main(int argc, char** argv) { } } + // Choose a location for the test database if none given with --db= + if (FLAGS_db == NULL) { + leveldb::Env::Default()->GetTestDirectory(&default_db_path); + default_db_path += "/dbbench"; + FLAGS_db = default_db_path.c_str(); + } + leveldb::Benchmark benchmark; benchmark.Run(); return 0; diff --git a/db/db_impl.cc b/db/db_impl.cc index c9c9023..90c1c81 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -608,8 +608,21 @@ void DBImpl::BackgroundCall() { MutexLock l(&mutex_); assert(bg_compaction_scheduled_); if (!shutting_down_.Acquire_Load()) { - BackgroundCompaction(); + Status s = BackgroundCompaction(); + if (!s.ok()) { + // Wait a little bit before retrying background compaction in + // case this is an environmental problem and we do not want to + // chew up resources for failed compactions for the duration of + // the problem. 
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + Log(options_.info_log, "Waiting after background compaction error: %s", + s.ToString().c_str()); + mutex_.Unlock(); + env_->SleepForMicroseconds(1000000); + mutex_.Lock(); + } } + bg_compaction_scheduled_ = false; // Previous compaction may have produced too many files in a level, @@ -618,12 +631,11 @@ void DBImpl::BackgroundCall() { bg_cv_.SignalAll(); } -void DBImpl::BackgroundCompaction() { +Status DBImpl::BackgroundCompaction() { mutex_.AssertHeld(); if (imm_ != NULL) { - CompactMemTable(); - return; + return CompactMemTable(); } Compaction* c; @@ -698,6 +710,7 @@ void DBImpl::BackgroundCompaction() { } manual_compaction_ = NULL; } + return status; } void DBImpl::CleanupCompaction(CompactionState* compact) { @@ -1263,6 +1276,8 @@ Status DBImpl::MakeRoomForWrite(bool force) { WritableFile* lfile = NULL; s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); if (!s.ok()) { + // Avoid chewing through file number space in a tight loop. + versions_->ReuseFileNumber(new_log_number); break; } delete log_; diff --git a/db/db_impl.h b/db/db_impl.h index 2f8b523..8d2bb34 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -94,7 +94,7 @@ class DBImpl : public DB { void MaybeScheduleCompaction(); static void BGWork(void* db); void BackgroundCall(); - void BackgroundCompaction(); + Status BackgroundCompaction(); void CleanupCompaction(CompactionState* compact); Status DoCompactionWork(CompactionState* compact); diff --git a/db/db_test.cc b/db/db_test.cc index ee10807..3744d0e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -56,12 +56,18 @@ class SpecialEnv : public EnvWrapper { // Simulate no-space errors while this pointer is non-NULL. port::AtomicPointer no_space_; + // Simulate non-writable file system while this pointer is non-NULL + port::AtomicPointer non_writable_; + bool count_random_reads_; AtomicCounter random_read_counter_; + AtomicCounter sleep_counter_; + explicit SpecialEnv(Env* base) : EnvWrapper(base) { delay_sstable_sync_.Release_Store(NULL); no_space_.Release_Store(NULL); + non_writable_.Release_Store(NULL); count_random_reads_ = false; } @@ -95,6 +101,10 @@ class SpecialEnv : public EnvWrapper { } }; + if (non_writable_.Acquire_Load() != NULL) { + return Status::IOError("simulated write error"); + } + Status s = target()->NewWritableFile(f, r); if (s.ok()) { if (strstr(f.c_str(), ".sst") != NULL) { @@ -127,6 +137,11 @@ class SpecialEnv : public EnvWrapper { } return s; } + + virtual void SleepForMicroseconds(int micros) { + sleep_counter_.Increment(); + target()->SleepForMicroseconds(micros); + } }; class DBTest { @@ -137,6 +152,7 @@ class DBTest { enum OptionConfig { kDefault, kFilter, + kUncompressed, kEnd }; int option_config_; @@ -167,10 +183,10 @@ class DBTest { // Switch to a fresh database with the next option configuration to // test. Return false if there are no more configurations to test. bool ChangeOptions() { - if (option_config_ == kEnd) { + option_config_++; + if (option_config_ >= kEnd) { return false; } else { - option_config_++; DestroyAndReopen(); return true; } @@ -183,6 +199,9 @@ class DBTest { case kFilter: options.filter_policy = filter_policy_; break; + case kUncompressed: + options.compression = kNoCompression; + break; default: break; } @@ -552,13 +571,15 @@ TEST(DBTest, GetEncountersEmptyLevel) { ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_EQ(NumTableFilesAtLevel(2), 1); - // Step 3: read until level 0 compaction disappears. 
- int read_count = 0; - while (NumTableFilesAtLevel(0) > 0) { - ASSERT_LE(read_count, 10000) << "did not trigger level 0 compaction"; - read_count++; + // Step 3: read a bunch of times + for (int i = 0; i < 1000; i++) { ASSERT_EQ("NOT_FOUND", Get("missing")); } + + // Step 4: Wait for compaction to finish + env_->SleepForMicroseconds(1000000); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); } while (ChangeOptions()); } @@ -1432,13 +1453,37 @@ TEST(DBTest, NoSpace) { Compact("a", "z"); const int num_files = CountFiles(); env_->no_space_.Release_Store(env_); // Force out-of-space errors - for (int i = 0; i < 10; i++) { + env_->sleep_counter_.Reset(); + for (int i = 0; i < 5; i++) { for (int level = 0; level < config::kNumLevels-1; level++) { dbfull()->TEST_CompactRange(level, NULL, NULL); } } env_->no_space_.Release_Store(NULL); - ASSERT_LT(CountFiles(), num_files + 5); + ASSERT_LT(CountFiles(), num_files + 3); + + // Check that compaction attempts slept after errors + ASSERT_GE(env_->sleep_counter_.Read(), 5); +} + +TEST(DBTest, NonWritableFileSystem) { + Options options = CurrentOptions(); + options.write_buffer_size = 1000; + options.env = env_; + Reopen(&options); + ASSERT_OK(Put("foo", "v1")); + env_->non_writable_.Release_Store(env_); // Force errors for new files + std::string big(100000, 'x'); + int errors = 0; + for (int i = 0; i < 20; i++) { + fprintf(stderr, "iter %d; errors %d\n", i, errors); + if (!Put("foo", big).ok()) { + errors++; + env_->SleepForMicroseconds(100000); + } + } + ASSERT_GT(errors, 0); + env_->non_writable_.Release_Store(NULL); } TEST(DBTest, FilesDeletedAfterCompaction) { diff --git a/db/version_set.cc b/db/version_set.cc index 1f48419..cf976b4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -132,7 +132,7 @@ bool SomeFileOverlapsRange( const Comparator* ucmp = icmp.user_comparator(); if (!disjoint_sorted_files) { // Need to check against all files - for (int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { const FileMetaData* f = files[i]; if (AfterFile(ucmp, smallest_user_key, f) || BeforeFile(ucmp, largest_user_key, f)) { @@ -1297,7 +1297,7 @@ Compaction* VersionSet::CompactRange( // Avoid compacting too much in one shot in case the range is large. const uint64_t limit = MaxFileSizeForLevel(level); uint64_t total = 0; - for (int i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { uint64_t s = inputs[i]->file_size; total += s; if (total >= limit) { diff --git a/db/version_set.h b/db/version_set.h index 572602e..61c4c99 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -173,6 +173,15 @@ class VersionSet { // Allocate and return a new file number uint64_t NewFileNumber() { return next_file_number_++; } + // Arrange to reuse "file_number" unless a newer file number has + // already been allocated. + // REQUIRES: "file_number" was returned by a call to NewFileNumber(). + void ReuseFileNumber(uint64_t file_number) { + if (next_file_number_ == file_number + 1) { + next_file_number_ = file_number; + } + } + // Return the number of Table files at the specified level. int NumLevelFiles(int level) const; diff --git a/doc/bench/db_bench_sqlite3.cc b/doc/bench/db_bench_sqlite3.cc index 6951a14..256793a 100644 --- a/doc/bench/db_bench_sqlite3.cc +++ b/doc/bench/db_bench_sqlite3.cc @@ -75,6 +75,9 @@ static bool FLAGS_transaction = true; // If true, we enable Write-Ahead Logging static bool FLAGS_WAL_enabled = true; +// Use the db with the following name. 
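Stepping back to the VersionSet change above: ReuseFileNumber() only rolls back the most recent allocation, which is exactly the MakeRoomForWrite failure case. A standalone sketch of the semantics (Counter is illustrative, not a leveldb class; the benchmark diffs continue after the sketch):

    #include <cassert>
    #include <stdint.h>

    class Counter {
     public:
      Counter() : next_(2) { }
      uint64_t NewFileNumber() { return next_++; }
      void ReuseFileNumber(uint64_t n) {
        // Only the most recently allocated number can be handed back.
        if (next_ == n + 1) next_ = n;
      }
     private:
      uint64_t next_;
    };

    int main() {
      Counter c;
      uint64_t n = c.NewFileNumber();   // allocate, e.g. for a new log file
      c.ReuseFileNumber(n);             // creation failed: hand the number back
      assert(c.NewFileNumber() == n);   // the failed attempt cost nothing
      return 0;
    }
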
+static const char* FLAGS_db = NULL; + inline static void ExecErrorCheck(int status, char *err_msg) { if (status != SQLITE_OK) { @@ -317,11 +320,16 @@ class Benchmark { bytes_(0), rand_(301) { std::vector<std::string> files; - Env::Default()->GetChildren("/tmp", &files); + std::string test_dir; + Env::Default()->GetTestDirectory(&test_dir); + Env::Default()->GetChildren(test_dir, &files); if (!FLAGS_use_existing_db) { for (int i = 0; i < files.size(); i++) { if (Slice(files[i]).starts_with("dbbench_sqlite3")) { - Env::Default()->DeleteFile("/tmp/" + files[i]); + std::string file_name(test_dir); + file_name += "/"; + file_name += files[i]; + Env::Default()->DeleteFile(file_name.c_str()); } } } @@ -415,7 +423,11 @@ class Benchmark { db_num_++; // Open database - snprintf(file_name, sizeof(file_name), "/tmp/dbbench_sqlite3-%d.db", + std::string tmp_dir; + Env::Default()->GetTestDirectory(&tmp_dir); + snprintf(file_name, sizeof(file_name), + "%s/dbbench_sqlite3-%d.db", + tmp_dir.c_str(), db_num_); status = sqlite3_open(file_name, &db_); if (status) { @@ -655,6 +667,7 @@ class Benchmark { } // namespace leveldb int main(int argc, char** argv) { + std::string default_db_path; for (int i = 1; i < argc; i++) { double d; int n; @@ -684,12 +697,21 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--WAL_enabled=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_WAL_enabled = n; + } else if (strncmp(argv[i], "--db=", 5) == 0) { + FLAGS_db = argv[i] + 5; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); } } + // Choose a location for the test database if none given with --db= + if (FLAGS_db == NULL) { + leveldb::Env::Default()->GetTestDirectory(&default_db_path); + default_db_path += "/dbbench"; + FLAGS_db = default_db_path.c_str(); + } + leveldb::Benchmark benchmark; benchmark.Run(); return 0;
diff --git a/doc/bench/db_bench_tree_db.cc b/doc/bench/db_bench_tree_db.cc index 214d9b7..ed86f03 100644 --- a/doc/bench/db_bench_tree_db.cc +++ b/doc/bench/db_bench_tree_db.cc @@ -68,6 +68,9 @@ static bool FLAGS_use_existing_db = false; // is off. static bool FLAGS_compression = true; +// Use the db with the following name.
+static const char* FLAGS_db = NULL; + inline static void DBSynchronize(kyotocabinet::TreeDB* db_) { @@ -292,11 +295,16 @@ class Benchmark { bytes_(0), rand_(301) { std::vector<std::string> files; - Env::Default()->GetChildren("/tmp", &files); + std::string test_dir; + Env::Default()->GetTestDirectory(&test_dir); + Env::Default()->GetChildren(test_dir.c_str(), &files); if (!FLAGS_use_existing_db) { for (int i = 0; i < files.size(); i++) { if (Slice(files[i]).starts_with("dbbench_polyDB")) { - Env::Default()->DeleteFile("/tmp/" + files[i]); + std::string file_name(test_dir); + file_name += "/"; + file_name += files[i]; + Env::Default()->DeleteFile(file_name.c_str()); } } } @@ -385,8 +393,12 @@ class Benchmark { db_ = new kyotocabinet::TreeDB(); char file_name[100]; db_num_++; - snprintf(file_name, sizeof(file_name), "/tmp/dbbench_polyDB-%d.kct", - db_num_); + std::string test_dir; + Env::Default()->GetTestDirectory(&test_dir); + snprintf(file_name, sizeof(file_name), + "%s/dbbench_polyDB-%d.kct", + test_dir.c_str(), + db_num_); // Create tuning options and open the database int open_options = kyotocabinet::PolyDB::OWRITER | @@ -470,6 +482,7 @@ class Benchmark { } // namespace leveldb int main(int argc, char** argv) { + std::string default_db_path; for (int i = 1; i < argc; i++) { double d; int n; @@ -494,12 +507,21 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--compression=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_compression = (n == 1) ? true : false; + } else if (strncmp(argv[i], "--db=", 5) == 0) { + FLAGS_db = argv[i] + 5; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); } } + // Choose a location for the test database if none given with --db= + if (FLAGS_db == NULL) { + leveldb::Env::Default()->GetTestDirectory(&default_db_path); + default_db_path += "/dbbench"; + FLAGS_db = default_db_path.c_str(); + } + leveldb::Benchmark benchmark; benchmark.Run(); return 0;
diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h index 35ae550..c58bffb 100644 --- a/port/atomic_pointer.h +++ b/port/atomic_pointer.h @@ -73,13 +73,21 @@ inline void MemoryBarrier() { } #define LEVELDB_HAVE_MEMORY_BARRIER -// ARM -#elif defined(ARCH_CPU_ARM_FAMILY) +// ARM Linux +#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__) typedef void (*LinuxKernelMemoryBarrierFunc)(void); -LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier __attribute__((weak)) = - (LinuxKernelMemoryBarrierFunc) 0xffff0fa0; +// The Linux ARM kernel provides a highly optimized device-specific memory +// barrier function at a fixed memory address that is mapped in every +// user-level process. +// +// This beats using CPU-specific instructions which are, on single-core +// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more +// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking +// shows that the extra function call cost is completely negligible on +// multi-core devices.
+// inline void MemoryBarrier() { - pLinuxKernelMemoryBarrier(); + (*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)(); } #define LEVELDB_HAVE_MEMORY_BARRIER diff --git a/port/port.h b/port/port.h index 816826b..e667db4 100644 --- a/port/port.h +++ b/port/port.h @@ -14,8 +14,6 @@ # include "port/port_posix.h" #elif defined(LEVELDB_PLATFORM_CHROMIUM) # include "port/port_chromium.h" -#elif defined(LEVELDB_PLATFORM_ANDROID) -# include "port/port_android.h" #endif #endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/port/port_android.cc b/port/port_android.cc deleted file mode 100644 index 815abf2..0000000 --- a/port/port_android.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "port/port_android.h" - -#include - -extern "C" { -size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { - return fread(a, b, c, d); -} - -size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { - return fwrite(a, b, c, d); -} - -int fflush_unlocked(FILE *f) { - return fflush(f); -} - -int fdatasync(int fd) { - return fsync(fd); -} -} - -namespace leveldb { -namespace port { - -static void PthreadCall(const char* label, int result) { - if (result != 0) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - abort(); - } -} - -Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } -Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } -void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } -void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } - -CondVar::CondVar(Mutex* mu) - : mu_(mu) { - PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); -} - -CondVar::~CondVar() { - PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); -} - -void CondVar::Wait() { - PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); -} - -void CondVar::Signal(){ - PthreadCall("signal", pthread_cond_signal(&cv_)); -} - -void CondVar::SignalAll() { - PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); -} - -} // namespace port -} // namespace leveldb diff --git a/port/port_android.h b/port/port_android.h deleted file mode 100644 index b733388..0000000 --- a/port/port_android.h +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. 
- -#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ -#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ - -#include -#include -#include -#include -#include -#include - -// Collapse the plethora of ARM flavors available to an easier to manage set -// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto -#if defined(__ARM_ARCH_6__) || \ - defined(__ARM_ARCH_6J__) || \ - defined(__ARM_ARCH_6K__) || \ - defined(__ARM_ARCH_6Z__) || \ - defined(__ARM_ARCH_6T2__) || \ - defined(__ARM_ARCH_6ZK__) || \ - defined(__ARM_ARCH_7__) || \ - defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7A__) -#define ARMV6_OR_7 1 -#endif - -extern "C" { - size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); - size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); - int fflush_unlocked(FILE *f); - int fdatasync (int fd); -} - -namespace leveldb { -namespace port { - -static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN; - -class CondVar; - -class Mutex { - public: - Mutex(); - ~Mutex(); - - void Lock(); - void Unlock(); - void AssertHeld() { - //TODO(gabor): How can I implement this? - } - - private: - friend class CondVar; - pthread_mutex_t mu_; - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); -}; - -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - private: - Mutex* mu_; - pthread_cond_t cv_; -}; - -#ifndef ARMV6_OR_7 -// On ARM chipsets = V6 -#ifdef ARMV6_OR_7 - __asm__ __volatile__("dmb" : : : "memory"); -#else - pLinuxKernelMemoryBarrier(); -#endif - } - - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - void* r = rep_; - MemoryBarrier(); - return r; - } - inline void Release_Store(void* v) { - MemoryBarrier(); - rep_ = v; - } - inline void* NoBarrier_Load() const { - void* r = rep_; - return r; - } - inline void NoBarrier_Store(void* v) { - rep_ = v; - } -}; - -// TODO(gabor): Implement compress -inline bool Snappy_Compress( - const char* input, - size_t input_length, - std::string* output) { - return false; -} - -// TODO(gabor): Implement uncompress -inline bool Snappy_GetUncompressedLength(const char* input, size_t length, - size_t* result) { - return false; -} - -// TODO(gabor): Implement uncompress -inline bool Snappy_Uncompress( - const char* input_data, - size_t input_length, - char* output) { - return false; -} - -inline uint64_t ThreadIdentifier() { - pthread_t tid = pthread_self(); - uint64_t r = 0; - memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? sizeof(r) : sizeof(tid)); - return r; -} - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -} // namespace port -} // namespace leveldb - -#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ diff --git a/port/port_example.h b/port/port_example.h index 036c7d1..ab9e489 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -60,6 +60,16 @@ class CondVar { void SignallAll(); }; +// Thread-safe initialization. +// Used as follows: +// static port::OnceType init_control = LEVELDB_ONCE_INIT; +// static void Initializer() { ... do something ...; } +// ... +// port::InitOnce(&init_control, &Initializer); +typedef intptr_t OnceType; +#define LEVELDB_ONCE_INIT 0 +extern void InitOnce(port::OnceType*, void (*initializer)()); + // A type that holds a pointer that can be read or written atomically // (i.e., without word-tearing.) 
class AtomicPointer { diff --git a/port/port_posix.cc b/port/port_posix.cc index c44cc99..5ba127a 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -46,5 +46,9 @@ void CondVar::SignalAll() { PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); } +void InitOnce(OnceType* once, void (*initializer)()) { + PthreadCall("once", pthread_once(once, initializer)); +} + } // namespace port } // namespace leveldb diff --git a/port/port_posix.h b/port/port_posix.h index 485ad10..654a4b9 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -7,17 +7,22 @@ #ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ #define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ +#undef PLATFORM_IS_LITTLE_ENDIAN #if defined(OS_MACOSX) #include + #if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER) + #define PLATFORM_IS_LITTLE_ENDIAN \ + (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN) + #endif #elif defined(OS_SOLARIS) #include #ifdef _LITTLE_ENDIAN - #define LITTLE_ENDIAN + #define PLATFORM_IS_LITTLE_ENDIAN true #else - #define BIG_ENDIAN + #define PLATFORM_IS_LITTLE_ENDIAN false #endif #elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\ - defined(OS_DRAGONFLYBSD) + defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID) #include #include #else @@ -31,14 +36,13 @@ #include #include "port/atomic_pointer.h" -#ifdef LITTLE_ENDIAN -#define IS_LITTLE_ENDIAN true -#else -#define IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) +#ifndef PLATFORM_IS_LITTLE_ENDIAN +#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) #endif #if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\ - defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) + defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\ + defined(OS_ANDROID) // Use fread/fwrite/fflush on platforms without _unlocked variants #define fread_unlocked fread #define fwrite_unlocked fwrite @@ -51,10 +55,17 @@ #define fdatasync fsync #endif +#if defined(OS_ANDROID) && __ANDROID_API__ < 9 +// fdatasync() was only introduced in API level 9 on Android. Use fsync() +// when targetting older platforms. 
+#define fdatasync fsync +#endif + namespace leveldb { namespace port { -static const bool kLittleEndian = IS_LITTLE_ENDIAN; +static const bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN; +#undef PLATFORM_IS_LITTLE_ENDIAN class CondVar; @@ -88,6 +99,10 @@ class CondVar { Mutex* mu_; }; +typedef pthread_once_t OnceType; +#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT +extern void InitOnce(OnceType* once, void (*initializer)()); + inline bool Snappy_Compress(const char* input, size_t length, ::std::string* output) { #ifdef SNAPPY diff --git a/util/coding_test.cc b/util/coding_test.cc index 4cc856c..2c52b17 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -51,6 +51,29 @@ TEST(Coding, Fixed64) { } } +// Test that encoding routines generate little-endian encodings +TEST(Coding, EncodingOutput) { + std::string dst; + PutFixed32(&dst, 0x04030201); + ASSERT_EQ(4, dst.size()); + ASSERT_EQ(0x01, static_cast(dst[0])); + ASSERT_EQ(0x02, static_cast(dst[1])); + ASSERT_EQ(0x03, static_cast(dst[2])); + ASSERT_EQ(0x04, static_cast(dst[3])); + + dst.clear(); + PutFixed64(&dst, 0x0807060504030201ull); + ASSERT_EQ(8, dst.size()); + ASSERT_EQ(0x01, static_cast(dst[0])); + ASSERT_EQ(0x02, static_cast(dst[1])); + ASSERT_EQ(0x03, static_cast(dst[2])); + ASSERT_EQ(0x04, static_cast(dst[3])); + ASSERT_EQ(0x05, static_cast(dst[4])); + ASSERT_EQ(0x06, static_cast(dst[5])); + ASSERT_EQ(0x07, static_cast(dst[6])); + ASSERT_EQ(0x08, static_cast(dst[7])); +} + TEST(Coding, Varint32) { std::string s; for (uint32_t i = 0; i < (32 * 32); i++) { diff --git a/util/comparator.cc b/util/comparator.cc index eed9d2f..4b7b572 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -6,6 +6,7 @@ #include #include "leveldb/comparator.h" #include "leveldb/slice.h" +#include "port/port.h" #include "util/logging.h" namespace leveldb { @@ -65,11 +66,15 @@ class BytewiseComparatorImpl : public Comparator { }; } // namespace -// Intentionally not destroyed to prevent destructor racing -// with background threads. -static const Comparator* bytewise = new BytewiseComparatorImpl; +static port::OnceType once = LEVELDB_ONCE_INIT; +static const Comparator* bytewise; + +static void InitModule() { + bytewise = new BytewiseComparatorImpl; +} const Comparator* BytewiseComparator() { + port::InitOnce(&once, InitModule); return bytewise; } -- cgit v1.2.3 From 77ecdbbe2c5eb06696658955508e30fd0a1ebebe Mon Sep 17 00:00:00 2001 From: "sanjay@google.com" Date: Wed, 30 May 2012 16:58:57 +0000 Subject: update version number to 1.5 git-svn-id: http://leveldb.googlecode.com/svn/trunk@67 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 2 +- include/leveldb/db.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5cd9d37..c648a28 100644 --- a/Makefile +++ b/Makefile @@ -70,7 +70,7 @@ SHARED = $(SHARED1) else # Update db.h if you change these. 
SHARED_MAJOR = 1 -SHARED_MINOR = 4 +SHARED_MINOR = 5 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) diff --git a/include/leveldb/db.h b/include/leveldb/db.h index 481aad6..ed56b87 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -14,7 +14,7 @@ namespace leveldb { // Update Makefile if you change these static const int kMajorVersion = 1; -static const int kMinorVersion = 4; +static const int kMinorVersion = 5; struct Options; struct ReadOptions; -- cgit v1.2.3 From 29c68f16466b1704dc7a663cf51bf9c5579830c3 Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Fri, 12 Oct 2012 19:13:17 +0000 Subject: Update to leveldb 1.6 Highlights ---------- Mmap at most 1000 files on Posix to improve performance for large databases. Support for more architectures (thanks to Alexander K.) Building and porting -------------------- HP/UX support (issue 126) AtomicPointer for ia64 (issue 123) Sparc v9 support (issue 124) Atomic ops for powerpc Use -fno-builtin-memcmp only when using g++ Simplify IOS build rules (issue 114) Use CXXFLAGS instead of CFLAGS when invoking C++ compiler (issue 118) Fix snappy shared library problem (issue 94) Fix shared library installation path regression Endian-ness detection tweak for FreeBSD Bug fixes --------- Stop ignoring FLAGS_open_files in db_bench Make bloom test behavior agnostic to endian-ness Performance ----------- Limit number of mmapped files to 1000 to improve perf for large dbs Do not delay for 1 second on shutdown path (issue 125) Misc ---- Make InMemoryEnv return a no-op logger C binding now has a wrapper for free (issue 117) Add thread-safety annotations Added an in-process lock table (issue 120) Make RandomAccessFile and SequentialFile non-copyable git-svn-id: http://leveldb.googlecode.com/svn/trunk@68 62dab493-f737-651d-591e-8d6aee1b9529 --- Makefile | 11 ++--- TODO | 1 + build_detect_platform | 49 +++++++++++++------- db/c.cc | 4 ++ db/c_test.c | 6 +++ db/db_bench.cc | 1 + db/db_impl.cc | 6 ++- db/db_impl.h | 28 +++++++----- db/db_test.cc | 6 +++ db/version_set.cc | 2 +- db/version_set.h | 4 +- doc/bench/db_bench_sqlite3.cc | 2 +- doc/index.html | 2 +- helpers/memenv/memenv.cc | 10 +++++ include/leveldb/c.h | 10 +++++ include/leveldb/db.h | 2 +- include/leveldb/env.h | 10 +++++ port/atomic_pointer.h | 72 ++++++++++++++++++++++++++++++ port/port_posix.h | 11 ++++- port/thread_annotations.h | 59 ++++++++++++++++++++++++ table/block.cc | 4 +- util/bloom_test.cc | 5 ++- util/coding.cc | 40 ++++++++--------- util/env_posix.cc | 101 +++++++++++++++++++++++++++++++++++++++--- util/mutexlock.h | 8 ++-- 25 files changed, 382 insertions(+), 72 deletions(-) create mode 100644 port/thread_annotations.h diff --git a/Makefile b/Makefile index c648a28..4dd1366 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,8 @@ OPT ?= -O2 -DNDEBUG # (A) Production use (optimized mode) #----------------------------------------------- # detect what platform we're building on -$(shell ./build_detect_platform build_config.mk) +$(shell CC=$(CC) CXX=$(CXX) TARGET_OS=$(TARGET_OS) \ + ./build_detect_platform build_config.mk ./) # this file is generated by the previous line to set build flags and sources include build_config.mk @@ -70,7 +71,7 @@ SHARED = $(SHARED1) else # Update db.h if you change these. 
SHARED_MAJOR = 1 -SHARED_MINOR = 5 +SHARED_MINOR = 6 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) @@ -82,7 +83,7 @@ $(SHARED2): $(SHARED3) endif $(SHARED3): - $(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3) + $(CXX) $(SOURCES) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(INSTALL_PATH)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -o $(SHARED3) endif # PLATFORM_SHARED_EXT @@ -179,14 +180,14 @@ IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBu .cc.o: mkdir -p ios-x86/$(dir $@) - $(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ + $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ .c.o: mkdir -p ios-x86/$(dir $@) - $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ + $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ diff --git a/TODO b/TODO index 9130b6a..e603c07 100644 --- a/TODO +++ b/TODO @@ -7,6 +7,7 @@ db within [start_key..end_key]? For Chrome, deletion of obsolete object stores, etc. can be done in the background anyway, so probably not that important. +- There have been requests for MultiGet. After a range is completely deleted, what gets rid of the corresponding files if we do no future changes to that range. Make diff --git a/build_detect_platform b/build_detect_platform index 959a7d6..83bbe42 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -23,8 +23,9 @@ # OUTPUT=$1 -if test -z "$OUTPUT"; then - echo "usage: $0 " >&2 +PREFIX=$2 +if test -z "$OUTPUT" || test -z "$PREFIX"; then + echo "usage: $0 " >&2 exit 1 fi @@ -55,58 +56,72 @@ PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" PLATFORM_SHARED_VERSIONED=true -# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp +MEMCMP_FLAG= +if [ "$CXX" = "g++" ]; then + # Use libc's memcmp instead of GCC's memcmp. This results in ~40% + # performance improvement on readrandom under gcc 4.4.3 on Linux/x86. 
+ MEMCMP_FLAG="-fno-builtin-memcmp" +fi + case "$TARGET_OS" in Darwin) PLATFORM=OS_MACOSX - COMMON_FLAGS="-fno-builtin-memcmp -DOS_MACOSX" + COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX" PLATFORM_SHARED_EXT=dylib PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " PORT_FILE=port/port_posix.cc ;; Linux) PLATFORM=OS_LINUX - COMMON_FLAGS="-fno-builtin-memcmp -pthread -DOS_LINUX" + COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX" PLATFORM_LDFLAGS="-pthread" PORT_FILE=port/port_posix.cc ;; SunOS) PLATFORM=OS_SOLARIS - COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS" + COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS" PLATFORM_LDFLAGS="-lpthread -lrt" PORT_FILE=port/port_posix.cc ;; FreeBSD) PLATFORM=OS_FREEBSD - COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD" + COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD" PLATFORM_LDFLAGS="-lpthread" PORT_FILE=port/port_posix.cc ;; NetBSD) PLATFORM=OS_NETBSD - COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" + COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD" PLATFORM_LDFLAGS="-lpthread -lgcc_s" PORT_FILE=port/port_posix.cc ;; OpenBSD) PLATFORM=OS_OPENBSD - COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD" + COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD" PLATFORM_LDFLAGS="-pthread" PORT_FILE=port/port_posix.cc ;; DragonFly) PLATFORM=OS_DRAGONFLYBSD - COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD" + COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD" PLATFORM_LDFLAGS="-lpthread" PORT_FILE=port/port_posix.cc ;; OS_ANDROID_CROSSCOMPILE) PLATFORM=OS_ANDROID - COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX" + COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX" PLATFORM_LDFLAGS="" # All pthread features are in the Android C library PORT_FILE=port/port_posix.cc CROSS_COMPILE=true ;; + HP-UX) + PLATFORM=OS_HPUX + COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX" + PLATFORM_LDFLAGS="-pthread" + PORT_FILE=port/port_posix.cc + # man ld: +h internal_name + PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl," + ;; *) echo "Unknown platform!" >&2 exit 1 @@ -116,11 +131,13 @@ esac # except for the test and benchmark files. By default, find will output a list # of all files matching either rule, so we need to append -print to make the # prune take effect. -DIRS="util db table" +DIRS="$PREFIX/db $PREFIX/util $PREFIX/table" + set -f # temporarily disable globbing so that our patterns aren't expanded PRUNE_TEST="-name *test*.cc -prune" PRUNE_BENCH="-name *_bench.cc -prune" -PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "` +PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | sed "s,^$PREFIX/,," | tr "\n" " "` + set +f # re-enable globbing # The sources consist of the portable files, plus the platform-specific port @@ -133,7 +150,7 @@ if [ "$CROSS_COMPILE" = "true" ]; then true else # If -std=c++0x works, use . Otherwise use port_posix.h. 
- $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -146,7 +163,7 @@ EOF # Test whether Snappy library is installed # http://code.google.com/p/snappy/ - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -156,7 +173,7 @@ EOF fi # Test whether tcmalloc is available - $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null </dev/null <Name()) { s = Status::InvalidArgument( - edit.comparator_ + "does not match existing comparator ", + edit.comparator_ + " does not match existing comparator ", icmp_.user_comparator()->Name()); } } diff --git a/db/version_set.h b/db/version_set.h index 61c4c99..792899b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -21,6 +21,7 @@ #include "db/dbformat.h" #include "db/version_edit.h" #include "port/port.h" +#include "port/thread_annotations.h" namespace leveldb { @@ -159,7 +160,8 @@ class VersionSet { // current version. Will release *mu while actually writing to the file. // REQUIRES: *mu is held on entry. // REQUIRES: no other thread concurrently calls LogAndApply() - Status LogAndApply(VersionEdit* edit, port::Mutex* mu); + Status LogAndApply(VersionEdit* edit, port::Mutex* mu) + EXCLUSIVE_LOCKS_REQUIRED(mu); // Recover the last saved descriptor from persistent storage. Status Recover(); diff --git a/doc/bench/db_bench_sqlite3.cc b/doc/bench/db_bench_sqlite3.cc index 256793a..e63aaa8 100644 --- a/doc/bench/db_bench_sqlite3.cc +++ b/doc/bench/db_bench_sqlite3.cc @@ -618,7 +618,7 @@ class Benchmark { ErrorCheck(status); // Execute read statement - while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW); + while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW) {} StepErrorCheck(status); // Reset SQLite statement for another use diff --git a/doc/index.html b/doc/index.html index 521d2ba..cd29341 100644 --- a/doc/index.html +++ b/doc/index.html @@ -408,7 +408,7 @@ The optional FilterPolicy mechanism can be used to reduce the number of disk reads substantially.

          leveldb::Options options;
      -   options.filter_policy = NewBloomFilter(10);
      +   options.filter_policy = NewBloomFilterPolicy(10);
          leveldb::DB* db;
          leveldb::DB::Open(options, "/tmp/testdb", &db);
          ... use the database ...
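
A minimal sketch of the renamed call in context, for readers upgrading from NewBloomFilter (the path and error handling are illustrative only; the policy must outlive the DB and is not owned by it):

    #include <cassert>
    #include "leveldb/db.h"
    #include "leveldb/filter_policy.h"
    #include "leveldb/options.h"

    int main() {
      leveldb::Options options;
      options.create_if_missing = true;
      // 10 bits of filter state per key is the commonly cited sweet spot.
      options.filter_policy = leveldb::NewBloomFilterPolicy(10);
      leveldb::DB* db;
      leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
      assert(s.ok());
      // ... use the database ...
      delete db;                     // close the DB first...
      delete options.filter_policy;  // ...then release the policy
      return 0;
    }
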
      diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc
      index 2082083..5879de1 100644
      --- a/helpers/memenv/memenv.cc
      +++ b/helpers/memenv/memenv.cc
      @@ -221,6 +221,11 @@ class WritableFileImpl : public WritableFile {
         FileState* file_;
       };
       
      +class NoOpLogger : public Logger {
      + public:
      +  virtual void Logv(const char* format, va_list ap) { }
      +};
      +
       class InMemoryEnv : public EnvWrapper {
        public:
         explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
      @@ -358,6 +363,11 @@ class InMemoryEnv : public EnvWrapper {
           return Status::OK();
         }
       
      +  virtual Status NewLogger(const std::string& fname, Logger** result) {
      +    *result = new NoOpLogger;
      +    return Status::OK();
      +  }
      +
        private:
         // Map from filenames to FileState objects, representing a simple file system.
  typedef std::map<std::string, FileState*> FileSystem;
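
A hedged sketch of how the in-memory Env is normally wired up (NewMemEnv is declared in helpers/memenv/memenv.h; the path is arbitrary since nothing touches disk, and the no-op logger added above silently discards the info log):

    #include "helpers/memenv/memenv.h"
    #include "leveldb/db.h"
    #include "leveldb/env.h"

    void MemEnvExample() {
      leveldb::Env* mem_env = leveldb::NewMemEnv(leveldb::Env::Default());
      leveldb::Options options;
      options.create_if_missing = true;
      options.env = mem_env;          // all files live in RAM
      leveldb::DB* db;
      leveldb::Status s = leveldb::DB::Open(options, "/ignored/testdb", &db);
      // ... use db; contents vanish when mem_env is destroyed ...
      delete db;
      delete mem_env;
    }
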
      diff --git a/include/leveldb/c.h b/include/leveldb/c.h
      index 70e3cc6..9cee971 100644
      --- a/include/leveldb/c.h
      +++ b/include/leveldb/c.h
      @@ -28,6 +28,7 @@
         be true on entry:
            *errptr == NULL
            *errptr points to a malloc()ed null-terminated error message
      +       (On Windows, *errptr must have been malloc()-ed by this library.)
         On success, a leveldb routine leaves *errptr unchanged.
         On failure, leveldb frees the old value of *errptr and
   sets *errptr to a malloc()ed error message.
      @@ -268,6 +269,15 @@ extern void leveldb_cache_destroy(leveldb_cache_t* cache);
       extern leveldb_env_t* leveldb_create_default_env();
       extern void leveldb_env_destroy(leveldb_env_t*);
       
      +/* Utility */
      +
      +/* Calls free(ptr).
      +   REQUIRES: ptr was malloc()-ed and returned by one of the routines
      +   in this file.  Note that in certain cases (typically on Windows), you
      +   may need to call this routine instead of free(ptr) to dispose of
      +   malloc()-ed memory returned by this library. */
      +extern void leveldb_free(void* ptr);
      +
       #ifdef __cplusplus
       }  /* end extern "C" */
       #endif
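
A short illustration of the calling convention the wrapper exists for (database path is a placeholder; freeing the error through the library keeps allocator and deallocator matched even across runtime boundaries):

    #include <stddef.h>
    #include "leveldb/c.h"

    void open_example(void) {
      leveldb_options_t* options = leveldb_options_create();
      leveldb_options_set_create_if_missing(options, 1);
      char* err = NULL;
      leveldb_t* db = leveldb_open(options, "/tmp/testdb", &err);
      if (err != NULL) {
        /* handle the error, then dispose of it via the library,
           not via a bare free() */
        leveldb_free(err);
        err = NULL;
      } else {
        leveldb_close(db);
      }
      leveldb_options_destroy(options);
    }
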
      diff --git a/include/leveldb/db.h b/include/leveldb/db.h
      index ed56b87..a50ac69 100644
      --- a/include/leveldb/db.h
      +++ b/include/leveldb/db.h
      @@ -14,7 +14,7 @@ namespace leveldb {
       
       // Update Makefile if you change these
       static const int kMajorVersion = 1;
      -static const int kMinorVersion = 5;
      +static const int kMinorVersion = 6;
       
       struct Options;
       struct ReadOptions;
      diff --git a/include/leveldb/env.h b/include/leveldb/env.h
      index 2720667..fa32289 100644
      --- a/include/leveldb/env.h
      +++ b/include/leveldb/env.h
      @@ -175,6 +175,11 @@ class SequentialFile {
         //
         // REQUIRES: External synchronization
         virtual Status Skip(uint64_t n) = 0;
      +
      + private:
      +  // No copying allowed
      +  SequentialFile(const SequentialFile&);
      +  void operator=(const SequentialFile&);
       };
       
       // A file abstraction for randomly reading the contents of a file.
      @@ -194,6 +199,11 @@ class RandomAccessFile {
         // Safe for concurrent use by multiple threads.
         virtual Status Read(uint64_t offset, size_t n, Slice* result,
                             char* scratch) const = 0;
      +
      + private:
      +  // No copying allowed
      +  RandomAccessFile(const RandomAccessFile&);
      +  void operator=(const RandomAccessFile&);
       };
       
       // A file abstraction for sequential writing.  The implementation
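
The same no-copying idiom, shown on a hypothetical class for readers unfamiliar with the pre-C++11 pattern (declare privately, never define, and accidental copies fail at compile or link time):

    class FileHandle {   // hypothetical class, for illustration only
     public:
      FileHandle() : fd_(-1) { }

     private:
      int fd_;

      // No copying allowed: private and intentionally left undefined.
      FileHandle(const FileHandle&);
      void operator=(const FileHandle&);
    };
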
      diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h
      index c58bffb..e17bf43 100644
      --- a/port/atomic_pointer.h
      +++ b/port/atomic_pointer.h
      @@ -36,6 +36,8 @@
       #define ARCH_CPU_X86_FAMILY 1
       #elif defined(__ARMEL__)
       #define ARCH_CPU_ARM_FAMILY 1
      +#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
      +#define ARCH_CPU_PPC_FAMILY 1
       #endif
       
       namespace leveldb {
      @@ -91,6 +93,15 @@ inline void MemoryBarrier() {
       }
       #define LEVELDB_HAVE_MEMORY_BARRIER
       
      +// PPC
      +#elif defined(ARCH_CPU_PPC_FAMILY) && defined(__GNUC__)
      +inline void MemoryBarrier() {
      +  // TODO for some powerpc expert: is there a cheaper suitable variant?
      +  // Perhaps by having separate barriers for acquire and release ops.
      +  asm volatile("sync" : : : "memory");
      +}
      +#define LEVELDB_HAVE_MEMORY_BARRIER
      +
       #endif
       
       // AtomicPointer built using platform-specific MemoryBarrier()
      @@ -136,6 +147,66 @@ class AtomicPointer {
         }
       };
       
      +// Atomic pointer based on sparc memory barriers
      +#elif defined(__sparcv9) && defined(__GNUC__)
      +class AtomicPointer {
      + private:
      +  void* rep_;
      + public:
      +  AtomicPointer() { }
      +  explicit AtomicPointer(void* v) : rep_(v) { }
      +  inline void* Acquire_Load() const {
      +    void* val;
      +    __asm__ __volatile__ (
      +        "ldx [%[rep_]], %[val] \n\t"
      +         "membar #LoadLoad|#LoadStore \n\t"
      +        : [val] "=r" (val)
      +        : [rep_] "r" (&rep_)
      +        : "memory");
      +    return val;
      +  }
      +  inline void Release_Store(void* v) {
      +    __asm__ __volatile__ (
      +        "membar #LoadStore|#StoreStore \n\t"
      +        "stx %[v], [%[rep_]] \n\t"
      +        :
      +        : [rep_] "r" (&rep_), [v] "r" (v)
      +        : "memory");
      +  }
      +  inline void* NoBarrier_Load() const { return rep_; }
      +  inline void NoBarrier_Store(void* v) { rep_ = v; }
      +};
      +
      +// Atomic pointer based on ia64 acq/rel
      +#elif defined(__ia64) && defined(__GNUC__)
      +class AtomicPointer {
      + private:
      +  void* rep_;
      + public:
      +  AtomicPointer() { }
      +  explicit AtomicPointer(void* v) : rep_(v) { }
      +  inline void* Acquire_Load() const {
      +    void* val    ;
      +    __asm__ __volatile__ (
      +        "ld8.acq %[val] = [%[rep_]] \n\t"
      +        : [val] "=r" (val)
      +        : [rep_] "r" (&rep_)
      +        : "memory"
      +        );
      +    return val;
      +  }
      +  inline void Release_Store(void* v) {
      +    __asm__ __volatile__ (
      +        "st8.rel [%[rep_]] = %[v]  \n\t"
      +        :
      +        : [rep_] "r" (&rep_), [v] "r" (v)
      +        : "memory"
      +        );
      +  }
      +  inline void* NoBarrier_Load() const { return rep_; }
      +  inline void NoBarrier_Store(void* v) { rep_ = v; }
      +};
      +
 // We have neither MemoryBarrier(), nor <cstdatomic>
       #else
       #error Please implement AtomicPointer for this platform.
      @@ -145,6 +216,7 @@ class AtomicPointer {
       #undef LEVELDB_HAVE_MEMORY_BARRIER
       #undef ARCH_CPU_X86_FAMILY
       #undef ARCH_CPU_ARM_FAMILY
      +#undef ARCH_CPU_PPC_FAMILY
       
       }  // namespace port
       }  // namespace leveldb
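
For context, a sketch of the acquire/release publication pattern these primitives exist for (Node and the global are hypothetical; this mirrors how leveldb's skiplist readers run without locks):

    #include <stddef.h>
    #include "port/atomic_pointer.h"

    struct Node { int value; };   // hypothetical payload

    static leveldb::port::AtomicPointer g_head(NULL);

    // Writer: fully construct the object, then publish.  Release_Store
    // issues its barrier before the store, so a reader that observes the
    // new pointer also observes the initialized fields.
    void Publish(Node* n) {
      g_head.Release_Store(n);
    }

    // Reader: Acquire_Load issues its barrier after the load, so the
    // dereference below cannot be reordered ahead of it.
    int ReadValue() {
      Node* n = reinterpret_cast<Node*>(g_head.Acquire_Load());
      return (n != NULL) ? n->value : 0;
    }
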
      diff --git a/port/port_posix.h b/port/port_posix.h
      index 654a4b9..6ca352e 100644
      --- a/port/port_posix.h
      +++ b/port/port_posix.h
      @@ -21,13 +21,20 @@
         #else
           #define PLATFORM_IS_LITTLE_ENDIAN false
         #endif
      -#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\
      +#elif defined(OS_FREEBSD)
+  #include <sys/types.h>
+  #include <sys/endian.h>
      +  #define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN)
      +#elif defined(OS_OPENBSD) || defined(OS_NETBSD) ||\
             defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID)
   #include <sys/types.h>
   #include <sys/endian.h>
      +#elif defined(OS_HPUX)
      +  #define PLATFORM_IS_LITTLE_ENDIAN false
       #else
   #include <endian.h>
       #endif
      +
 #include <pthread.h>
       #ifdef SNAPPY
 #include <snappy.h>
      @@ -42,7 +49,7 @@
       
       #if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\
           defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\
      -    defined(OS_ANDROID)
      +    defined(OS_ANDROID) || defined(OS_HPUX)
       // Use fread/fwrite/fflush on platforms without _unlocked variants
       #define fread_unlocked fread
       #define fwrite_unlocked fwrite
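
If in doubt, the compile-time detection above can be cross-checked at runtime with a few lines; this sketch is standalone and not part of the port headers:

    #include <stdint.h>
    #include <string.h>

    // Returns true on little-endian hardware: the least significant byte
    // of a 32-bit value is stored first in memory.
    static bool RuntimeIsLittleEndian() {
      uint32_t word = 1;
      char first_byte;
      memcpy(&first_byte, &word, 1);
      return first_byte == 1;
    }
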
      diff --git a/port/thread_annotations.h b/port/thread_annotations.h
      new file mode 100644
      index 0000000..6f9b6a7
      --- /dev/null
      +++ b/port/thread_annotations.h
      @@ -0,0 +1,59 @@
      +// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
      +// Use of this source code is governed by a BSD-style license that can be
      +// found in the LICENSE file. See the AUTHORS file for names of contributors.
      +
      +#ifndef STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H
      +
      +// Some environments provide custom macros to aid in static thread-safety
      +// analysis.  Provide empty definitions of such macros unless they are already
      +// defined.
      +
      +#ifndef EXCLUSIVE_LOCKS_REQUIRED
      +#define EXCLUSIVE_LOCKS_REQUIRED(...)
      +#endif
      +
      +#ifndef SHARED_LOCKS_REQUIRED
      +#define SHARED_LOCKS_REQUIRED(...)
      +#endif
      +
      +#ifndef LOCKS_EXCLUDED
      +#define LOCKS_EXCLUDED(...)
      +#endif
      +
      +#ifndef LOCK_RETURNED
      +#define LOCK_RETURNED(x)
      +#endif
      +
      +#ifndef LOCKABLE
      +#define LOCKABLE
      +#endif
      +
      +#ifndef SCOPED_LOCKABLE
      +#define SCOPED_LOCKABLE
      +#endif
      +
      +#ifndef EXCLUSIVE_LOCK_FUNCTION
      +#define EXCLUSIVE_LOCK_FUNCTION(...)
      +#endif
      +
      +#ifndef SHARED_LOCK_FUNCTION
      +#define SHARED_LOCK_FUNCTION(...)
      +#endif
      +
      +#ifndef EXCLUSIVE_TRYLOCK_FUNCTION
      +#define EXCLUSIVE_TRYLOCK_FUNCTION(...)
      +#endif
      +
      +#ifndef SHARED_TRYLOCK_FUNCTION
      +#define SHARED_TRYLOCK_FUNCTION(...)
      +#endif
      +
      +#ifndef UNLOCK_FUNCTION
      +#define UNLOCK_FUNCTION(...)
      +#endif
      +
      +#ifndef NO_THREAD_SAFETY_ANALYSIS
      +#define NO_THREAD_SAFETY_ANALYSIS
      +#endif
      +
      +#endif  // STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H
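
A hedged sketch of attaching these annotations (the Counter class is hypothetical; with a thread-safety-aware compiler the macros expand to real attributes, otherwise to nothing and the code compiles unchanged):

    #include "port/port.h"
    #include "port/thread_annotations.h"

    class Counter {   // hypothetical example
     public:
      Counter() : count_(0) { }

      void Increment() {
        mu_.Lock();
        IncrementLocked();
        mu_.Unlock();
      }

     private:
      // Callers must already hold mu_; analysis-aware compilers will
      // flag any call site that does not.
      void IncrementLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { count_++; }

      leveldb::port::Mutex mu_;
      int count_;
    };
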
      diff --git a/table/block.cc b/table/block.cc
      index 199d453..ab83c11 100644
      --- a/table/block.cc
      +++ b/table/block.cc
      @@ -162,8 +162,8 @@ class Block::Iter : public Iterator {
         }
       
         virtual void Seek(const Slice& target) {
      -    // Binary search in restart array to find the first restart point
      -    // with a key >= target
      +    // Binary search in restart array to find the last restart point
      +    // with a key < target
           uint32_t left = 0;
           uint32_t right = num_restarts_ - 1;
           while (left < right) {
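
In isolation, the search the corrected comment describes looks like this sketch (the comparator and key array are stand-ins; the rounded-up midpoint is what guarantees progress):

    #include <stdint.h>
    #include <string.h>

    static int Compare(const char* a, const char* b) {  // stand-in comparator
      return strcmp(a, b);
    }

    // Given keys[0..n-1] sorted ascending (n > 0), return the last index
    // whose key compares less than target; the caller handles the case
    // where no such key exists, as Block::Iter does via its restart layout.
    static uint32_t LastIndexLessThan(const char* const* keys, uint32_t n,
                                      const char* target) {
      uint32_t left = 0;
      uint32_t right = n - 1;
      while (left < right) {
        uint32_t mid = (left + right + 1) / 2;  // round up so the loop terminates
        if (Compare(keys[mid], target) < 0) {
          left = mid;        // keys[mid] < target: mid remains a candidate
        } else {
          right = mid - 1;   // keys[mid] >= target: discard mid entirely
        }
      }
      return left;
    }
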
      diff --git a/util/bloom_test.cc b/util/bloom_test.cc
      index 4a6ea1b..0bf8e8d 100644
      --- a/util/bloom_test.cc
      +++ b/util/bloom_test.cc
      @@ -4,6 +4,7 @@
       
       #include "leveldb/filter_policy.h"
       
      +#include "util/coding.h"
       #include "util/logging.h"
       #include "util/testharness.h"
       #include "util/testutil.h"
      @@ -13,8 +14,8 @@ namespace leveldb {
       static const int kVerbose = 1;
       
       static Slice Key(int i, char* buffer) {
      -  memcpy(buffer, &i, sizeof(i));
      -  return Slice(buffer, sizeof(i));
      +  EncodeFixed32(buffer, i);
      +  return Slice(buffer, sizeof(uint32_t));
       }
       
       class BloomTest {
      diff --git a/util/coding.cc b/util/coding.cc
      index dbd7a65..21e3186 100644
      --- a/util/coding.cc
      +++ b/util/coding.cc
      @@ -7,29 +7,29 @@
       namespace leveldb {
       
       void EncodeFixed32(char* buf, uint32_t value) {
      -#if __BYTE_ORDER == __LITTLE_ENDIAN
      -  memcpy(buf, &value, sizeof(value));
      -#else
      -  buf[0] = value & 0xff;
      -  buf[1] = (value >> 8) & 0xff;
      -  buf[2] = (value >> 16) & 0xff;
      -  buf[3] = (value >> 24) & 0xff;
      -#endif
      +  if (port::kLittleEndian) {
      +    memcpy(buf, &value, sizeof(value));
      +  } else {
      +    buf[0] = value & 0xff;
      +    buf[1] = (value >> 8) & 0xff;
      +    buf[2] = (value >> 16) & 0xff;
      +    buf[3] = (value >> 24) & 0xff;
      +  }
       }
       
       void EncodeFixed64(char* buf, uint64_t value) {
      -#if __BYTE_ORDER == __LITTLE_ENDIAN
      -  memcpy(buf, &value, sizeof(value));
      -#else
      -  buf[0] = value & 0xff;
      -  buf[1] = (value >> 8) & 0xff;
      -  buf[2] = (value >> 16) & 0xff;
      -  buf[3] = (value >> 24) & 0xff;
      -  buf[4] = (value >> 32) & 0xff;
      -  buf[5] = (value >> 40) & 0xff;
      -  buf[6] = (value >> 48) & 0xff;
      -  buf[7] = (value >> 56) & 0xff;
      -#endif
      +  if (port::kLittleEndian) {
      +    memcpy(buf, &value, sizeof(value));
      +  } else {
      +    buf[0] = value & 0xff;
      +    buf[1] = (value >> 8) & 0xff;
      +    buf[2] = (value >> 16) & 0xff;
      +    buf[3] = (value >> 24) & 0xff;
      +    buf[4] = (value >> 32) & 0xff;
      +    buf[5] = (value >> 40) & 0xff;
      +    buf[6] = (value >> 48) & 0xff;
      +    buf[7] = (value >> 56) & 0xff;
      +  }
       }
       
       void PutFixed32(std::string* dst, uint32_t value) {
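
The matching decode is byte-order independent in the same way; a sketch modeled on leveldb's DecodeFixed32 in util/coding.h:

    #include <stdint.h>

    // Reassemble a 32-bit value from the little-endian bytes written by
    // EncodeFixed32, regardless of host byte order.
    inline uint32_t DecodeFixed32Sketch(const char* ptr) {
      return (static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
           | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
           | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
           | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24);
    }
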
      diff --git a/util/env_posix.cc b/util/env_posix.cc
      index cb1f6fc..78e09c9 100644
      --- a/util/env_posix.cc
      +++ b/util/env_posix.cc
      @@ -3,6 +3,7 @@
       // found in the LICENSE file. See the AUTHORS file for names of contributors.
       
+#include <set>
+#include <set>
       #include 
       #include 
       #include 
      @@ -23,6 +24,7 @@
       #include "leveldb/slice.h"
       #include "port/port.h"
       #include "util/logging.h"
      +#include "util/mutexlock.h"
       #include "util/posix_logger.h"
       
       namespace leveldb {
      @@ -90,18 +92,75 @@ class PosixRandomAccessFile: public RandomAccessFile {
         }
       };
       
      +// Helper class to limit mmap file usage so that we do not end up
+// running out of virtual memory or running into kernel performance
      +// problems for very large databases.
      +class MmapLimiter {
      + public:
      +  // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes.
      +  MmapLimiter() {
      +    SetAllowed(sizeof(void*) >= 8 ? 1000 : 0);
      +  }
      +
      +  // If another mmap slot is available, acquire it and return true.
      +  // Else return false.
      +  bool Acquire() {
      +    if (GetAllowed() <= 0) {
      +      return false;
      +    }
      +    MutexLock l(&mu_);
      +    intptr_t x = GetAllowed();
      +    if (x <= 0) {
      +      return false;
      +    } else {
      +      SetAllowed(x - 1);
      +      return true;
      +    }
      +  }
      +
      +  // Release a slot acquired by a previous call to Acquire() that returned true.
      +  void Release() {
      +    MutexLock l(&mu_);
      +    SetAllowed(GetAllowed() + 1);
      +  }
      +
      + private:
      +  port::Mutex mu_;
      +  port::AtomicPointer allowed_;
      +
      +  intptr_t GetAllowed() const {
+    return reinterpret_cast<intptr_t>(allowed_.Acquire_Load());
      +  }
      +
      +  // REQUIRES: mu_ must be held
      +  void SetAllowed(intptr_t v) {
+    allowed_.Release_Store(reinterpret_cast<void*>(v));
      +  }
      +
      +  MmapLimiter(const MmapLimiter&);
      +  void operator=(const MmapLimiter&);
      +};
      +
       // mmap() based random-access
       class PosixMmapReadableFile: public RandomAccessFile {
        private:
         std::string filename_;
         void* mmapped_region_;
         size_t length_;
      +  MmapLimiter* limiter_;
       
        public:
         // base[0,length-1] contains the mmapped contents of the file.
      -  PosixMmapReadableFile(const std::string& fname, void* base, size_t length)
      -      : filename_(fname), mmapped_region_(base), length_(length) { }
      -  virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); }
      +  PosixMmapReadableFile(const std::string& fname, void* base, size_t length,
      +                        MmapLimiter* limiter)
      +      : filename_(fname), mmapped_region_(base), length_(length),
      +        limiter_(limiter) {
      +  }
      +
      +  virtual ~PosixMmapReadableFile() {
      +    munmap(mmapped_region_, length_);
      +    limiter_->Release();
      +  }
       
         virtual Status Read(uint64_t offset, size_t n, Slice* result,
                             char* scratch) const {
      @@ -300,6 +359,25 @@ static int LockOrUnlock(int fd, bool lock) {
       class PosixFileLock : public FileLock {
        public:
         int fd_;
      +  std::string name_;
      +};
      +
      +// Set of locked files.  We keep a separate set instead of just
+// relying on fcntl(F_SETLK) since fcntl(F_SETLK) does not provide
      +// any protection against multiple uses from the same process.
      +class PosixLockTable {
      + private:
      +  port::Mutex mu_;
+  std::set<std::string> locked_files_;
      + public:
      +  bool Insert(const std::string& fname) {
      +    MutexLock l(&mu_);
      +    return locked_files_.insert(fname).second;
      +  }
      +  void Remove(const std::string& fname) {
      +    MutexLock l(&mu_);
      +    locked_files_.erase(fname);
      +  }
       };
       
       class PosixEnv : public Env {
      @@ -329,19 +407,21 @@ class PosixEnv : public Env {
           int fd = open(fname.c_str(), O_RDONLY);
           if (fd < 0) {
             s = IOError(fname, errno);
      -    } else if (sizeof(void*) >= 8) {
      -      // Use mmap when virtual address-space is plentiful.
      +    } else if (mmap_limit_.Acquire()) {
             uint64_t size;
             s = GetFileSize(fname, &size);
             if (s.ok()) {
               void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
               if (base != MAP_FAILED) {
      -          *result = new PosixMmapReadableFile(fname, base, size);
      +          *result = new PosixMmapReadableFile(fname, base, size, &mmap_limit_);
               } else {
                 s = IOError(fname, errno);
               }
             }
             close(fd);
      +      if (!s.ok()) {
      +        mmap_limit_.Release();
      +      }
           } else {
             *result = new PosixRandomAccessFile(fname, fd);
           }
      @@ -430,12 +510,17 @@ class PosixEnv : public Env {
           int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
           if (fd < 0) {
             result = IOError(fname, errno);
      +    } else if (!locks_.Insert(fname)) {
      +      close(fd);
      +      result = Status::IOError("lock " + fname, "already held by process");
           } else if (LockOrUnlock(fd, true) == -1) {
             result = IOError("lock " + fname, errno);
             close(fd);
      +      locks_.Remove(fname);
           } else {
             PosixFileLock* my_lock = new PosixFileLock;
             my_lock->fd_ = fd;
      +      my_lock->name_ = fname;
             *lock = my_lock;
           }
           return result;
      @@ -447,6 +532,7 @@ class PosixEnv : public Env {
           if (LockOrUnlock(my_lock->fd_, false) == -1) {
             result = IOError("unlock", errno);
           }
      +    locks_.Remove(my_lock->name_);
           close(my_lock->fd_);
           delete my_lock;
           return result;
      @@ -523,6 +609,9 @@ class PosixEnv : public Env {
         struct BGItem { void* arg; void (*function)(void*); };
   typedef std::deque<BGItem> BGQueue;
         BGQueue queue_;
      +
      +  PosixLockTable locks_;
      +  MmapLimiter mmap_limit_;
       };
       
       PosixEnv::PosixEnv() : page_size_(getpagesize()),
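
Worth noting: Acquire() above is a small optimistic-check pattern, where a racy lock-free read filters out the common "no slots left" case and the mutex makes the actual decrement safe. A usage sketch (file handling elided):

    // Sketch: prefer mmap while slots last, fall back to pread-style I/O.
    MmapLimiter limiter;

    void OpenForRandomAccess() {
      if (limiter.Acquire()) {
        // ... mmap the file; PosixMmapReadableFile's destructor calls
        // limiter.Release() when the mapping goes away ...
      } else {
        // ... fall back to a plain file-descriptor reader ...
      }
    }
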
      diff --git a/util/mutexlock.h b/util/mutexlock.h
      index c3f3306..1ff5a9e 100644
      --- a/util/mutexlock.h
      +++ b/util/mutexlock.h
      @@ -6,6 +6,7 @@
       #define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
       
       #include "port/port.h"
      +#include "port/thread_annotations.h"
       
       namespace leveldb {
       
      @@ -19,12 +20,13 @@ namespace leveldb {
       //     ... some complex code, possibly with multiple return paths ...
       //   }
       
      -class MutexLock {
      +class SCOPED_LOCKABLE MutexLock {
        public:
      -  explicit MutexLock(port::Mutex *mu) : mu_(mu) {
      +  explicit MutexLock(port::Mutex *mu) EXCLUSIVE_LOCK_FUNCTION(mu)
      +      : mu_(mu)  {
           this->mu_->Lock();
         }
      -  ~MutexLock() { this->mu_->Unlock(); }
      +  ~MutexLock() UNLOCK_FUNCTION() { this->mu_->Unlock(); }
       
        private:
         port::Mutex *const mu_;
      -- 
      cgit v1.2.3