aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordgrogan@chromium.org <dgrogan@chromium.org@62dab493-f737-651d-591e-8d6aee1b9529>2011-04-12 19:38:58 +0000
committerdgrogan@chromium.org <dgrogan@chromium.org@62dab493-f737-651d-591e-8d6aee1b9529>2011-04-12 19:38:58 +0000
commit95e21f32367748825123e382172ecbfd492ddb23 (patch)
treeca48db37de2515489323a99272729a1d9f5be254
parent1ca60b12c68a71aac695b15e329b2a76a63cbb0a (diff)
downloadsrc-95e21f32367748825123e382172ecbfd492ddb23.tar.gz
@20602303. Default file permission is now 755.
git-svn-id: http://leveldb.googlecode.com/svn/trunk@20 62dab493-f737-651d-591e-8d6aee1b9529
-rwxr-xr-x[-rw-r--r--]AUTHORS0
-rwxr-xr-x[-rw-r--r--]LICENSE0
-rwxr-xr-x[-rw-r--r--]Makefile0
-rwxr-xr-x[-rw-r--r--]README0
-rwxr-xr-x[-rw-r--r--]TODO8
-rwxr-xr-x[-rw-r--r--]db/builder.cc0
-rwxr-xr-x[-rw-r--r--]db/builder.h0
-rwxr-xr-x[-rw-r--r--]db/corruption_test.cc21
-rwxr-xr-x[-rw-r--r--]db/db_bench.cc180
-rwxr-xr-x[-rw-r--r--]db/db_impl.cc270
-rwxr-xr-x[-rw-r--r--]db/db_impl.h35
-rwxr-xr-x[-rw-r--r--]db/db_iter.cc0
-rwxr-xr-x[-rw-r--r--]db/db_iter.h0
-rwxr-xr-x[-rw-r--r--]db/db_test.cc27
-rwxr-xr-x[-rw-r--r--]db/dbformat.cc0
-rwxr-xr-x[-rw-r--r--]db/dbformat.h6
-rwxr-xr-x[-rw-r--r--]db/dbformat_test.cc0
-rwxr-xr-x[-rw-r--r--]db/filename.cc0
-rwxr-xr-x[-rw-r--r--]db/filename.h0
-rwxr-xr-x[-rw-r--r--]db/filename_test.cc0
-rwxr-xr-x[-rw-r--r--]db/log_format.h0
-rwxr-xr-x[-rw-r--r--]db/log_reader.cc0
-rwxr-xr-x[-rw-r--r--]db/log_reader.h0
-rwxr-xr-x[-rw-r--r--]db/log_test.cc0
-rwxr-xr-x[-rw-r--r--]db/log_writer.cc0
-rwxr-xr-x[-rw-r--r--]db/log_writer.h0
-rwxr-xr-x[-rw-r--r--]db/memtable.cc0
-rwxr-xr-x[-rw-r--r--]db/memtable.h0
-rwxr-xr-x[-rw-r--r--]db/repair.cc0
-rwxr-xr-x[-rw-r--r--]db/skiplist.h0
-rwxr-xr-x[-rw-r--r--]db/skiplist_test.cc0
-rwxr-xr-x[-rw-r--r--]db/snapshot.h0
-rwxr-xr-x[-rw-r--r--]db/table_cache.cc0
-rwxr-xr-x[-rw-r--r--]db/table_cache.h0
-rwxr-xr-x[-rw-r--r--]db/version_edit.cc19
-rwxr-xr-x[-rw-r--r--]db/version_edit.h6
-rwxr-xr-x[-rw-r--r--]db/version_edit_test.cc0
-rwxr-xr-x[-rw-r--r--]db/version_set.cc116
-rwxr-xr-x[-rw-r--r--]db/version_set.h35
-rwxr-xr-x[-rw-r--r--]db/write_batch.cc0
-rwxr-xr-x[-rw-r--r--]db/write_batch_internal.h0
-rwxr-xr-x[-rw-r--r--]db/write_batch_test.cc0
-rwxr-xr-x[-rw-r--r--]doc/doc.css0
-rwxr-xr-x[-rw-r--r--]doc/impl.html0
-rwxr-xr-x[-rw-r--r--]doc/index.html82
-rwxr-xr-x[-rw-r--r--]doc/log_format.txt0
-rwxr-xr-x[-rw-r--r--]doc/table_format.txt0
-rwxr-xr-x[-rw-r--r--]include/leveldb/cache.h0
-rwxr-xr-x[-rw-r--r--]include/leveldb/comparator.h0
-rwxr-xr-x[-rw-r--r--]include/leveldb/db.h12
-rwxr-xr-x[-rw-r--r--]include/leveldb/env.h0
-rwxr-xr-x[-rw-r--r--]include/leveldb/iterator.h0
-rwxr-xr-x[-rw-r--r--]include/leveldb/options.h27
-rwxr-xr-x[-rw-r--r--]include/leveldb/slice.h0
-rwxr-xr-x[-rw-r--r--]include/leveldb/status.h0
-rwxr-xr-x[-rw-r--r--]include/leveldb/table.h0
-rwxr-xr-x[-rw-r--r--]include/leveldb/table_builder.h0
-rwxr-xr-x[-rw-r--r--]include/leveldb/write_batch.h0
-rwxr-xr-x[-rw-r--r--]leveldb.gyp0
-rwxr-xr-x[-rw-r--r--]port/README0
-rwxr-xr-x[-rw-r--r--]port/port.h0
-rwxr-xr-x[-rw-r--r--]port/port_android.cc1
-rwxr-xr-x[-rw-r--r--]port/port_android.h50
-rwxr-xr-x[-rw-r--r--]port/port_chromium.cc0
-rwxr-xr-x[-rw-r--r--]port/port_chromium.h0
-rwxr-xr-x[-rw-r--r--]port/port_example.h0
-rwxr-xr-x[-rw-r--r--]port/port_posix.cc0
-rwxr-xr-x[-rw-r--r--]port/port_posix.h0
-rwxr-xr-x[-rw-r--r--]port/sha1_portable.cc0
-rwxr-xr-x[-rw-r--r--]port/sha1_portable.h0
-rwxr-xr-x[-rw-r--r--]port/sha1_test.cc0
-rwxr-xr-x[-rw-r--r--]port/win/stdint.h0
-rwxr-xr-x[-rw-r--r--]table/block.cc0
-rwxr-xr-x[-rw-r--r--]table/block.h0
-rwxr-xr-x[-rw-r--r--]table/block_builder.cc0
-rwxr-xr-x[-rw-r--r--]table/block_builder.h0
-rwxr-xr-x[-rw-r--r--]table/format.cc0
-rwxr-xr-x[-rw-r--r--]table/format.h0
-rwxr-xr-x[-rw-r--r--]table/iterator.cc0
-rwxr-xr-x[-rw-r--r--]table/iterator_wrapper.h0
-rwxr-xr-x[-rw-r--r--]table/merger.cc0
-rwxr-xr-x[-rw-r--r--]table/merger.h0
-rwxr-xr-x[-rw-r--r--]table/table.cc0
-rwxr-xr-x[-rw-r--r--]table/table_builder.cc0
-rwxr-xr-x[-rw-r--r--]table/table_test.cc4
-rwxr-xr-x[-rw-r--r--]table/two_level_iterator.cc0
-rwxr-xr-x[-rw-r--r--]table/two_level_iterator.h0
-rwxr-xr-x[-rw-r--r--]util/arena.cc0
-rwxr-xr-x[-rw-r--r--]util/arena.h0
-rwxr-xr-x[-rw-r--r--]util/arena_test.cc0
-rwxr-xr-x[-rw-r--r--]util/cache.cc0
-rwxr-xr-x[-rw-r--r--]util/cache_test.cc0
-rwxr-xr-x[-rw-r--r--]util/coding.cc0
-rwxr-xr-x[-rw-r--r--]util/coding.h0
-rwxr-xr-x[-rw-r--r--]util/coding_test.cc0
-rwxr-xr-x[-rw-r--r--]util/comparator.cc0
-rwxr-xr-x[-rw-r--r--]util/crc32c.cc0
-rwxr-xr-x[-rw-r--r--]util/crc32c.h0
-rwxr-xr-x[-rw-r--r--]util/crc32c_test.cc0
-rwxr-xr-x[-rw-r--r--]util/env.cc0
-rwxr-xr-x[-rw-r--r--]util/env_chromium.cc0
-rwxr-xr-x[-rw-r--r--]util/env_posix.cc0
-rwxr-xr-x[-rw-r--r--]util/env_test.cc0
-rwxr-xr-x[-rw-r--r--]util/hash.cc0
-rwxr-xr-x[-rw-r--r--]util/hash.h0
-rwxr-xr-x[-rw-r--r--]util/histogram.cc0
-rwxr-xr-x[-rw-r--r--]util/histogram.h0
-rwxr-xr-x[-rw-r--r--]util/logging.cc0
-rwxr-xr-x[-rw-r--r--]util/logging.h0
-rwxr-xr-x[-rw-r--r--]util/mutexlock.h0
-rwxr-xr-x[-rw-r--r--]util/options.cc4
-rwxr-xr-x[-rw-r--r--]util/random.h0
-rwxr-xr-x[-rw-r--r--]util/status.cc0
-rwxr-xr-x[-rw-r--r--]util/testharness.cc0
-rwxr-xr-x[-rw-r--r--]util/testharness.h0
-rwxr-xr-x[-rw-r--r--]util/testutil.cc0
-rwxr-xr-x[-rw-r--r--]util/testutil.h0
117 files changed, 628 insertions, 275 deletions
diff --git a/AUTHORS b/AUTHORS
index 27a9407..27a9407 100644..100755
--- a/AUTHORS
+++ b/AUTHORS
diff --git a/LICENSE b/LICENSE
index 8e80208..8e80208 100644..100755
--- a/LICENSE
+++ b/LICENSE
diff --git a/Makefile b/Makefile
index 7569701..7569701 100644..100755
--- a/Makefile
+++ b/Makefile
diff --git a/README b/README
index c97e43c..c97e43c 100644..100755
--- a/README
+++ b/README
diff --git a/TODO b/TODO
index e17dfdb..2f848b8 100644..100755
--- a/TODO
+++ b/TODO
@@ -1,11 +1,3 @@
-Before adding to chrome
------------------------
-- multi-threaded test/benchmark
-- Allow missing crc32c in Table format?
-
-Maybe afterwards
-----------------
-
ss
- Stats
diff --git a/db/builder.cc b/db/builder.cc
index 6c8e6b8..6c8e6b8 100644..100755
--- a/db/builder.cc
+++ b/db/builder.cc
diff --git a/db/builder.h b/db/builder.h
index 4efcb04..4efcb04 100644..100755
--- a/db/builder.h
+++ b/db/builder.h
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
index de9408c..63d8d8b 100644..100755
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -8,6 +8,7 @@
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
+#include "leveldb/cache.h"
#include "leveldb/env.h"
#include "leveldb/table.h"
#include "leveldb/write_batch.h"
@@ -28,10 +29,12 @@ class CorruptionTest {
test::ErrorEnv env_;
Random rnd_;
std::string dbname_;
+ Cache* tiny_cache_;
Options options_;
DB* db_;
CorruptionTest() : rnd_(test::RandomSeed()) {
+ tiny_cache_ = NewLRUCache(100);
options_.env = &env_;
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, options_);
@@ -45,6 +48,7 @@ class CorruptionTest {
~CorruptionTest() {
delete db_;
DestroyDB(dbname_, Options());
+ delete tiny_cache_;
}
Status TryReopen(Options* options = NULL) {
@@ -52,6 +56,7 @@ class CorruptionTest {
db_ = NULL;
Options opt = (options ? *options : options_);
opt.env = &env_;
+ opt.block_cache = tiny_cache_;
return DB::Open(opt, dbname_, &db_);
}
@@ -160,12 +165,15 @@ class CorruptionTest {
ASSERT_TRUE(s.ok()) << s.ToString();
}
- uint64_t Property(const std::string& name) {
- uint64_t result;
- if (!db_->GetProperty(name, &result)) {
- result = ~static_cast<uint64_t>(0);
+ int Property(const std::string& name) {
+ std::string property;
+ int result;
+ if (db_->GetProperty(name, &property) &&
+ sscanf(property.c_str(), "%d", &result) == 1) {
+ return result;
+ } else {
+ return -1;
}
- return result;
}
// Return the ith key
@@ -235,7 +243,7 @@ TEST(CorruptionTest, TableFileIndexData) {
dbi->TEST_CompactRange(0, "", "~");
dbi->TEST_CompactRange(1, "", "~");
- Corrupt(kTableFile, -1000, 500);
+ Corrupt(kTableFile, -2000, 500);
Reopen();
Check(5000, 9999);
}
@@ -327,6 +335,7 @@ TEST(CorruptionTest, CompactionInputError) {
TEST(CorruptionTest, CompactionInputErrorParanoid) {
Options options;
options.paranoid_checks = true;
+ options.write_buffer_size = 1048576;
Reopen(&options);
Build(10);
diff --git a/db/db_bench.cc b/db/db_bench.cc
index 411493c..849ebfa 100644..100755
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -31,11 +31,8 @@
// sha1 -- repeated SHA1 computation over 4K of data
// Meta operations:
// compact -- Compact the entire DB
+// stats -- Print DB stats
// heapprofile -- Dump a heap profile (if supported by this port)
-// sync -- switch to synchronous writes (not the default)
-// nosync -- switch to asynchronous writes (the default)
-// tenth -- divide N by 10 (i.e., following benchmarks are smaller)
-// normal -- reset N back to its normal value (1000000)
static const char* FLAGS_benchmarks =
"fillseq,"
"fillsync,"
@@ -51,7 +48,9 @@ static const char* FLAGS_benchmarks =
"readreverse,"
"fill100K,"
"crc32c,"
- "sha1"
+ "sha1,"
+ "snappycomp,"
+ "snappyuncomp,"
;
// Number of key/values to place in database
@@ -68,7 +67,12 @@ static double FLAGS_compression_ratio = 0.5;
static bool FLAGS_histogram = false;
// Number of bytes to buffer in memtable before compacting
-static int FLAGS_write_buffer_size = 1 << 20;
+// (initialized to default value by "main")
+static int FLAGS_write_buffer_size = 0;
+
+// Number of bytes to use as a cache of uncompressed data.
+// Negative means use default settings.
+static int FLAGS_cache_size = -1;
namespace leveldb {
@@ -129,6 +133,7 @@ class Benchmark {
double last_op_finish_;
int64_t bytes_;
std::string message_;
+ std::string post_message_;
Histogram hist_;
RandomGenerator gen_;
Random rand_;
@@ -146,7 +151,8 @@ class Benchmark {
static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
fprintf(stdout, "Entries: %d\n", num_);
fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
- (((kKeySize + FLAGS_value_size) * num_) / 1048576.0));
+ ((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_)
+ / 1048576.0));
fprintf(stdout, "FileSize: %.1f MB (estimated)\n",
(((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
/ 1048576.0));
@@ -164,6 +170,15 @@ class Benchmark {
fprintf(stdout,
"WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
+
+ // See if snappy is working by attempting to compress a compressible string
+ const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy";
+ std::string compressed;
+ if (!port::Snappy_Compress(text, sizeof(text), &compressed)) {
+ fprintf(stdout, "WARNING: Snappy compression is not enabled\n");
+ } else if (compressed.size() >= sizeof(text)) {
+ fprintf(stdout, "WARNING: Snappy compression is not effective\n");
+ }
}
void PrintEnvironment() {
@@ -225,15 +240,13 @@ class Benchmark {
done_++;
if (done_ >= next_report_) {
- if (next_report_ < 1000) {
- next_report_ += 100;
- } else if (next_report_ < 10000) {
- next_report_ += 1000;
- } else if (next_report_ < 100000) {
- next_report_ += 10000;
- } else {
- next_report_ += 100000;
- }
+ if (next_report_ < 1000) next_report_ += 100;
+ else if (next_report_ < 5000) next_report_ += 500;
+ else if (next_report_ < 10000) next_report_ += 1000;
+ else if (next_report_ < 50000) next_report_ += 5000;
+ else if (next_report_ < 100000) next_report_ += 10000;
+ else if (next_report_ < 500000) next_report_ += 50000;
+ else next_report_ += 100000;
fprintf(stderr, "... finished %d ops%30s\r", done_, "");
fflush(stderr);
}
@@ -248,7 +261,7 @@ class Benchmark {
if (bytes_ > 0) {
char rate[100];
- snprintf(rate, sizeof(rate), "%5.1f MB/s",
+ snprintf(rate, sizeof(rate), "%6.1f MB/s",
(bytes_ / 1048576.0) / (finish - start_));
if (!message_.empty()) {
message_ = std::string(rate) + " " + message_;
@@ -266,6 +279,11 @@ class Benchmark {
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
}
fflush(stdout);
+
+ if (!post_message_.empty()) {
+ fprintf(stdout, "\n%s\n", post_message_.c_str());
+ post_message_.clear();
+ }
}
public:
@@ -278,12 +296,13 @@ class Benchmark {
EXISTING
};
- Benchmark() : cache_(NewLRUCache(200<<20)),
- db_(NULL),
- num_(FLAGS_num),
- heap_counter_(0),
- bytes_(0),
- rand_(301) {
+ Benchmark()
+ : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
+ db_(NULL),
+ num_(FLAGS_num),
+ heap_counter_(0),
+ bytes_(0),
+ rand_(301) {
std::vector<std::string> files;
Env::Default()->GetChildren("/tmp/dbbench", &files);
for (int i = 0; i < files.size(); i++) {
@@ -318,36 +337,54 @@ class Benchmark {
Start();
WriteOptions write_options;
- write_options.sync = false;
+ bool known = true;
if (name == Slice("fillseq")) {
- Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size);
+ Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
+ } else if (name == Slice("fillbatch")) {
+ Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000);
} else if (name == Slice("fillrandom")) {
- Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size);
+ Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size, 1);
} else if (name == Slice("overwrite")) {
- Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size);
+ Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size, 1);
} else if (name == Slice("fillsync")) {
write_options.sync = true;
- Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size);
+ Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1);
} else if (name == Slice("fill100K")) {
- Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000);
+ Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1);
} else if (name == Slice("readseq")) {
ReadSequential();
} else if (name == Slice("readreverse")) {
ReadReverse();
} else if (name == Slice("readrandom")) {
ReadRandom();
+ } else if (name == Slice("readrandomsmall")) {
+ int n = num_;
+ num_ /= 1000;
+ ReadRandom();
+ num_ = n;
} else if (name == Slice("compact")) {
Compact();
} else if (name == Slice("crc32c")) {
Crc32c(4096, "(4K per op)");
} else if (name == Slice("sha1")) {
SHA1(4096, "(4K per op)");
+ } else if (name == Slice("snappycomp")) {
+ SnappyCompress();
+ } else if (name == Slice("snappyuncomp")) {
+ SnappyUncompress();
} else if (name == Slice("heapprofile")) {
HeapProfile();
+ } else if (name == Slice("stats")) {
+ PrintStats();
} else {
- fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
+ known = false;
+ if (name != Slice()) { // No error message for empty name
+ fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
+ }
+ }
+ if (known) {
+ Stop(name);
}
- Stop(name);
}
}
@@ -387,11 +424,54 @@ class Benchmark {
message_ = label;
}
+ void SnappyCompress() {
+ Slice input = gen_.Generate(Options().block_size);
+ int64_t bytes = 0;
+ int64_t produced = 0;
+ bool ok = true;
+ std::string compressed;
+ while (ok && bytes < 1024 * 1048576) { // Compress 1G
+ ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
+ produced += compressed.size();
+ bytes += input.size();
+ FinishedSingleOp();
+ }
+
+ if (!ok) {
+ message_ = "(snappy failure)";
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "(output: %.1f%%)",
+ (produced * 100.0) / bytes);
+ message_ = buf;
+ bytes_ = bytes;
+ }
+ }
+
+ void SnappyUncompress() {
+ Slice input = gen_.Generate(Options().block_size);
+ std::string compressed;
+ bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
+ int64_t bytes = 0;
+ std::string uncompressed;
+ while (ok && bytes < 1024 * 1048576) { // Compress 1G
+ ok = port::Snappy_Uncompress(compressed.data(), compressed.size(),
+ &uncompressed);
+ bytes += uncompressed.size();
+ FinishedSingleOp();
+ }
+
+ if (!ok) {
+ message_ = "(snappy failure)";
+ } else {
+ bytes_ = bytes;
+ }
+ }
+
void Open() {
assert(db_ == NULL);
Options options;
options.create_if_missing = true;
- options.max_open_files = 10000;
options.block_cache = cache_;
options.write_buffer_size = FLAGS_write_buffer_size;
Status s = DB::Open(options, "/tmp/dbbench", &db_);
@@ -402,7 +482,7 @@ class Benchmark {
}
void Write(const WriteOptions& options, Order order, DBState state,
- int num_entries, int value_size) {
+ int num_entries, int value_size, int entries_per_batch) {
if (state == FRESH) {
delete db_;
db_ = NULL;
@@ -420,19 +500,21 @@ class Benchmark {
WriteBatch batch;
Status s;
std::string val;
- for (int i = 0; i < num_entries; i++) {
- const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
- char key[100];
- snprintf(key, sizeof(key), "%016d", k);
+ for (int i = 0; i < num_entries; i += entries_per_batch) {
batch.Clear();
- batch.Put(key, gen_.Generate(value_size));
+ for (int j = 0; j < entries_per_batch; j++) {
+ const int k = (order == SEQUENTIAL) ? i+j : (rand_.Next() % FLAGS_num);
+ char key[100];
+ snprintf(key, sizeof(key), "%016d", k);
+ batch.Put(key, gen_.Generate(value_size));
+ bytes_ += value_size + strlen(key);
+ FinishedSingleOp();
+ }
s = db_->Write(options, &batch);
- bytes_ += value_size + strlen(key);
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
- FinishedSingleOp();
}
}
@@ -475,10 +557,10 @@ class Benchmark {
dbi->TEST_CompactMemTable();
int max_level_with_files = 1;
for (int level = 1; level < config::kNumLevels; level++) {
- uint64_t v;
+ std::string property;
char name[100];
snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
- if (db_->GetProperty(name, &v) && v > 0) {
+ if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) {
max_level_with_files = level;
}
}
@@ -487,6 +569,15 @@ class Benchmark {
}
}
+ void PrintStats() {
+ std::string stats;
+ if (!db_->GetProperty("leveldb.stats", &stats)) {
+ message_ = "(failed)";
+ } else {
+ post_message_ = stats;
+ }
+ }
+
static void WriteToFile(void* arg, const char* buf, int n) {
reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
}
@@ -512,6 +603,7 @@ class Benchmark {
}
int main(int argc, char** argv) {
+ FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
for (int i = 1; i < argc; i++) {
double d;
int n;
@@ -529,7 +621,9 @@ int main(int argc, char** argv) {
FLAGS_value_size = n;
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
FLAGS_write_buffer_size = n;
- } else {
+ } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
+ FLAGS_cache_size = n;
+ } else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1);
}
diff --git a/db/db_impl.cc b/db/db_impl.cc
index cf5471b..d012236 100644..100755
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -104,6 +104,9 @@ Options SanitizeOptions(const std::string& dbname,
result.info_log = new NullWritableFile;
}
}
+ if (result.block_cache == NULL) {
+ result.block_cache = NewLRUCache(8 << 20);
+ }
return result;
}
@@ -112,18 +115,20 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
internal_comparator_(options.comparator),
options_(SanitizeOptions(dbname, &internal_comparator_, options)),
owns_info_log_(options_.info_log != options.info_log),
+ owns_cache_(options_.block_cache != options.block_cache),
dbname_(dbname),
db_lock_(NULL),
shutting_down_(NULL),
bg_cv_(&mutex_),
compacting_cv_(&mutex_),
- last_sequence_(0),
mem_(new MemTable(internal_comparator_)),
+ imm_(NULL),
logfile_(NULL),
log_(NULL),
- log_number_(0),
bg_compaction_scheduled_(false),
compacting_(false) {
+ has_imm_.Release_Store(NULL);
+
// Reserve ten files or so for other uses and give the rest to TableCache.
const int table_cache_size = options.max_open_files - 10;
table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
@@ -149,6 +154,7 @@ DBImpl::~DBImpl() {
delete versions_;
delete mem_;
+ delete imm_;
delete log_;
delete logfile_;
delete table_cache_;
@@ -156,15 +162,15 @@ DBImpl::~DBImpl() {
if (owns_info_log_) {
delete options_.info_log;
}
+ if (owns_cache_) {
+ delete options_.block_cache;
+ }
}
Status DBImpl::NewDB() {
- assert(log_number_ == 0);
- assert(last_sequence_ == 0);
-
VersionEdit new_db;
new_db.SetComparatorName(user_comparator()->Name());
- new_db.SetLogNumber(log_number_);
+ new_db.SetLogNumber(0);
new_db.SetNextFile(2);
new_db.SetLastSequence(0);
@@ -193,15 +199,6 @@ Status DBImpl::NewDB() {
return s;
}
-Status DBImpl::Install(VersionEdit* edit,
- uint64_t new_log_number,
- MemTable* cleanup_mem) {
- mutex_.AssertHeld();
- edit->SetLogNumber(new_log_number);
- edit->SetLastSequence(last_sequence_);
- return versions_->LogAndApply(edit, cleanup_mem);
-}
-
void DBImpl::MaybeIgnoreError(Status* s) const {
if (s->ok() || options_.paranoid_checks) {
// No change needed
@@ -216,7 +213,7 @@ void DBImpl::DeleteObsoleteFiles() {
std::set<uint64_t> live = pending_outputs_;
versions_->AddLiveFiles(&live);
- versions_->CleanupLargeValueRefs(live, log_number_);
+ versions_->CleanupLargeValueRefs(live);
std::vector<std::string> filenames;
env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
@@ -228,7 +225,8 @@ void DBImpl::DeleteObsoleteFiles() {
bool keep = true;
switch (type) {
case kLogFile:
- keep = (number == log_number_);
+ keep = ((number == versions_->LogNumber()) ||
+ (number == versions_->PrevLogNumber()));
break;
case kDescriptorFile:
// Keep my manifest file, and any newer incarnations'
@@ -296,16 +294,20 @@ Status DBImpl::Recover(VersionEdit* edit) {
}
}
- s = versions_->Recover(&log_number_, &last_sequence_);
+ s = versions_->Recover();
if (s.ok()) {
- // Recover from the log file named in the descriptor
+ // Recover from the log files named in the descriptor
SequenceNumber max_sequence(0);
- if (log_number_ != 0) { // log_number_ == 0 indicates initial empty state
- s = RecoverLogFile(log_number_, edit, &max_sequence);
+ if (versions_->PrevLogNumber() != 0) { // log#==0 means no prev log
+ s = RecoverLogFile(versions_->PrevLogNumber(), edit, &max_sequence);
+ }
+ if (s.ok() && versions_->LogNumber() != 0) { // log#==0 for initial state
+ s = RecoverLogFile(versions_->LogNumber(), edit, &max_sequence);
}
if (s.ok()) {
- last_sequence_ =
- last_sequence_ > max_sequence ? last_sequence_ : max_sequence;
+ if (versions_->LastSequence() < max_sequence) {
+ versions_->SetLastSequence(max_sequence);
+ }
}
}
@@ -407,56 +409,58 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) {
mutex_.AssertHeld();
+ const uint64_t start_micros = env_->NowMicros();
FileMetaData meta;
meta.number = versions_->NewFileNumber();
pending_outputs_.insert(meta.number);
Iterator* iter = mem->NewIterator();
Log(env_, options_.info_log, "Level-0 table #%llu: started",
(unsigned long long) meta.number);
- Status s = BuildTable(dbname_, env_, options_, table_cache_,
- iter, &meta, edit);
+
+ Status s;
+ {
+ mutex_.Unlock();
+ s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, edit);
+ mutex_.Lock();
+ }
+
Log(env_, options_.info_log, "Level-0 table #%llu: %lld bytes %s",
(unsigned long long) meta.number,
(unsigned long long) meta.file_size,
s.ToString().c_str());
delete iter;
pending_outputs_.erase(meta.number);
+
+ CompactionStats stats;
+ stats.micros = env_->NowMicros() - start_micros;
+ stats.bytes_written = meta.file_size;
+ stats_[0].Add(stats);
return s;
}
Status DBImpl::CompactMemTable() {
mutex_.AssertHeld();
-
- WritableFile* lfile = NULL;
- uint64_t new_log_number = versions_->NewFileNumber();
-
- VersionEdit edit;
+ assert(imm_ != NULL);
+ assert(compacting_);
// Save the contents of the memtable as a new Table
- Status s = WriteLevel0Table(mem_, &edit);
- if (s.ok()) {
- s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
- }
+ VersionEdit edit;
+ Status s = WriteLevel0Table(imm_, &edit);
- // Save a new descriptor with the new table and log number.
+ // Replace immutable memtable with the generated Table
if (s.ok()) {
- s = Install(&edit, new_log_number, mem_);
+ edit.SetPrevLogNumber(0);
+ s = versions_->LogAndApply(&edit, imm_);
}
if (s.ok()) {
// Commit to the new state
- mem_ = new MemTable(internal_comparator_);
- delete log_;
- delete logfile_;
- logfile_ = lfile;
- log_ = new log::Writer(lfile);
- log_number_ = new_log_number;
+ imm_ = NULL;
+ has_imm_.Release_Store(NULL);
DeleteObsoleteFiles();
- MaybeScheduleCompaction();
- } else {
- delete lfile;
- env_->DeleteFile(LogFileName(dbname_, new_log_number));
}
+
+ compacting_cv_.SignalAll(); // Wake up waiter even if there was an error
return s;
}
@@ -485,7 +489,17 @@ void DBImpl::TEST_CompactRange(
Status DBImpl::TEST_CompactMemTable() {
MutexLock l(&mutex_);
- return CompactMemTable();
+ Status s = MakeRoomForWrite(true /* force compaction */);
+ if (s.ok()) {
+ // Wait until the compaction completes
+ while (imm_ != NULL && bg_error_.ok()) {
+ compacting_cv_.Wait();
+ }
+ if (imm_ != NULL) {
+ s = bg_error_;
+ }
+ }
+ return s;
}
void DBImpl::MaybeScheduleCompaction() {
@@ -496,7 +510,7 @@ void DBImpl::MaybeScheduleCompaction() {
// Some other thread is running a compaction. Do not conflict with it.
} else if (shutting_down_.Acquire_Load()) {
// DB is being deleted; no more background compactions
- } else if (!versions_->NeedsCompaction()) {
+ } else if (imm_ == NULL && !versions_->NeedsCompaction()) {
// No work to be done
} else {
bg_compaction_scheduled_ = true;
@@ -525,6 +539,16 @@ void DBImpl::BackgroundCall() {
void DBImpl::BackgroundCompaction() {
mutex_.AssertHeld();
+ assert(!compacting_);
+
+ if (imm_ != NULL) {
+ compacting_ = true;
+ CompactMemTable();
+ compacting_ = false;
+ compacting_cv_.SignalAll();
+ return;
+ }
+
Compaction* c = versions_->PickCompaction();
if (c == NULL) {
// Nothing to do
@@ -539,7 +563,7 @@ void DBImpl::BackgroundCompaction() {
c->edit()->DeleteFile(c->level(), f->number);
c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
f->smallest, f->largest);
- status = Install(c->edit(), log_number_, NULL);
+ status = versions_->LogAndApply(c->edit(), NULL);
Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n",
static_cast<unsigned long long>(f->number),
c->level() + 1,
@@ -680,7 +704,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
}
compact->outputs.clear();
- Status s = Install(compact->compaction->edit(), log_number_, NULL);
+ Status s = versions_->LogAndApply(compact->compaction->edit(), NULL);
if (s.ok()) {
compact->compaction->ReleaseInputs();
DeleteObsoleteFiles();
@@ -694,6 +718,9 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
}
Status DBImpl::DoCompactionWork(CompactionState* compact) {
+ const uint64_t start_micros = env_->NowMicros();
+ int64_t imm_micros = 0; // Micros spent doing imm_ compactions
+
Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files",
compact->compaction->num_input_files(0),
compact->compaction->level(),
@@ -704,7 +731,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
assert(compact->builder == NULL);
assert(compact->outfile == NULL);
if (snapshots_.empty()) {
- compact->smallest_snapshot = last_sequence_;
+ compact->smallest_snapshot = versions_->LastSequence();
} else {
compact->smallest_snapshot = snapshots_.oldest()->number_;
}
@@ -721,6 +748,18 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
bool has_current_user_key = false;
SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
+ // Prioritize immutable compaction work
+ if (has_imm_.NoBarrier_Load() != NULL) {
+ const uint64_t imm_start = env_->NowMicros();
+ mutex_.Lock();
+ if (imm_ != NULL) {
+ CompactMemTable();
+ compacting_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
+ }
+ mutex_.Unlock();
+ imm_micros += (env_->NowMicros() - imm_start);
+ }
+
Slice key = input->key();
InternalKey tmp_internal_key;
tmp_internal_key.DecodeFrom(key);
@@ -835,7 +874,19 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
delete input;
input = NULL;
+ CompactionStats stats;
+ stats.micros = env_->NowMicros() - start_micros - imm_micros;
+ for (int which = 0; which < 2; which++) {
+ for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
+ stats.bytes_read += compact->compaction->input(which, i)->file_size;
+ }
+ }
+ for (int i = 0; i < compact->outputs.size(); i++) {
+ stats.bytes_written += compact->outputs[i].file_size;
+ }
+
mutex_.Lock();
+ stats_[compact->compaction->level() + 1].Add(stats);
if (status.ok()) {
status = InstallCompactionResults(compact);
@@ -848,11 +899,14 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
SequenceNumber* latest_snapshot) {
mutex_.Lock();
- *latest_snapshot = last_sequence_;
+ *latest_snapshot = versions_->LastSequence();
// Collect together all needed child iterators
std::vector<Iterator*> list;
list.push_back(mem_->NewIterator());
+ if (imm_ != NULL) {
+ list.push_back(imm_->NewIterator());
+ }
versions_->current()->AddIterators(options, &list);
Iterator* internal_iter =
NewMergingIterator(&internal_comparator_, &list[0], list.size());
@@ -912,7 +966,7 @@ void DBImpl::Unref(void* arg1, void* arg2) {
const Snapshot* DBImpl::GetSnapshot() {
MutexLock l(&mutex_);
- return snapshots_.New(last_sequence_);
+ return snapshots_.New(versions_->LastSequence());
}
void DBImpl::ReleaseSnapshot(const Snapshot* s) {
@@ -935,17 +989,16 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
WriteBatch* final = NULL;
{
MutexLock l(&mutex_);
- if (!bg_error_.ok()) {
- status = bg_error_;
- } else if (mem_->ApproximateMemoryUsage() > options_.write_buffer_size) {
- status = CompactMemTable();
- }
+ status = MakeRoomForWrite(false); // May temporarily release lock and wait
+
+ uint64_t last_sequence = versions_->LastSequence();
if (status.ok()) {
- status = HandleLargeValues(last_sequence_ + 1, updates, &final);
+ status = HandleLargeValues(last_sequence + 1, updates, &final);
}
if (status.ok()) {
- WriteBatchInternal::SetSequence(final, last_sequence_ + 1);
- last_sequence_ += WriteBatchInternal::Count(final);
+ WriteBatchInternal::SetSequence(final, last_sequence + 1);
+ last_sequence += WriteBatchInternal::Count(final);
+ versions_->SetLastSequence(last_sequence);
// Add to log and apply to memtable
status = log_->AddRecord(WriteBatchInternal::Contents(final));
@@ -959,7 +1012,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
if (options.post_write_snapshot != NULL) {
*options.post_write_snapshot =
- status.ok() ? snapshots_.New(last_sequence_) : NULL;
+ status.ok() ? snapshots_.New(last_sequence) : NULL;
}
}
if (final != updates) {
@@ -969,6 +1022,54 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
return status;
}
+Status DBImpl::MakeRoomForWrite(bool force) {
+ mutex_.AssertHeld();
+ Status s;
+ while (true) {
+ if (!bg_error_.ok()) {
+ // Yield previous error
+ s = bg_error_;
+ break;
+ } else if (!force &&
+ (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
+ // There is room in current memtable
+ break;
+ } else if (imm_ != NULL) {
+ // We have filled up the current memtable, but the previous
+ // one is still being compacted, so we wait.
+ compacting_cv_.Wait();
+ } else {
+ // Attempt to switch to a new memtable and trigger compaction of old
+ assert(versions_->PrevLogNumber() == 0);
+ uint64_t new_log_number = versions_->NewFileNumber();
+ WritableFile* lfile = NULL;
+ s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
+ if (!s.ok()) {
+ break;
+ }
+ VersionEdit edit;
+ edit.SetPrevLogNumber(versions_->LogNumber());
+ edit.SetLogNumber(new_log_number);
+ s = versions_->LogAndApply(&edit, NULL);
+ if (!s.ok()) {
+ delete lfile;
+ env_->DeleteFile(LogFileName(dbname_, new_log_number));
+ break;
+ }
+ delete log_;
+ delete logfile_;
+ logfile_ = lfile;
+ log_ = new log::Writer(lfile);
+ imm_ = mem_;
+ has_imm_.Release_Store(imm_);
+ mem_ = new MemTable(internal_comparator_);
+ force = false; // Do not force another compaction if have room
+ MaybeScheduleCompaction();
+ }
+ }
+ return s;
+}
+
bool DBImpl::HasLargeValues(const WriteBatch& batch) const {
if (WriteBatchInternal::ByteSize(&batch) >= options_.large_value_threshold) {
for (WriteBatchInternal::Iterator it(batch); !it.Done(); it.Next()) {
@@ -1033,9 +1134,10 @@ Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq,
MaybeCompressLargeValue(
it.value(), &file_bytes, &scratch, &large_ref);
InternalKey ikey(it.key(), seq, kTypeLargeValueRef);
- if (versions_->RegisterLargeValueRef(large_ref, log_number_,ikey)) {
+ if (versions_->RegisterLargeValueRef(
+ large_ref, versions_->LogNumber(), ikey)) {
// TODO(opt): avoid holding the lock here (but be careful about
- // another thread doing a Write and changing log_number_ or
+ // another thread doing a Write and switching logs or
// having us get a different "assigned_seq" value).
uint64_t tmp_number = versions_->NewFileNumber();
@@ -1086,7 +1188,9 @@ Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq,
return Status::OK();
}
-bool DBImpl::GetProperty(const Slice& property, uint64_t* value) {
+bool DBImpl::GetProperty(const Slice& property, std::string* value) {
+ value->clear();
+
MutexLock l(&mutex_);
Slice in = property;
Slice prefix("leveldb.");
@@ -1100,10 +1204,37 @@ bool DBImpl::GetProperty(const Slice& property, uint64_t* value) {
if (!ok || level < 0 || level >= config::kNumLevels) {
return false;
} else {
- *value = versions_->NumLevelFiles(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", versions_->NumLevelFiles(level));
+ *value = buf;
return true;
}
+ } else if (in == "stats") {
+ char buf[200];
+ snprintf(buf, sizeof(buf),
+ " Compactions\n"
+ "Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
+ "--------------------------------------------------\n"
+ );
+ value->append(buf);
+ for (int level = 0; level < config::kNumLevels; level++) {
+ int files = versions_->NumLevelFiles(level);
+ if (stats_[level].micros > 0 || files > 0) {
+ snprintf(
+ buf, sizeof(buf),
+ "%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
+ level,
+ files,
+ versions_->NumLevelBytes(level) / 1048576.0,
+ stats_[level].micros / 1e6,
+ stats_[level].bytes_read / 1048576.0,
+ stats_[level].bytes_written / 1048576.0);
+ value->append(buf);
+ }
+ }
+ return true;
}
+
return false;
}
@@ -1158,14 +1289,15 @@ Status DB::Open(const Options& options, const std::string& dbname,
VersionEdit edit;
Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
if (s.ok()) {
- impl->log_number_ = impl->versions_->NewFileNumber();
+ uint64_t new_log_number = impl->versions_->NewFileNumber();
WritableFile* lfile;
- s = options.env->NewWritableFile(LogFileName(dbname, impl->log_number_),
+ s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
&lfile);
if (s.ok()) {
+ edit.SetLogNumber(new_log_number);
impl->logfile_ = lfile;
impl->log_ = new log::Writer(lfile);
- s = impl->Install(&edit, impl->log_number_, NULL);
+ s = impl->versions_->LogAndApply(&edit, NULL);
}
if (s.ok()) {
impl->DeleteObsoleteFiles();
diff --git a/db/db_impl.h b/db/db_impl.h
index 49ac37b..1f685f0 100644..100755
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -36,7 +36,7 @@ class DBImpl : public DB {
virtual Iterator* NewIterator(const ReadOptions&);
virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot);
- virtual bool GetProperty(const Slice& property, uint64_t* value);
+ virtual bool GetProperty(const Slice& property, std::string* value);
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
// Extra methods (for testing) that are not in the public DB interface
@@ -72,14 +72,6 @@ class DBImpl : public DB {
// be made to the descriptor are added to *edit.
Status Recover(VersionEdit* edit);
- // Apply the specified updates and save the resulting descriptor to
- // persistent storage. If cleanup_mem is non-NULL, arrange to
- // delete it when all existing snapshots have gone away iff Install()
- // returns OK.
- Status Install(VersionEdit* edit,
- uint64_t new_log_number,
- MemTable* cleanup_mem);
-
void MaybeIgnoreError(Status* s) const;
// Delete any unneeded files and stale in-memory entries.
@@ -99,6 +91,7 @@ class DBImpl : public DB {
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);
+ Status MakeRoomForWrite(bool force /* compact even if there is room? */);
bool HasLargeValues(const WriteBatch& batch) const;
// Process data in "*updates" and return a status. "assigned_seq"
@@ -141,6 +134,7 @@ class DBImpl : public DB {
const InternalKeyComparator internal_comparator_;
const Options options_; // options_.comparator == &internal_comparator_
bool owns_info_log_;
+ bool owns_cache_;
const std::string dbname_;
// table_cache_ provides its own synchronization
@@ -152,13 +146,13 @@ class DBImpl : public DB {
// State below is protected by mutex_
port::Mutex mutex_;
port::AtomicPointer shutting_down_;
- port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_
+ port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_
port::CondVar compacting_cv_; // Signalled when !compacting_
- SequenceNumber last_sequence_;
MemTable* mem_;
+ MemTable* imm_; // Memtable being compacted
+ port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_
WritableFile* logfile_;
log::Writer* log_;
- uint64_t log_number_;
SnapshotList snapshots_;
// Set of table files to protect from deletion because they are
@@ -176,6 +170,23 @@ class DBImpl : public DB {
// Have we encountered a background error in paranoid mode?
Status bg_error_;
+ // Per level compaction stats. stats_[level] stores the stats for
+ // compactions that produced data for the specified "level".
+ struct CompactionStats {
+ int64_t micros;
+ int64_t bytes_read;
+ int64_t bytes_written;
+
+ CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { }
+
+ void Add(const CompactionStats& c) {
+ this->micros += c.micros;
+ this->bytes_read += c.bytes_read;
+ this->bytes_written += c.bytes_written;
+ }
+ };
+ CompactionStats stats_[config::kNumLevels];
+
// No copying allowed
DBImpl(const DBImpl&);
void operator=(const DBImpl&);
diff --git a/db/db_iter.cc b/db/db_iter.cc
index 31c2a38..31c2a38 100644..100755
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
diff --git a/db/db_iter.h b/db/db_iter.h
index 195f3d3..195f3d3 100644..100755
--- a/db/db_iter.h
+++ b/db/db_iter.h
diff --git a/db/db_test.cc b/db/db_test.cc
index f68e759..04de331 100644..100755
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -72,19 +72,11 @@ class DBTest {
}
Status Put(const std::string& k, const std::string& v) {
- WriteOptions options;
- options.sync = false;
- WriteBatch batch;
- batch.Put(k, v);
- return db_->Write(options, &batch);
+ return db_->Put(WriteOptions(), k, v);
}
Status Delete(const std::string& k) {
- WriteOptions options;
- options.sync = false;
- WriteBatch batch;
- batch.Delete(k);
- return db_->Write(options, &batch);
+ return db_->Delete(WriteOptions(), k);
}
std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
@@ -147,11 +139,11 @@ class DBTest {
}
int NumTableFilesAtLevel(int level) {
- uint64_t val;
+ std::string property;
ASSERT_TRUE(
db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
- &val));
- return val;
+ &property));
+ return atoi(property.c_str());
}
uint64_t Size(const Slice& start, const Slice& limit) {
@@ -185,10 +177,7 @@ class DBTest {
dbfull()->TEST_CompactMemTable();
int max_level_with_files = 1;
for (int level = 1; level < config::kNumLevels; level++) {
- uint64_t v;
- char name[100];
- snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
- if (dbfull()->GetProperty(name, &v) && v > 0) {
+ if (NumTableFilesAtLevel(level) > 0) {
max_level_with_files = level;
}
}
@@ -459,7 +448,7 @@ TEST(DBTest, MinorCompactionsHappen) {
options.write_buffer_size = 10000;
Reopen(&options);
- const int N = 100;
+ const int N = 500;
int starting_num_tables = NumTableFilesAtLevel(0);
for (int i = 0; i < N; i++) {
@@ -1047,7 +1036,7 @@ class ModelDB: public DB {
return Status::OK();
}
- virtual bool GetProperty(const Slice& property, uint64_t* value) {
+ virtual bool GetProperty(const Slice& property, std::string* value) {
return false;
}
virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
diff --git a/db/dbformat.cc b/db/dbformat.cc
index 2664eb4..2664eb4 100644..100755
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
diff --git a/db/dbformat.h b/db/dbformat.h
index 6f34cd1..5f117f9 100644..100755
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -15,6 +15,12 @@
namespace leveldb {
+// Grouping of constants. We may want to make some of these
+// parameters set via options.
+namespace config {
+static const int kNumLevels = 7;
+}
+
class InternalKey;
// Value types encoded as the last component of internal keys.
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
index 702cbb4..702cbb4 100644..100755
--- a/db/dbformat_test.cc
+++ b/db/dbformat_test.cc
diff --git a/db/filename.cc b/db/filename.cc
index d21918c..d21918c 100644..100755
--- a/db/filename.cc
+++ b/db/filename.cc
diff --git a/db/filename.h b/db/filename.h
index 81ab2fc..81ab2fc 100644..100755
--- a/db/filename.h
+++ b/db/filename.h
diff --git a/db/filename_test.cc b/db/filename_test.cc
index 4d2a91e..4d2a91e 100644..100755
--- a/db/filename_test.cc
+++ b/db/filename_test.cc
diff --git a/db/log_format.h b/db/log_format.h
index 137cd4a..137cd4a 100644..100755
--- a/db/log_format.h
+++ b/db/log_format.h
diff --git a/db/log_reader.cc b/db/log_reader.cc
index 75e1d28..75e1d28 100644..100755
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
diff --git a/db/log_reader.h b/db/log_reader.h
index baf1475..baf1475 100644..100755
--- a/db/log_reader.h
+++ b/db/log_reader.h
diff --git a/db/log_test.cc b/db/log_test.cc
index 025a5ff..025a5ff 100644..100755
--- a/db/log_test.cc
+++ b/db/log_test.cc
diff --git a/db/log_writer.cc b/db/log_writer.cc
index 18ca37a..18ca37a 100644..100755
--- a/db/log_writer.cc
+++ b/db/log_writer.cc
diff --git a/db/log_writer.h b/db/log_writer.h
index d3cf27d..d3cf27d 100644..100755
--- a/db/log_writer.h
+++ b/db/log_writer.h
diff --git a/db/memtable.cc b/db/memtable.cc
index a3b618a..a3b618a 100644..100755
--- a/db/memtable.cc
+++ b/db/memtable.cc
diff --git a/db/memtable.h b/db/memtable.h
index 45b3342..45b3342 100644..100755
--- a/db/memtable.h
+++ b/db/memtable.h
diff --git a/db/repair.cc b/db/repair.cc
index 014e00e..014e00e 100644..100755
--- a/db/repair.cc
+++ b/db/repair.cc
diff --git a/db/skiplist.h b/db/skiplist.h
index be39354..be39354 100644..100755
--- a/db/skiplist.h
+++ b/db/skiplist.h
diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc
index 5f9ec0d..5f9ec0d 100644..100755
--- a/db/skiplist_test.cc
+++ b/db/skiplist_test.cc
diff --git a/db/snapshot.h b/db/snapshot.h
index 9a90756..9a90756 100644..100755
--- a/db/snapshot.h
+++ b/db/snapshot.h
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 325d707..325d707 100644..100755
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
diff --git a/db/table_cache.h b/db/table_cache.h
index 5376194..5376194 100644..100755
--- a/db/table_cache.h
+++ b/db/table_cache.h
diff --git a/db/version_edit.cc b/db/version_edit.cc
index 809dd82..689dbe0 100644..100755
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -20,15 +20,18 @@ enum Tag {
kDeletedFile = 6,
kNewFile = 7,
kLargeValueRef = 8,
+ kPrevLogNumber = 9,
};
void VersionEdit::Clear() {
comparator_.clear();
log_number_ = 0;
+ prev_log_number_ = 0;
last_sequence_ = 0;
next_file_number_ = 0;
has_comparator_ = false;
has_log_number_ = false;
+ has_prev_log_number_ = false;
has_next_file_number_ = false;
has_last_sequence_ = false;
deleted_files_.clear();
@@ -45,6 +48,10 @@ void VersionEdit::EncodeTo(std::string* dst) const {
PutVarint32(dst, kLogNumber);
PutVarint64(dst, log_number_);
}
+ if (has_prev_log_number_) {
+ PutVarint32(dst, kPrevLogNumber);
+ PutVarint64(dst, prev_log_number_);
+ }
if (has_next_file_number_) {
PutVarint32(dst, kNextFileNumber);
PutVarint64(dst, next_file_number_);
@@ -142,6 +149,14 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
}
break;
+ case kPrevLogNumber:
+ if (GetVarint64(&input, &prev_log_number_)) {
+ has_prev_log_number_ = true;
+ } else {
+ msg = "previous log number";
+ }
+ break;
+
case kNextFileNumber:
if (GetVarint64(&input, &next_file_number_)) {
has_next_file_number_ = true;
@@ -228,6 +243,10 @@ std::string VersionEdit::DebugString() const {
r.append("\n LogNumber: ");
AppendNumberTo(&r, log_number_);
}
+ if (has_prev_log_number_) {
+ r.append("\n PrevLogNumber: ");
+ AppendNumberTo(&r, prev_log_number_);
+ }
if (has_next_file_number_) {
r.append("\n NextFile: ");
AppendNumberTo(&r, next_file_number_);
diff --git a/db/version_edit.h b/db/version_edit.h
index 1b71283..7e417b5 100644..100755
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -39,6 +39,10 @@ class VersionEdit {
has_log_number_ = true;
log_number_ = num;
}
+ void SetPrevLogNumber(uint64_t num) {
+ has_prev_log_number_ = true;
+ prev_log_number_ = num;
+ }
void SetNextFile(uint64_t num) {
has_next_file_number_ = true;
next_file_number_ = num;
@@ -95,10 +99,12 @@ class VersionEdit {
std::string comparator_;
uint64_t log_number_;
+ uint64_t prev_log_number_;
uint64_t next_file_number_;
SequenceNumber last_sequence_;
bool has_comparator_;
bool has_log_number_;
+ bool has_prev_log_number_;
bool has_next_file_number_;
bool has_last_sequence_;
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
index 6906ec3..6906ec3 100644..100755
--- a/db/version_edit_test.cc
+++ b/db/version_edit_test.cc
diff --git a/db/version_set.cc b/db/version_set.cc
index dc9b418..31f79bb 100644..100755
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -27,16 +27,14 @@ static const int kTargetFileSize = 2 * 1048576;
static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize;
static double MaxBytesForLevel(int level) {
- if (level == 0) {
- return 4 * 1048576.0;
- } else {
- double result = 10 * 1048576.0;
- while (level > 1) {
- result *= 10;
- level--;
- }
- return result;
+ // Note: the result for level zero is not really used since we set
+ // the level-0 compaction threshold based on number of files.
+ double result = 10 * 1048576.0; // Result for both level-0 and level-1
+ while (level > 1) {
+ result *= 10;
+ level--;
}
+ return result;
}
static uint64_t MaxFileSizeForLevel(int level) {
@@ -327,6 +325,9 @@ VersionSet::VersionSet(const std::string& dbname,
icmp_(*cmp),
next_file_number_(2),
manifest_file_number_(0), // Filled by Recover()
+ last_sequence_(0),
+ log_number_(0),
+ prev_log_number_(0),
descriptor_file_(NULL),
descriptor_log_(NULL),
current_(new Version(this)),
@@ -345,7 +346,19 @@ VersionSet::~VersionSet() {
}
Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
+ if (edit->has_log_number_) {
+ assert(edit->log_number_ >= log_number_);
+ assert(edit->log_number_ < next_file_number_);
+ } else {
+ edit->SetLogNumber(log_number_);
+ }
+
+ if (!edit->has_prev_log_number_) {
+ edit->SetPrevLogNumber(prev_log_number_);
+ }
+
edit->SetNextFile(next_file_number_);
+ edit->SetLastSequence(last_sequence_);
Version* v = new Version(this);
{
@@ -372,7 +385,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
}
}
- // Write new record to log file
+ // Write new record to MANIFEST log
if (s.ok()) {
std::string record;
edit->EncodeTo(&record);
@@ -396,6 +409,8 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
v->next_ = NULL;
current_->next_ = v;
current_ = v;
+ log_number_ = edit->log_number_;
+ prev_log_number_ = edit->prev_log_number_;
} else {
delete v;
if (!new_manifest_file.empty()) {
@@ -406,13 +421,11 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
env_->DeleteFile(new_manifest_file);
}
}
- //Log(env_, options_->info_log, "State\n%s", current_->DebugString().c_str());
return s;
}
-Status VersionSet::Recover(uint64_t* log_number,
- SequenceNumber* last_sequence) {
+Status VersionSet::Recover() {
struct LogReporter : public log::Reader::Reporter {
Status* status;
virtual void Corruption(size_t bytes, const Status& s) {
@@ -439,9 +452,13 @@ Status VersionSet::Recover(uint64_t* log_number,
}
bool have_log_number = false;
+ bool have_prev_log_number = false;
bool have_next_file = false;
bool have_last_sequence = false;
uint64_t next_file = 0;
+ uint64_t last_sequence = 0;
+ uint64_t log_number = 0;
+ uint64_t prev_log_number = 0;
Builder builder(this, current_);
{
@@ -467,17 +484,22 @@ Status VersionSet::Recover(uint64_t* log_number,
}
if (edit.has_log_number_) {
- *log_number = edit.log_number_;
+ log_number = edit.log_number_;
have_log_number = true;
}
+ if (edit.has_prev_log_number_) {
+ prev_log_number = edit.prev_log_number_;
+ have_prev_log_number = true;
+ }
+
if (edit.has_next_file_number_) {
next_file = edit.next_file_number_;
have_next_file = true;
}
if (edit.has_last_sequence_) {
- *last_sequence = edit.last_sequence_;
+ last_sequence = edit.last_sequence_;
have_last_sequence = true;
}
}
@@ -493,6 +515,10 @@ Status VersionSet::Recover(uint64_t* log_number,
} else if (!have_last_sequence) {
s = Status::Corruption("no last-sequence-number entry in descriptor");
}
+
+ if (!have_prev_log_number) {
+ prev_log_number = 0;
+ }
}
if (s.ok()) {
@@ -508,12 +534,23 @@ Status VersionSet::Recover(uint64_t* log_number,
current_ = v;
manifest_file_number_ = next_file;
next_file_number_ = next_file + 1;
+ last_sequence_ = last_sequence;
+ log_number_ = log_number;
+ prev_log_number_ = prev_log_number;
}
}
return s;
}
+static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+ int64_t sum = 0;
+ for (int i = 0; i < files.size(); i++) {
+ sum += files[i]->file_size;
+ }
+ return sum;
+}
+
Status VersionSet::Finalize(Version* v) {
// Precomputed best level for next compaction
int best_level = -1;
@@ -523,23 +560,24 @@ Status VersionSet::Finalize(Version* v) {
for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) {
s = SortLevel(v, level);
- // Compute the ratio of current size to size limit.
- uint64_t level_bytes = 0;
- for (int i = 0; i < v->files_[level].size(); i++) {
- level_bytes += v->files_[level][i]->file_size;
- }
- double score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
-
+ double score;
if (level == 0) {
- // Level-0 file sizes are going to be often much smaller than
- // MaxBytesForLevel(0) since we do not account for compression
- // when producing a level-0 file; and too many level-0 files
- // increase merging costs. So use a file-count limit for
- // level-0 in addition to the byte-count limit.
- double count_score = v->files_[level].size() / 4.0;
- if (count_score > score) {
- score = count_score;
- }
+ // We treat level-0 specially by bounding the number of files
+ // instead of number of bytes for two reasons:
+ //
+ // (1) With larger write-buffer sizes, it is nice not to do too
+ // many level-0 compactions.
+ //
+ // (2) The files in level-0 are merged on every read and
+ // therefore we wish to avoid too many files when the individual
+ // file size is small (perhaps because of a small write-buffer
+ // setting, or very high compression ratios, or lots of
+ // overwrites/deletions).
+ score = v->files_[level].size() / 4.0;
+ } else {
+ // Compute the ratio of current size to size limit.
+ const uint64_t level_bytes = TotalFileSize(v->files_[level]);
+ score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
}
if (score > best_score) {
@@ -696,8 +734,7 @@ bool VersionSet::RegisterLargeValueRef(const LargeValueRef& large_ref,
return is_first;
}
-void VersionSet::CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
- uint64_t log_file_num) {
+void VersionSet::CleanupLargeValueRefs(const std::set<uint64_t>& live_tables) {
for (LargeValueMap::iterator it = large_value_refs_.begin();
it != large_value_refs_.end();
) {
@@ -705,7 +742,8 @@ void VersionSet::CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
for (LargeReferencesSet::iterator ref_it = refs->begin();
ref_it != refs->end();
) {
- if (ref_it->first != log_file_num && // Not in log file
+ if (ref_it->first != log_number_ && // Not in log file
+ ref_it->first != prev_log_number_ && // Not in prev log
live_tables.count(ref_it->first) == 0) { // Not in a live table
// No longer live: erase
LargeReferencesSet::iterator to_erase = ref_it;
@@ -762,12 +800,10 @@ void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
}
}
-static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
- int64_t sum = 0;
- for (int i = 0; i < files.size(); i++) {
- sum += files[i]->file_size;
- }
- return sum;
+int64_t VersionSet::NumLevelBytes(int level) const {
+ assert(level >= 0);
+ assert(level < config::kNumLevels);
+ return TotalFileSize(current_->files_[level]);
}
int64_t VersionSet::MaxNextLevelOverlappingBytes() {
diff --git a/db/version_set.h b/db/version_set.h
index a4199be..e1c5a4b 100644..100755
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -24,12 +24,6 @@
namespace leveldb {
-// Grouping of constants. We may want to make some of these
-// parameters set via options.
-namespace config {
-static const int kNumLevels = 7;
-}
-
namespace log { class Writer; }
class Compaction;
@@ -107,7 +101,7 @@ class VersionSet {
Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem);
// Recover the last saved descriptor from persistent storage.
- Status Recover(uint64_t* log_number, SequenceNumber* last_sequence);
+ Status Recover();
// Save current contents to *log
Status WriteSnapshot(log::Writer* log);
@@ -124,6 +118,25 @@ class VersionSet {
// Return the number of Table files at the specified level.
int NumLevelFiles(int level) const;
+ // Return the combined file size of all files at the specified level.
+ int64_t NumLevelBytes(int level) const;
+
+ // Return the last sequence number.
+ uint64_t LastSequence() const { return last_sequence_; }
+
+ // Set the last sequence number to s.
+ void SetLastSequence(uint64_t s) {
+ assert(s >= last_sequence_);
+ last_sequence_ = s;
+ }
+
+ // Return the current log file number.
+ uint64_t LogNumber() const { return log_number_; }
+
+ // Return the log file number for the log file that is currently
+ // being compacted, or zero if there is no such log file.
+ uint64_t PrevLogNumber() const { return prev_log_number_; }
+
// Pick level and inputs for a new compaction.
// Returns NULL if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
@@ -168,9 +181,8 @@ class VersionSet {
// Cleanup the large value reference state by eliminating any
// references from files that are not includes in either "live_tables"
- // or "log_file".
- void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
- uint64_t log_file_num);
+ // or the current log.
+ void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables);
// Returns true if a large value with the given reference is live.
bool LargeValueIsLive(const LargeValueRef& large_ref);
@@ -213,6 +225,9 @@ class VersionSet {
const InternalKeyComparator icmp_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
+ uint64_t last_sequence_;
+ uint64_t log_number_;
+ uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
// Opened lazily
WritableFile* descriptor_file_;
diff --git a/db/write_batch.cc b/db/write_batch.cc
index e84e548..e84e548 100644..100755
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
index ea28e2d..ea28e2d 100644..100755
--- a/db/write_batch_internal.h
+++ b/db/write_batch_internal.h
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
index deb8411..deb8411 100644..100755
--- a/db/write_batch_test.cc
+++ b/db/write_batch_test.cc
diff --git a/doc/doc.css b/doc/doc.css
index 700c564..700c564 100644..100755
--- a/doc/doc.css
+++ b/doc/doc.css
diff --git a/doc/impl.html b/doc/impl.html
index b190d2c..b190d2c 100644..100755
--- a/doc/impl.html
+++ b/doc/impl.html
diff --git a/doc/index.html b/doc/index.html
index e0baf2e..2a83fc3 100644..100755
--- a/doc/index.html
+++ b/doc/index.html
@@ -63,15 +63,12 @@ Example:
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
modify/query the database. For example, the following code
moves the value stored under key1 to key2.
-<p>
<pre>
std::string value;
leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
if (s.ok()) s = db-&gt;Put(leveldb::WriteOptions(), key2, value);
if (s.ok()) s = db-&gt;Delete(leveldb::WriteOptions(), key1);
</pre>
-See <a href="#async">important performance note</a> below for how to
-speed up writes significantly.
<h1>Atomic Updates</h1>
<p>
@@ -100,6 +97,47 @@ we do not end up erroneously dropping the value entirely.
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
speed up bulk updates by placing lots of individual mutations into the
same batch.
+
+<h1>Synchronous Writes</h1>
+By default, each write to <code>leveldb</code> is asynchronous: it
+returns after pushing the write from the process into the operating
+system. The transfer from operating system memory to the underlying
+persistent storage happens asynchronously. The <code>sync</code> flag
+can be turned on for a particular write to make the write operation
+not return until the data being written has been pushed all the way to
+persistent storage. (On Posix systems, this is implemented by calling
+either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
+<code>msync(..., MS_SYNC)</code> before the write operation returns.)
+<pre>
+ leveldb::WriteOptions write_options;
+ write_options.sync = true;
+ db-&gt;Put(write_options, ...);
+</pre>
+Asynchronous writes are often more than a thousand times as fast as
+synchronous writes. The downside of asynchronous writes is that a
+crash of the machine may cause the last few updates to be lost. Note
+that a crash of just the writing process (i.e., not a reboot) will not
+cause any loss since even when <code>sync</code> is false, an update
+is pushed from the process memory into the operating system before it
+is considered done.
+
+<p>
+Asynchronous writes can often be used safely. For example, when
+loading a large amount of data into the database you can handle lost
+updates by restarting the bulk load after a crash. A hybrid scheme is
+also possible where every Nth write is synchronous, and in the event
+of a crash, the bulk load is restarted just after the last synchronous
+write finished by the previous run. (The synchronous write can update
+a marker that describes where to restart on a crash.)
+
+<p>
+<code>WriteBatch</code> provides an alternative to asynchronous writes.
+Multiple updates may be placed in the same <code>WriteBatch</code> and
+applied together using a synchronous write (i.e.,
+<code>write_options.sync</code> is set to true). The extra cost of
+the synchronous write will be amortized across all of the writes in
+the batch.
+
<p>
<h1>Concurrency</h1>
<p>
@@ -290,47 +328,11 @@ Performance can be tuned by changing the default values of the
types defined in <code>leveldb/include/options.h</code>.
<p>
-<h2><a name="async">Asynchronous Writes</a></h2>
-
-By default, each write to <code>leveldb</code> is synchronous: it does
-not return until the write has been pushed from memory to persistent
-storage. (On Posix systems, this is implemented by calling either
-<code>fdatasync(...)</code> or <code>msync(..., MS_SYNC)</code>.)
-<strong>Synchronous writes may be very slow and the synchrony can be
-optionally disabled</strong>:
-<pre>
- leveldb::WriteOptions write_options;
- write_options.sync = false;
- db-&gt;Put(write_options, ...);
-</pre>
-Asynchronous writes are often more than a hundred times as fast as
-synchronous writes. The downside of asynchronous writes is that a
-crash of the machine may cause the last few updates to be lost. Note
-that a crash of just the writing process (i.e., not a reboot) will not
-cause any loss since even when <code>sync</code> is false, an update
-is pushed from the process memory into the operating system before it
-is considered done.
-
-<p>
-Asynchronous writes can be particularly beneficial when loading a
-large amount of data into the database since you can mitigate the
-problem of lost updates by restarting the bulk load. A hybrid scheme
-is also possible where every Nth write is synchronous, and in the
-event of a crash, the bulk load is restarted just after the last
-synchronous write finished by the previous run.
-
-<p>
-<code>WriteBatch</code> provides an alternative to asynchronous writes.
-Multiple updates may be placed in the same <code>WriteBatch</code> and
-applied together using a synchronous write. The extra cost of the
-synchronous write will be amortized across all of the writes in the batch.
-
-<p>
<h2>Block size</h2>
<p>
<code>leveldb</code> groups adjacent keys together into the same block and such a
block is the unit of transfer to and from persistent storage. The
-default block size is approximately 8192 uncompressed bytes.
+default block size is approximately 4096 uncompressed bytes.
Applications that mostly do bulk scans over the contents of the
database may wish to increase this size. Applications that do a lot
of point reads of small values may wish to switch to a smaller block
diff --git a/doc/log_format.txt b/doc/log_format.txt
index 3a0414b..3a0414b 100644..100755
--- a/doc/log_format.txt
+++ b/doc/log_format.txt
diff --git a/doc/table_format.txt b/doc/table_format.txt
index ad5aa4b..ad5aa4b 100644..100755
--- a/doc/table_format.txt
+++ b/doc/table_format.txt
diff --git a/include/leveldb/cache.h b/include/leveldb/cache.h
index 79196d1..79196d1 100644..100755
--- a/include/leveldb/cache.h
+++ b/include/leveldb/cache.h
diff --git a/include/leveldb/comparator.h b/include/leveldb/comparator.h
index 4e00e4d..4e00e4d 100644..100755
--- a/include/leveldb/comparator.h
+++ b/include/leveldb/comparator.h
diff --git a/include/leveldb/db.h b/include/leveldb/db.h
index 74d50d3..f18ded3 100644..100755
--- a/include/leveldb/db.h
+++ b/include/leveldb/db.h
@@ -13,7 +13,7 @@
namespace leveldb {
static const int kMajorVersion = 1;
-static const int kMinorVersion = 0;
+static const int kMinorVersion = 1;
struct Options;
struct ReadOptions;
@@ -49,7 +49,7 @@ class DB {
// Set the database entry for "key" to "value". Returns OK on success,
// and a non-OK status on error.
- // Note: consider setting options.sync = false.
+ // Note: consider setting options.sync = true.
virtual Status Put(const WriteOptions& options,
const Slice& key,
const Slice& value) = 0;
@@ -57,12 +57,12 @@ class DB {
// Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database.
- // Note: consider setting options.sync = false.
+ // Note: consider setting options.sync = true.
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
// Apply the specified updates to the database.
// Returns OK on success, non-OK on failure.
- // Note: consider setting options.sync = false.
+ // Note: consider setting options.sync = true.
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
// If the database contains an entry for "key" store the
@@ -103,7 +103,9 @@ class DB {
//
// "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
// where <N> is an ASCII representation of a level number (e.g. "0").
- virtual bool GetProperty(const Slice& property, uint64_t* value) = 0;
+ // "leveldb.stats" - returns a multi-line string that describes statistics
+ // about the internal operation of the DB.
+ virtual bool GetProperty(const Slice& property, std::string* value) = 0;
// For each i in [0,n-1], store in "sizes[i]", the approximate
// file system space used by keys in "[range[i].start .. range[i].limit)".
diff --git a/include/leveldb/env.h b/include/leveldb/env.h
index 4b6e712..4b6e712 100644..100755
--- a/include/leveldb/env.h
+++ b/include/leveldb/env.h
diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h
index 1866fb5..1866fb5 100644..100755
--- a/include/leveldb/iterator.h
+++ b/include/leveldb/iterator.h
diff --git a/include/leveldb/options.h b/include/leveldb/options.h
index 0b65624..87d388e 100644..100755
--- a/include/leveldb/options.h
+++ b/include/leveldb/options.h
@@ -69,15 +69,14 @@ struct Options {
// -------------------
// Parameters that affect performance
- // Amount of data to build up in memory before converting to an
- // on-disk file.
+ // Amount of data to build up in memory (backed by an unsorted log
+ // on disk) before converting to a sorted on-disk file.
//
- // Some DB operations may encounter a delay proportional to the size
- // of this parameter. Therefore we recommend against increasing
- // this parameter unless you are willing to live with an occasional
- // slow operation in exchange for faster bulk loading throughput.
+ // Larger values increase performance, especially during bulk loads.
+ // Up to two write buffers may be held in memory at the same time,
+ // so you may wish to adjust this parameter to control memory usage.
//
- // Default: 1MB
+ // Default: 4MB
size_t write_buffer_size;
// Number of open files that can be used by the DB. You may need to
@@ -100,7 +99,8 @@ struct Options {
// Control over blocks (user data is stored in a set of blocks, and
// a block is the unit of reading from disk).
- // Use the specified cache for blocks (if non-NULL).
+ // If non-NULL, use the specified cache for blocks.
+ // If NULL, leveldb will automatically create and use an 8MB internal cache.
// Default: NULL
Cache* block_cache;
@@ -109,7 +109,7 @@ struct Options {
// actual size of the unit read from disk may be smaller if
// compression is enabled. This parameter can be changed dynamically.
//
- // Default: 8K
+ // Default: 4K
int block_size;
// Number of keys between restart points for delta encoding of keys.
@@ -177,7 +177,12 @@ struct WriteOptions {
// crashes (i.e., the machine does not reboot), no writes will be
// lost even if sync==false.
//
- // Default: true
+ // In other words, a DB write with sync==false has similar
+ // crash semantics as the "write()" system call. A DB write
+ // with sync==true has similar crash semantics to a "write()"
+ // system call followed by "fsync()".
+ //
+ // Default: false
bool sync;
// If "post_write_snapshot" is non-NULL, and the write succeeds,
@@ -193,7 +198,7 @@ struct WriteOptions {
const Snapshot** post_write_snapshot;
WriteOptions()
- : sync(true),
+ : sync(false),
post_write_snapshot(NULL) {
}
};
diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h
index 62cb894..62cb894 100644..100755
--- a/include/leveldb/slice.h
+++ b/include/leveldb/slice.h
diff --git a/include/leveldb/status.h b/include/leveldb/status.h
index 47e3edf..47e3edf 100644..100755
--- a/include/leveldb/status.h
+++ b/include/leveldb/status.h
diff --git a/include/leveldb/table.h b/include/leveldb/table.h
index bd99176..bd99176 100644..100755
--- a/include/leveldb/table.h
+++ b/include/leveldb/table.h
diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h
index 49d2d51..49d2d51 100644..100755
--- a/include/leveldb/table_builder.h
+++ b/include/leveldb/table_builder.h
diff --git a/include/leveldb/write_batch.h b/include/leveldb/write_batch.h
index 3411952..3411952 100644..100755
--- a/include/leveldb/write_batch.h
+++ b/include/leveldb/write_batch.h
diff --git a/leveldb.gyp b/leveldb.gyp
index d10ac33..d10ac33 100644..100755
--- a/leveldb.gyp
+++ b/leveldb.gyp
diff --git a/port/README b/port/README
index 422563e..422563e 100644..100755
--- a/port/README
+++ b/port/README
diff --git a/port/port.h b/port/port.h
index 816826b..816826b 100644..100755
--- a/port/port.h
+++ b/port/port.h
diff --git a/port/port_android.cc b/port/port_android.cc
index 8a74111..240e9ca 100644..100755
--- a/port/port_android.cc
+++ b/port/port_android.cc
@@ -24,7 +24,6 @@ int fdatasync(int fd) {
}
}
-// TODO(gabor): This is copied from port_posix.cc - not sure if I should do this?
namespace leveldb {
namespace port {
diff --git a/port/port_android.h b/port/port_android.h
index ca0362d..8680951 100644..100755
--- a/port/port_android.h
+++ b/port/port_android.h
@@ -15,6 +15,20 @@
#include <string>
#include <cctype>
+// Collapse the plethora of ARM flavors available to an easier to manage set
+// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto
+#if defined(__ARM_ARCH_6__) || \
+ defined(__ARM_ARCH_6J__) || \
+ defined(__ARM_ARCH_6K__) || \
+ defined(__ARM_ARCH_6Z__) || \
+ defined(__ARM_ARCH_6T2__) || \
+ defined(__ARM_ARCH_6ZK__) || \
+ defined(__ARM_ARCH_7__) || \
+ defined(__ARM_ARCH_7R__) || \
+ defined(__ARM_ARCH_7A__)
+#define ARMV6_OR_7 1
+#endif
+
extern "C" {
size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d);
size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d);
@@ -61,28 +75,50 @@ class CondVar {
pthread_cond_t cv_;
};
+#ifndef ARMV6_OR_7
+// On ARM chipsets <V6, 0xffff0fa0 is the hard coded address of a
+// memory barrier function provided by the kernel.
+typedef void (*LinuxKernelMemoryBarrierFunc)(void);
+LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier ATTRIBUTE_WEAK =
+ (LinuxKernelMemoryBarrierFunc) 0xffff0fa0;
+#endif
+
// Storage for a lock-free pointer
class AtomicPointer {
private:
- std::atomic<void*> rep_;
+ void* rep_;
+
+ inline void MemoryBarrier() const {
+ // TODO(gabor): This only works on Android instruction sets >= V6
+#ifdef ARMV6_OR_7
+ __asm__ __volatile__("dmb" : : : "memory");
+#else
+ pLinuxKernelMemoryBarrier();
+#endif
+ }
+
public:
AtomicPointer() { }
explicit AtomicPointer(void* v) : rep_(v) { }
inline void* Acquire_Load() const {
- return rep_.load(std::memory_order_acquire);
+ void* r = rep_;
+ MemoryBarrier();
+ return r;
}
inline void Release_Store(void* v) {
- rep_.store(v, std::memory_order_release);
+ MemoryBarrier();
+ rep_ = v;
}
inline void* NoBarrier_Load() const {
- return rep_.load(std::memory_order_relaxed);
+ void* r = rep_;
+ return r;
}
inline void NoBarrier_Store(void* v) {
- rep_.store(v, std::memory_order_relaxed);
+ rep_ = v;
}
};
-// TODO(gabor): Implement actual compress
+// TODO(gabor): Implement compress
inline bool Snappy_Compress(
const char* input,
size_t input_length,
@@ -90,7 +126,7 @@ inline bool Snappy_Compress(
return false;
}
-// TODO(gabor): Implement actual uncompress
+// TODO(gabor): Implement uncompress
inline bool Snappy_Uncompress(
const char* input_data,
size_t input_length,
diff --git a/port/port_chromium.cc b/port/port_chromium.cc
index 2ab49b9..2ab49b9 100644..100755
--- a/port/port_chromium.cc
+++ b/port/port_chromium.cc
diff --git a/port/port_chromium.h b/port/port_chromium.h
index e349f8f..e349f8f 100644..100755
--- a/port/port_chromium.h
+++ b/port/port_chromium.h
diff --git a/port/port_example.h b/port/port_example.h
index cf72617..cf72617 100644..100755
--- a/port/port_example.h
+++ b/port/port_example.h
diff --git a/port/port_posix.cc b/port/port_posix.cc
index e75da8b..e75da8b 100644..100755
--- a/port/port_posix.cc
+++ b/port/port_posix.cc
diff --git a/port/port_posix.h b/port/port_posix.h
index 7adbc01..7adbc01 100644..100755
--- a/port/port_posix.h
+++ b/port/port_posix.h
diff --git a/port/sha1_portable.cc b/port/sha1_portable.cc
index 8fa7277..8fa7277 100644..100755
--- a/port/sha1_portable.cc
+++ b/port/sha1_portable.cc
diff --git a/port/sha1_portable.h b/port/sha1_portable.h
index 31db305..31db305 100644..100755
--- a/port/sha1_portable.h
+++ b/port/sha1_portable.h
diff --git a/port/sha1_test.cc b/port/sha1_test.cc
index b182e67..b182e67 100644..100755
--- a/port/sha1_test.cc
+++ b/port/sha1_test.cc
diff --git a/port/win/stdint.h b/port/win/stdint.h
index 39edd0d..39edd0d 100644..100755
--- a/port/win/stdint.h
+++ b/port/win/stdint.h
diff --git a/table/block.cc b/table/block.cc
index 0525d2d..0525d2d 100644..100755
--- a/table/block.cc
+++ b/table/block.cc
diff --git a/table/block.h b/table/block.h
index cdf0598..cdf0598 100644..100755
--- a/table/block.h
+++ b/table/block.h
diff --git a/table/block_builder.cc b/table/block_builder.cc
index ae18b36..ae18b36 100644..100755
--- a/table/block_builder.cc
+++ b/table/block_builder.cc
diff --git a/table/block_builder.h b/table/block_builder.h
index bf92a0f..bf92a0f 100644..100755
--- a/table/block_builder.h
+++ b/table/block_builder.h
diff --git a/table/format.cc b/table/format.cc
index 8c6b0f3..8c6b0f3 100644..100755
--- a/table/format.cc
+++ b/table/format.cc
diff --git a/table/format.h b/table/format.h
index a6ab964..a6ab964 100644..100755
--- a/table/format.h
+++ b/table/format.h
diff --git a/table/iterator.cc b/table/iterator.cc
index 4ddd55f..4ddd55f 100644..100755
--- a/table/iterator.cc
+++ b/table/iterator.cc
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
index 158d3a7..158d3a7 100644..100755
--- a/table/iterator_wrapper.h
+++ b/table/iterator_wrapper.h
diff --git a/table/merger.cc b/table/merger.cc
index 6ce06bb..6ce06bb 100644..100755
--- a/table/merger.cc
+++ b/table/merger.cc
diff --git a/table/merger.h b/table/merger.h
index 71d9dc5..71d9dc5 100644..100755
--- a/table/merger.h
+++ b/table/merger.h
diff --git a/table/table.cc b/table/table.cc
index 9820753..9820753 100644..100755
--- a/table/table.cc
+++ b/table/table.cc
diff --git a/table/table_builder.cc b/table/table_builder.cc
index 7ec7ad2..7ec7ad2 100644..100755
--- a/table/table_builder.cc
+++ b/table/table_builder.cc
diff --git a/table/table_test.cc b/table/table_test.cc
index e0c7134..4b3e85e 100644..100755
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -725,10 +725,10 @@ TEST(Harness, RandomizedLongDB) {
Test(&rnd);
// We must have created enough data to force merging
- uint64_t l0_files, l1_files;
+ std::string l0_files, l1_files;
ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level0", &l0_files));
ASSERT_TRUE(db()->GetProperty("leveldb.num-files-at-level1", &l1_files));
- ASSERT_GT(l0_files + l1_files, 0);
+ ASSERT_GT(atoi(l0_files.c_str()) + atoi(l1_files.c_str()), 0);
}
diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc
index 24a1241..24a1241 100644..100755
--- a/table/two_level_iterator.cc
+++ b/table/two_level_iterator.cc
diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h
index 5909e2b..5909e2b 100644..100755
--- a/table/two_level_iterator.h
+++ b/table/two_level_iterator.h
diff --git a/util/arena.cc b/util/arena.cc
index 4bf6e36..4bf6e36 100644..100755
--- a/util/arena.cc
+++ b/util/arena.cc
diff --git a/util/arena.h b/util/arena.h
index fcb5d5b..fcb5d5b 100644..100755
--- a/util/arena.h
+++ b/util/arena.h
diff --git a/util/arena_test.cc b/util/arena_test.cc
index c33b552..c33b552 100644..100755
--- a/util/arena_test.cc
+++ b/util/arena_test.cc
diff --git a/util/cache.cc b/util/cache.cc
index d8a4426..d8a4426 100644..100755
--- a/util/cache.cc
+++ b/util/cache.cc
diff --git a/util/cache_test.cc b/util/cache_test.cc
index dbab988..dbab988 100644..100755
--- a/util/cache_test.cc
+++ b/util/cache_test.cc
diff --git a/util/coding.cc b/util/coding.cc
index 680e2ad..680e2ad 100644..100755
--- a/util/coding.cc
+++ b/util/coding.cc
diff --git a/util/coding.h b/util/coding.h
index 8755968..8755968 100644..100755
--- a/util/coding.h
+++ b/util/coding.h
diff --git a/util/coding_test.cc b/util/coding_test.cc
index a8dba04..a8dba04 100644..100755
--- a/util/coding_test.cc
+++ b/util/coding_test.cc
diff --git a/util/comparator.cc b/util/comparator.cc
index e2b27e3..e2b27e3 100644..100755
--- a/util/comparator.cc
+++ b/util/comparator.cc
diff --git a/util/crc32c.cc b/util/crc32c.cc
index 28c2401..28c2401 100644..100755
--- a/util/crc32c.cc
+++ b/util/crc32c.cc
diff --git a/util/crc32c.h b/util/crc32c.h
index 938d8ff..938d8ff 100644..100755
--- a/util/crc32c.h
+++ b/util/crc32c.h
diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc
index ba9e804..ba9e804 100644..100755
--- a/util/crc32c_test.cc
+++ b/util/crc32c_test.cc
diff --git a/util/env.cc b/util/env.cc
index e5297e7..e5297e7 100644..100755
--- a/util/env.cc
+++ b/util/env.cc
diff --git a/util/env_chromium.cc b/util/env_chromium.cc
index 7edc7a9..7edc7a9 100644..100755
--- a/util/env_chromium.cc
+++ b/util/env_chromium.cc
diff --git a/util/env_posix.cc b/util/env_posix.cc
index 5cddb0c..5cddb0c 100644..100755
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
diff --git a/util/env_test.cc b/util/env_test.cc
index 3c253be..3c253be 100644..100755
--- a/util/env_test.cc
+++ b/util/env_test.cc
diff --git a/util/hash.cc b/util/hash.cc
index d19afd1..d19afd1 100644..100755
--- a/util/hash.cc
+++ b/util/hash.cc
diff --git a/util/hash.h b/util/hash.h
index 8889d56..8889d56 100644..100755
--- a/util/hash.h
+++ b/util/hash.h
diff --git a/util/histogram.cc b/util/histogram.cc
index c5178ef..c5178ef 100644..100755
--- a/util/histogram.cc
+++ b/util/histogram.cc
diff --git a/util/histogram.h b/util/histogram.h
index f72f122..f72f122 100644..100755
--- a/util/histogram.h
+++ b/util/histogram.h
diff --git a/util/logging.cc b/util/logging.cc
index 5c9bd4a..5c9bd4a 100644..100755
--- a/util/logging.cc
+++ b/util/logging.cc
diff --git a/util/logging.h b/util/logging.h
index 1cd0a4b..1cd0a4b 100644..100755
--- a/util/logging.h
+++ b/util/logging.h
diff --git a/util/mutexlock.h b/util/mutexlock.h
index 05fe279..05fe279 100644..100755
--- a/util/mutexlock.h
+++ b/util/mutexlock.h
diff --git a/util/options.cc b/util/options.cc
index 421608b..29272fe 100644..100755
--- a/util/options.cc
+++ b/util/options.cc
@@ -16,11 +16,11 @@ Options::Options()
paranoid_checks(false),
env(Env::Default()),
info_log(NULL),
- write_buffer_size(1<<20),
+ write_buffer_size(4<<20),
max_open_files(1000),
large_value_threshold(65536),
block_cache(NULL),
- block_size(8192),
+ block_size(4096),
block_restart_interval(16),
compression(kSnappyCompression) {
}
diff --git a/util/random.h b/util/random.h
index 2d458e8..2d458e8 100644..100755
--- a/util/random.h
+++ b/util/random.h
diff --git a/util/status.cc b/util/status.cc
index d9b7195..d9b7195 100644..100755
--- a/util/status.cc
+++ b/util/status.cc
diff --git a/util/testharness.cc b/util/testharness.cc
index b686ac3..b686ac3 100644..100755
--- a/util/testharness.cc
+++ b/util/testharness.cc
diff --git a/util/testharness.h b/util/testharness.h
index 13ab914..13ab914 100644..100755
--- a/util/testharness.h
+++ b/util/testharness.h
diff --git a/util/testutil.cc b/util/testutil.cc
index 8d6cf3c..8d6cf3c 100644..100755
--- a/util/testutil.cc
+++ b/util/testutil.cc
diff --git a/util/testutil.h b/util/testutil.h
index a150c1a..a150c1a 100644..100755
--- a/util/testutil.h
+++ b/util/testutil.h