From 7e3b080f8517731db774d5d2587b9ded4f9716e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= Date: Tue, 28 Apr 2015 12:18:01 +0300 Subject: godeps: update leveldb and snappy, dump serpent-go --- .../github.com/syndtr/goleveldb/leveldb/batch.go | 228 ++--- .../syndtr/goleveldb/leveldb/batch_test.go | 26 +- .../syndtr/goleveldb/leveldb/bench2_test.go | 58 ++ .../syndtr/goleveldb/leveldb/bench_test.go | 15 +- .../syndtr/goleveldb/leveldb/cache/bench2_test.go | 30 + .../syndtr/goleveldb/leveldb/cache/cache.go | 713 +++++++++++++-- .../syndtr/goleveldb/leveldb/cache/cache_test.go | 564 +++++++++--- .../syndtr/goleveldb/leveldb/cache/empty_cache.go | 246 ------ .../syndtr/goleveldb/leveldb/cache/lru.go | 195 +++++ .../syndtr/goleveldb/leveldb/cache/lru_cache.go | 354 -------- .../github.com/syndtr/goleveldb/leveldb/config.go | 40 - .../syndtr/goleveldb/leveldb/corrupt_test.go | 76 +- .../src/github.com/syndtr/goleveldb/leveldb/db.go | 574 +++++++----- .../syndtr/goleveldb/leveldb/db_compaction.go | 767 +++++++++------- .../github.com/syndtr/goleveldb/leveldb/db_iter.go | 108 ++- .../syndtr/goleveldb/leveldb/db_snapshot.go | 148 ++-- .../syndtr/goleveldb/leveldb/db_state.go | 207 +++-- .../github.com/syndtr/goleveldb/leveldb/db_test.go | 959 +++++++++++++++++++-- .../github.com/syndtr/goleveldb/leveldb/db_util.go | 51 +- .../syndtr/goleveldb/leveldb/db_write.go | 180 ++-- .../src/github.com/syndtr/goleveldb/leveldb/doc.go | 10 + .../github.com/syndtr/goleveldb/leveldb/error.go | 38 - .../github.com/syndtr/goleveldb/leveldb/errors.go | 18 + .../syndtr/goleveldb/leveldb/errors/errors.go | 76 ++ .../syndtr/goleveldb/leveldb/external_test.go | 24 +- .../goleveldb/leveldb/iterator/array_iter.go | 30 +- .../goleveldb/leveldb/iterator/indexed_iter.go | 73 +- .../leveldb/iterator/indexed_iter_test.go | 2 +- .../syndtr/goleveldb/leveldb/iterator/iter.go | 27 +- .../goleveldb/leveldb/iterator/iter_suite_test.go | 8 +- .../goleveldb/leveldb/iterator/merged_iter.go | 29 +- .../syndtr/goleveldb/leveldb/journal/journal.go | 115 +-- .../goleveldb/leveldb/journal/journal_test.go | 490 +++++++++++ .../src/github.com/syndtr/goleveldb/leveldb/key.go | 133 +-- .../syndtr/goleveldb/leveldb/key_test.go | 94 +- .../syndtr/goleveldb/leveldb/leveldb_suite_test.go | 13 +- .../syndtr/goleveldb/leveldb/memdb/memdb.go | 32 +- .../goleveldb/leveldb/memdb/memdb_suite_test.go | 10 +- .../syndtr/goleveldb/leveldb/memdb/memdb_test.go | 2 +- .../syndtr/goleveldb/leveldb/opt/options.go | 409 ++++++++- .../github.com/syndtr/goleveldb/leveldb/options.go | 81 +- .../github.com/syndtr/goleveldb/leveldb/session.go | 306 ++++--- .../syndtr/goleveldb/leveldb/session_record.go | 225 ++--- .../goleveldb/leveldb/session_record_test.go | 18 +- .../syndtr/goleveldb/leveldb/session_util.go | 86 +- .../goleveldb/leveldb/storage/file_storage.go | 2 +- .../leveldb/storage/file_storage_solaris.go | 68 ++ .../goleveldb/leveldb/storage/file_storage_unix.go | 2 +- .../syndtr/goleveldb/leveldb/storage/storage.go | 34 +- .../syndtr/goleveldb/leveldb/storage_test.go | 158 +++- .../github.com/syndtr/goleveldb/leveldb/table.go | 317 ++++--- .../syndtr/goleveldb/leveldb/table/block_test.go | 30 +- .../syndtr/goleveldb/leveldb/table/reader.go | 701 ++++++++++----- .../syndtr/goleveldb/leveldb/table/table.go | 6 +- .../goleveldb/leveldb/table/table_suite_test.go | 8 +- .../syndtr/goleveldb/leveldb/table/table_test.go | 15 +- .../syndtr/goleveldb/leveldb/table/writer.go | 2 +- .../syndtr/goleveldb/leveldb/testutil/db.go | 8 +- .../syndtr/goleveldb/leveldb/testutil/ginkgo.go | 21 + .../syndtr/goleveldb/leveldb/testutil/kvtest.go | 141 ++- .../syndtr/goleveldb/leveldb/testutil/storage.go | 1 + .../syndtr/goleveldb/leveldb/testutil/util.go | 14 + .../syndtr/goleveldb/leveldb/testutil_test.go | 5 + .../github.com/syndtr/goleveldb/leveldb/util.go | 4 +- .../syndtr/goleveldb/leveldb/util/buffer_pool.go | 238 +++++ .../syndtr/goleveldb/leveldb/util/pool.go | 21 + .../syndtr/goleveldb/leveldb/util/pool_legacy.go | 33 + .../syndtr/goleveldb/leveldb/util/range.go | 16 + .../syndtr/goleveldb/leveldb/util/util.go | 32 +- .../github.com/syndtr/goleveldb/leveldb/version.go | 365 ++++---- .../github.com/syndtr/gosnappy/snappy/decode.go | 292 +++++++ .../github.com/syndtr/gosnappy/snappy/encode.go | 258 ++++++ .../github.com/syndtr/gosnappy/snappy/snappy.go | 68 ++ .../syndtr/gosnappy/snappy/snappy_test.go | 364 ++++++++ 74 files changed, 8005 insertions(+), 3107 deletions(-) create mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/bench2_test.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/bench2_test.go delete mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/empty_cache.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/lru.go delete mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/lru_cache.go delete mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/config.go delete mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/error.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors/errors.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/ginkgo.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/pool.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/pool_legacy.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/decode.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/encode.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy.go create mode 100644 Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy_test.go (limited to 'Godeps/_workspace/src/github.com/syndtr') diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch.go index 0d7911eca..ccf390c9c 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch.go @@ -8,65 +8,84 @@ package leveldb import ( "encoding/binary" - "errors" + "fmt" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/memdb" ) -var ( - errBatchTooShort = errors.New("leveldb: batch is too short") - errBatchBadRecord = errors.New("leveldb: bad record in batch") -) +type ErrBatchCorrupted struct { + Reason string +} + +func (e *ErrBatchCorrupted) Error() string { + return fmt.Sprintf("leveldb: batch corrupted: %s", e.Reason) +} + +func newErrBatchCorrupted(reason string) error { + return errors.NewErrCorrupted(nil, &ErrBatchCorrupted{reason}) +} -const kBatchHdrLen = 8 + 4 +const ( + batchHdrLen = 8 + 4 + batchGrowRec = 3000 +) -type batchReplay interface { - put(key, value []byte, seq uint64) - delete(key []byte, seq uint64) +type BatchReplay interface { + Put(key, value []byte) + Delete(key []byte) } // Batch is a write batch. type Batch struct { - buf []byte + data []byte rLen, bLen int seq uint64 sync bool } func (b *Batch) grow(n int) { - off := len(b.buf) + off := len(b.data) if off == 0 { - // include headers - off = kBatchHdrLen - n += off + off = batchHdrLen + if b.data != nil { + b.data = b.data[:off] + } } - if cap(b.buf)-off >= n { - return + if cap(b.data)-off < n { + if b.data == nil { + b.data = make([]byte, off, off+n) + } else { + odata := b.data + div := 1 + if b.rLen > batchGrowRec { + div = b.rLen / batchGrowRec + } + b.data = make([]byte, off, off+n+(off-batchHdrLen)/div) + copy(b.data, odata) + } } - buf := make([]byte, 2*cap(b.buf)+n) - copy(buf, b.buf) - b.buf = buf[:off] } -func (b *Batch) appendRec(t vType, key, value []byte) { +func (b *Batch) appendRec(kt kType, key, value []byte) { n := 1 + binary.MaxVarintLen32 + len(key) - if t == tVal { + if kt == ktVal { n += binary.MaxVarintLen32 + len(value) } b.grow(n) - off := len(b.buf) - buf := b.buf[:off+n] - buf[off] = byte(t) + off := len(b.data) + data := b.data[:off+n] + data[off] = byte(kt) off += 1 - off += binary.PutUvarint(buf[off:], uint64(len(key))) - copy(buf[off:], key) + off += binary.PutUvarint(data[off:], uint64(len(key))) + copy(data[off:], key) off += len(key) - if t == tVal { - off += binary.PutUvarint(buf[off:], uint64(len(value))) - copy(buf[off:], value) + if kt == ktVal { + off += binary.PutUvarint(data[off:], uint64(len(value))) + copy(data[off:], value) off += len(value) } - b.buf = buf[:off] + b.data = data[:off] b.rLen++ // Include 8-byte ikey header b.bLen += len(key) + len(value) + 8 @@ -75,18 +94,51 @@ func (b *Batch) appendRec(t vType, key, value []byte) { // Put appends 'put operation' of the given key/value pair to the batch. // It is safe to modify the contents of the argument after Put returns. func (b *Batch) Put(key, value []byte) { - b.appendRec(tVal, key, value) + b.appendRec(ktVal, key, value) } // Delete appends 'delete operation' of the given key to the batch. // It is safe to modify the contents of the argument after Delete returns. func (b *Batch) Delete(key []byte) { - b.appendRec(tDel, key, nil) + b.appendRec(ktDel, key, nil) +} + +// Dump dumps batch contents. The returned slice can be loaded into the +// batch using Load method. +// The returned slice is not its own copy, so the contents should not be +// modified. +func (b *Batch) Dump() []byte { + return b.encode() +} + +// Load loads given slice into the batch. Previous contents of the batch +// will be discarded. +// The given slice will not be copied and will be used as batch buffer, so +// it is not safe to modify the contents of the slice. +func (b *Batch) Load(data []byte) error { + return b.decode(0, data) +} + +// Replay replays batch contents. +func (b *Batch) Replay(r BatchReplay) error { + return b.decodeRec(func(i int, kt kType, key, value []byte) { + switch kt { + case ktVal: + r.Put(key, value) + case ktDel: + r.Delete(key) + } + }) +} + +// Len returns number of records in the batch. +func (b *Batch) Len() int { + return b.rLen } // Reset resets the batch. func (b *Batch) Reset() { - b.buf = nil + b.data = b.data[:0] b.seq = 0 b.rLen = 0 b.bLen = 0 @@ -97,24 +149,10 @@ func (b *Batch) init(sync bool) { b.sync = sync } -func (b *Batch) put(key, value []byte, seq uint64) { - if b.rLen == 0 { - b.seq = seq - } - b.Put(key, value) -} - -func (b *Batch) delete(key []byte, seq uint64) { - if b.rLen == 0 { - b.seq = seq - } - b.Delete(key) -} - func (b *Batch) append(p *Batch) { if p.rLen > 0 { - b.grow(len(p.buf) - kBatchHdrLen) - b.buf = append(b.buf, p.buf[kBatchHdrLen:]...) + b.grow(len(p.data) - batchHdrLen) + b.data = append(b.data, p.data[batchHdrLen:]...) b.rLen += p.rLen } if p.sync { @@ -122,95 +160,93 @@ func (b *Batch) append(p *Batch) { } } -func (b *Batch) len() int { - return b.rLen -} - +// size returns sums of key/value pair length plus 8-bytes ikey. func (b *Batch) size() int { return b.bLen } func (b *Batch) encode() []byte { b.grow(0) - binary.LittleEndian.PutUint64(b.buf, b.seq) - binary.LittleEndian.PutUint32(b.buf[8:], uint32(b.rLen)) + binary.LittleEndian.PutUint64(b.data, b.seq) + binary.LittleEndian.PutUint32(b.data[8:], uint32(b.rLen)) - return b.buf + return b.data } -func (b *Batch) decode(buf []byte) error { - if len(buf) < kBatchHdrLen { - return errBatchTooShort +func (b *Batch) decode(prevSeq uint64, data []byte) error { + if len(data) < batchHdrLen { + return newErrBatchCorrupted("too short") } - b.seq = binary.LittleEndian.Uint64(buf) - b.rLen = int(binary.LittleEndian.Uint32(buf[8:])) + b.seq = binary.LittleEndian.Uint64(data) + if b.seq < prevSeq { + return newErrBatchCorrupted("invalid sequence number") + } + b.rLen = int(binary.LittleEndian.Uint32(data[8:])) + if b.rLen < 0 { + return newErrBatchCorrupted("invalid records length") + } // No need to be precise at this point, it won't be used anyway - b.bLen = len(buf) - kBatchHdrLen - b.buf = buf + b.bLen = len(data) - batchHdrLen + b.data = data return nil } -func (b *Batch) decodeRec(f func(i int, t vType, key, value []byte)) error { - off := kBatchHdrLen +func (b *Batch) decodeRec(f func(i int, kt kType, key, value []byte)) (err error) { + off := batchHdrLen for i := 0; i < b.rLen; i++ { - if off >= len(b.buf) { - return errors.New("leveldb: invalid batch record length") + if off >= len(b.data) { + return newErrBatchCorrupted("invalid records length") } - t := vType(b.buf[off]) - if t > tVal { - return errors.New("leveldb: invalid batch record type in batch") + kt := kType(b.data[off]) + if kt > ktVal { + return newErrBatchCorrupted("bad record: invalid type") } off += 1 - x, n := binary.Uvarint(b.buf[off:]) + x, n := binary.Uvarint(b.data[off:]) off += n - if n <= 0 || off+int(x) > len(b.buf) { - return errBatchBadRecord + if n <= 0 || off+int(x) > len(b.data) { + return newErrBatchCorrupted("bad record: invalid key length") } - key := b.buf[off : off+int(x)] + key := b.data[off : off+int(x)] off += int(x) - var value []byte - if t == tVal { - x, n := binary.Uvarint(b.buf[off:]) + if kt == ktVal { + x, n := binary.Uvarint(b.data[off:]) off += n - if n <= 0 || off+int(x) > len(b.buf) { - return errBatchBadRecord + if n <= 0 || off+int(x) > len(b.data) { + return newErrBatchCorrupted("bad record: invalid value length") } - value = b.buf[off : off+int(x)] + value = b.data[off : off+int(x)] off += int(x) } - f(i, t, key, value) + f(i, kt, key, value) } return nil } -func (b *Batch) replay(to batchReplay) error { - return b.decodeRec(func(i int, t vType, key, value []byte) { - switch t { - case tVal: - to.put(key, value, b.seq+uint64(i)) - case tDel: - to.delete(key, b.seq+uint64(i)) - } - }) -} - func (b *Batch) memReplay(to *memdb.DB) error { - return b.decodeRec(func(i int, t vType, key, value []byte) { - ikey := newIKey(key, b.seq+uint64(i), t) + return b.decodeRec(func(i int, kt kType, key, value []byte) { + ikey := newIkey(key, b.seq+uint64(i), kt) to.Put(ikey, value) }) } +func (b *Batch) memDecodeAndReplay(prevSeq uint64, data []byte, to *memdb.DB) error { + if err := b.decode(prevSeq, data); err != nil { + return err + } + return b.memReplay(to) +} + func (b *Batch) revertMemReplay(to *memdb.DB) error { - return b.decodeRec(func(i int, t vType, key, value []byte) { - ikey := newIKey(key, b.seq+uint64(i), t) + return b.decodeRec(func(i int, kt kType, key, value []byte) { + ikey := newIkey(key, b.seq+uint64(i), kt) to.Delete(ikey) }) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch_test.go index 19b749b8f..7fc842f4f 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch_test.go @@ -15,7 +15,7 @@ import ( ) type tbRec struct { - t vType + kt kType key, value []byte } @@ -23,39 +23,39 @@ type testBatch struct { rec []*tbRec } -func (p *testBatch) put(key, value []byte, seq uint64) { - p.rec = append(p.rec, &tbRec{tVal, key, value}) +func (p *testBatch) Put(key, value []byte) { + p.rec = append(p.rec, &tbRec{ktVal, key, value}) } -func (p *testBatch) delete(key []byte, seq uint64) { - p.rec = append(p.rec, &tbRec{tDel, key, nil}) +func (p *testBatch) Delete(key []byte) { + p.rec = append(p.rec, &tbRec{ktDel, key, nil}) } func compareBatch(t *testing.T, b1, b2 *Batch) { if b1.seq != b2.seq { t.Errorf("invalid seq number want %d, got %d", b1.seq, b2.seq) } - if b1.len() != b2.len() { - t.Fatalf("invalid record length want %d, got %d", b1.len(), b2.len()) + if b1.Len() != b2.Len() { + t.Fatalf("invalid record length want %d, got %d", b1.Len(), b2.Len()) } p1, p2 := new(testBatch), new(testBatch) - err := b1.replay(p1) + err := b1.Replay(p1) if err != nil { t.Fatal("error when replaying batch 1: ", err) } - err = b2.replay(p2) + err = b2.Replay(p2) if err != nil { t.Fatal("error when replaying batch 2: ", err) } for i := range p1.rec { r1, r2 := p1.rec[i], p2.rec[i] - if r1.t != r2.t { - t.Errorf("invalid type on record '%d' want %d, got %d", i, r1.t, r2.t) + if r1.kt != r2.kt { + t.Errorf("invalid type on record '%d' want %d, got %d", i, r1.kt, r2.kt) } if !bytes.Equal(r1.key, r2.key) { t.Errorf("invalid key on record '%d' want %s, got %s", i, string(r1.key), string(r2.key)) } - if r1.t == tVal { + if r1.kt == ktVal { if !bytes.Equal(r1.value, r2.value) { t.Errorf("invalid value on record '%d' want %s, got %s", i, string(r1.value), string(r2.value)) } @@ -75,7 +75,7 @@ func TestBatch_EncodeDecode(t *testing.T) { b1.Delete([]byte("k")) buf := b1.encode() b2 := new(Batch) - err := b2.decode(buf) + err := b2.decode(0, buf) if err != nil { t.Error("error when decoding batch: ", err) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/bench2_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/bench2_test.go new file mode 100644 index 000000000..0dd60fd82 --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/bench2_test.go @@ -0,0 +1,58 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// +build !go1.2 + +package leveldb + +import ( + "sync/atomic" + "testing" +) + +func BenchmarkDBReadConcurrent(b *testing.B) { + p := openDBBench(b, false) + p.populate(b.N) + p.fill() + p.gc() + defer p.close() + + b.ResetTimer() + b.SetBytes(116) + + b.RunParallel(func(pb *testing.PB) { + iter := p.newIter() + defer iter.Release() + for pb.Next() && iter.Next() { + } + }) +} + +func BenchmarkDBReadConcurrent2(b *testing.B) { + p := openDBBench(b, false) + p.populate(b.N) + p.fill() + p.gc() + defer p.close() + + b.ResetTimer() + b.SetBytes(116) + + var dir uint32 + b.RunParallel(func(pb *testing.PB) { + iter := p.newIter() + defer iter.Release() + if atomic.AddUint32(&dir, 1)%2 == 0 { + for pb.Next() && iter.Next() { + } + } else { + if pb.Next() && iter.Last() { + for pb.Next() && iter.Prev() { + } + } + } + }) +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/bench_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/bench_test.go index ea6801a89..91b426709 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/bench_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/bench_test.go @@ -170,7 +170,7 @@ func (p *dbBench) writes(perBatch int) { b.SetBytes(116) } -func (p *dbBench) drop() { +func (p *dbBench) gc() { p.keys, p.values = nil, nil runtime.GC() } @@ -249,6 +249,9 @@ func (p *dbBench) newIter() iterator.Iterator { } func (p *dbBench) close() { + if bp, err := p.db.GetProperty("leveldb.blockpool"); err == nil { + p.b.Log("Block pool stats: ", bp) + } p.db.Close() p.stor.Close() os.RemoveAll(benchDB) @@ -331,7 +334,7 @@ func BenchmarkDBRead(b *testing.B) { p := openDBBench(b, false) p.populate(b.N) p.fill() - p.drop() + p.gc() iter := p.newIter() b.ResetTimer() @@ -362,7 +365,7 @@ func BenchmarkDBReadUncompressed(b *testing.B) { p := openDBBench(b, true) p.populate(b.N) p.fill() - p.drop() + p.gc() iter := p.newIter() b.ResetTimer() @@ -379,7 +382,7 @@ func BenchmarkDBReadTable(b *testing.B) { p.populate(b.N) p.fill() p.reopen() - p.drop() + p.gc() iter := p.newIter() b.ResetTimer() @@ -395,7 +398,7 @@ func BenchmarkDBReadReverse(b *testing.B) { p := openDBBench(b, false) p.populate(b.N) p.fill() - p.drop() + p.gc() iter := p.newIter() b.ResetTimer() @@ -413,7 +416,7 @@ func BenchmarkDBReadReverseTable(b *testing.B) { p.populate(b.N) p.fill() p.reopen() - p.drop() + p.gc() iter := p.newIter() b.ResetTimer() diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/bench2_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/bench2_test.go new file mode 100644 index 000000000..175e22203 --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/bench2_test.go @@ -0,0 +1,30 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// +build !go1.2 + +package cache + +import ( + "math/rand" + "testing" +) + +func BenchmarkLRUCache(b *testing.B) { + c := NewCache(NewLRU(10000)) + + b.SetParallelism(10) + b.RunParallel(func(pb *testing.PB) { + r := rand.New(rand.NewSource(time.Now().UnixNano())) + + for pb.Next() { + key := uint64(r.Intn(1000000)) + c.Get(0, key, func() (int, Value) { + return 1, key + }).Release() + } + }) +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache.go index 9b6a74977..c9670de5d 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache.go @@ -8,118 +8,669 @@ package cache import ( + "sync" "sync/atomic" + "unsafe" + + "github.com/syndtr/goleveldb/leveldb/util" ) -// SetFunc used by Namespace.Get method to create a cache object. SetFunc -// may return ok false, in that case the cache object will not be created. -type SetFunc func() (ok bool, value interface{}, charge int, fin SetFin) +// Cacher provides interface to implements a caching functionality. +// An implementation must be goroutine-safe. +type Cacher interface { + // Capacity returns cache capacity. + Capacity() int -// SetFin will be called when corresponding cache object are released. -type SetFin func() + // SetCapacity sets cache capacity. + SetCapacity(capacity int) -// DelFin will be called when corresponding cache object are released. -// DelFin will be called after SetFin. The exist is true if the corresponding -// cache object is actually exist in the cache tree. -type DelFin func(exist bool) + // Promote promotes the 'cache node'. + Promote(n *Node) -// PurgeFin will be called when corresponding cache object are released. -// PurgeFin will be called after SetFin. If PurgeFin present DelFin will -// not be executed but passed to the PurgeFin, it is up to the caller -// to call it or not. -type PurgeFin func(ns, key uint64, delfin DelFin) + // Ban evicts the 'cache node' and prevent subsequent 'promote'. + Ban(n *Node) -// Cache is a cache tree. -type Cache interface { - // SetCapacity sets cache capacity. - SetCapacity(capacity int) + // Evict evicts the 'cache node'. + Evict(n *Node) - // GetNamespace gets or creates a cache namespace for the given id. - GetNamespace(id uint64) Namespace + // EvictNS evicts 'cache node' with the given namespace. + EvictNS(ns uint64) - // Purge purges all cache namespaces, read Namespace.Purge method documentation. - Purge(fin PurgeFin) + // EvictAll evicts all 'cache node'. + EvictAll() + + // Close closes the 'cache tree' + Close() error +} + +// Value is a 'cacheable object'. It may implements util.Releaser, if +// so the the Release method will be called once object is released. +type Value interface{} + +type CacheGetter struct { + Cache *Cache + NS uint64 +} - // Zap zaps all cache namespaces, read Namespace.Zap method documentation. - Zap(closed bool) +func (g *CacheGetter) Get(key uint64, setFunc func() (size int, value Value)) *Handle { + return g.Cache.Get(g.NS, key, setFunc) } -// Namespace is a cache namespace. -type Namespace interface { - // Get gets cache object for the given key. The given SetFunc (if not nil) will - // be called if the given key does not exist. - // If the given key does not exist, SetFunc is nil or SetFunc return ok false, Get - // will return ok false. - Get(key uint64, setf SetFunc) (obj Object, ok bool) +// The hash tables implementation is based on: +// "Dynamic-Sized Nonblocking Hash Tables", by Yujie Liu, Kunlong Zhang, and Michael Spear. ACM Symposium on Principles of Distributed Computing, Jul 2014. - // Get deletes cache object for the given key. If exist the cache object will - // be deleted later when all of its handles have been released (i.e. no one use - // it anymore) and the given DelFin (if not nil) will finally be executed. If - // such cache object does not exist the given DelFin will be executed anyway. - // - // Delete returns true if such cache object exist. - Delete(key uint64, fin DelFin) bool +const ( + mInitialSize = 1 << 4 + mOverflowThreshold = 1 << 5 + mOverflowGrowThreshold = 1 << 7 +) - // Purge deletes all cache objects, read Delete method documentation. - Purge(fin PurgeFin) +type mBucket struct { + mu sync.Mutex + node []*Node + frozen bool +} - // Zap detaches the namespace from the cache tree and delete all its cache - // objects. The cache objects deletion and finalizers execution are happen - // immediately, even if its existing handles haven't yet been released. - // A zapped namespace can't never be filled again. - // If closed is false then the Get function will always call the given SetFunc - // if it is not nil, but resultant of the SetFunc will not be cached. - Zap(closed bool) +func (b *mBucket) freeze() []*Node { + b.mu.Lock() + defer b.mu.Unlock() + if !b.frozen { + b.frozen = true + } + return b.node } -// Object is a cache object. -type Object interface { - // Release releases the cache object. Other methods should not be called - // after the cache object has been released. - Release() +func (b *mBucket) get(r *Cache, h *mNode, hash uint32, ns, key uint64, noset bool) (done, added bool, n *Node) { + b.mu.Lock() + + if b.frozen { + b.mu.Unlock() + return + } + + // Scan the node. + for _, n := range b.node { + if n.hash == hash && n.ns == ns && n.key == key { + atomic.AddInt32(&n.ref, 1) + b.mu.Unlock() + return true, false, n + } + } + + // Get only. + if noset { + b.mu.Unlock() + return true, false, nil + } + + // Create node. + n = &Node{ + r: r, + hash: hash, + ns: ns, + key: key, + ref: 1, + } + // Add node to bucket. + b.node = append(b.node, n) + bLen := len(b.node) + b.mu.Unlock() + + // Update counter. + grow := atomic.AddInt32(&r.nodes, 1) >= h.growThreshold + if bLen > mOverflowThreshold { + grow = grow || atomic.AddInt32(&h.overflow, 1) >= mOverflowGrowThreshold + } - // Value returns value of the cache object. - Value() interface{} + // Grow. + if grow && atomic.CompareAndSwapInt32(&h.resizeInProgess, 0, 1) { + nhLen := len(h.buckets) << 1 + nh := &mNode{ + buckets: make([]unsafe.Pointer, nhLen), + mask: uint32(nhLen) - 1, + pred: unsafe.Pointer(h), + growThreshold: int32(nhLen * mOverflowThreshold), + shrinkThreshold: int32(nhLen >> 1), + } + ok := atomic.CompareAndSwapPointer(&r.mHead, unsafe.Pointer(h), unsafe.Pointer(nh)) + if !ok { + panic("BUG: failed swapping head") + } + go nh.initBuckets() + } + + return true, true, n } -// Namespace state. -type nsState int +func (b *mBucket) delete(r *Cache, h *mNode, hash uint32, ns, key uint64) (done, deleted bool) { + b.mu.Lock() -const ( - nsEffective nsState = iota - nsZapped - nsClosed -) + if b.frozen { + b.mu.Unlock() + return + } -// Node state. -type nodeState int + // Scan the node. + var ( + n *Node + bLen int + ) + for i := range b.node { + n = b.node[i] + if n.ns == ns && n.key == key { + if atomic.LoadInt32(&n.ref) == 0 { + deleted = true -const ( - nodeEffective nodeState = iota - nodeEvicted - nodeRemoved -) + // Call releaser. + if n.value != nil { + if r, ok := n.value.(util.Releaser); ok { + r.Release() + } + n.value = nil + } + + // Remove node from bucket. + b.node = append(b.node[:i], b.node[i+1:]...) + bLen = len(b.node) + } + break + } + } + b.mu.Unlock() -// Fake object. -type fakeObject struct { - value interface{} - fin func() - once uint32 + if deleted { + // Call OnDel. + for _, f := range n.onDel { + f() + } + + // Update counter. + atomic.AddInt32(&r.size, int32(n.size)*-1) + shrink := atomic.AddInt32(&r.nodes, -1) < h.shrinkThreshold + if bLen >= mOverflowThreshold { + atomic.AddInt32(&h.overflow, -1) + } + + // Shrink. + if shrink && len(h.buckets) > mInitialSize && atomic.CompareAndSwapInt32(&h.resizeInProgess, 0, 1) { + nhLen := len(h.buckets) >> 1 + nh := &mNode{ + buckets: make([]unsafe.Pointer, nhLen), + mask: uint32(nhLen) - 1, + pred: unsafe.Pointer(h), + growThreshold: int32(nhLen * mOverflowThreshold), + shrinkThreshold: int32(nhLen >> 1), + } + ok := atomic.CompareAndSwapPointer(&r.mHead, unsafe.Pointer(h), unsafe.Pointer(nh)) + if !ok { + panic("BUG: failed swapping head") + } + go nh.initBuckets() + } + } + + return true, deleted } -func (o *fakeObject) Value() interface{} { - if atomic.LoadUint32(&o.once) == 0 { - return o.value +type mNode struct { + buckets []unsafe.Pointer // []*mBucket + mask uint32 + pred unsafe.Pointer // *mNode + resizeInProgess int32 + + overflow int32 + growThreshold int32 + shrinkThreshold int32 +} + +func (n *mNode) initBucket(i uint32) *mBucket { + if b := (*mBucket)(atomic.LoadPointer(&n.buckets[i])); b != nil { + return b + } + + p := (*mNode)(atomic.LoadPointer(&n.pred)) + if p != nil { + var node []*Node + if n.mask > p.mask { + // Grow. + pb := (*mBucket)(atomic.LoadPointer(&p.buckets[i&p.mask])) + if pb == nil { + pb = p.initBucket(i & p.mask) + } + m := pb.freeze() + // Split nodes. + for _, x := range m { + if x.hash&n.mask == i { + node = append(node, x) + } + } + } else { + // Shrink. + pb0 := (*mBucket)(atomic.LoadPointer(&p.buckets[i])) + if pb0 == nil { + pb0 = p.initBucket(i) + } + pb1 := (*mBucket)(atomic.LoadPointer(&p.buckets[i+uint32(len(n.buckets))])) + if pb1 == nil { + pb1 = p.initBucket(i + uint32(len(n.buckets))) + } + m0 := pb0.freeze() + m1 := pb1.freeze() + // Merge nodes. + node = make([]*Node, 0, len(m0)+len(m1)) + node = append(node, m0...) + node = append(node, m1...) + } + b := &mBucket{node: node} + if atomic.CompareAndSwapPointer(&n.buckets[i], nil, unsafe.Pointer(b)) { + if len(node) > mOverflowThreshold { + atomic.AddInt32(&n.overflow, int32(len(node)-mOverflowThreshold)) + } + return b + } + } + + return (*mBucket)(atomic.LoadPointer(&n.buckets[i])) +} + +func (n *mNode) initBuckets() { + for i := range n.buckets { + n.initBucket(uint32(i)) + } + atomic.StorePointer(&n.pred, nil) +} + +// Cache is a 'cache map'. +type Cache struct { + mu sync.RWMutex + mHead unsafe.Pointer // *mNode + nodes int32 + size int32 + cacher Cacher + closed bool +} + +// NewCache creates a new 'cache map'. The cacher is optional and +// may be nil. +func NewCache(cacher Cacher) *Cache { + h := &mNode{ + buckets: make([]unsafe.Pointer, mInitialSize), + mask: mInitialSize - 1, + growThreshold: int32(mInitialSize * mOverflowThreshold), + shrinkThreshold: 0, + } + for i := range h.buckets { + h.buckets[i] = unsafe.Pointer(&mBucket{}) + } + r := &Cache{ + mHead: unsafe.Pointer(h), + cacher: cacher, + } + return r +} + +func (r *Cache) getBucket(hash uint32) (*mNode, *mBucket) { + h := (*mNode)(atomic.LoadPointer(&r.mHead)) + i := hash & h.mask + b := (*mBucket)(atomic.LoadPointer(&h.buckets[i])) + if b == nil { + b = h.initBucket(i) + } + return h, b +} + +func (r *Cache) delete(n *Node) bool { + for { + h, b := r.getBucket(n.hash) + done, deleted := b.delete(r, h, n.hash, n.ns, n.key) + if done { + return deleted + } + } + return false +} + +// Nodes returns number of 'cache node' in the map. +func (r *Cache) Nodes() int { + return int(atomic.LoadInt32(&r.nodes)) +} + +// Size returns sums of 'cache node' size in the map. +func (r *Cache) Size() int { + return int(atomic.LoadInt32(&r.size)) +} + +// Capacity returns cache capacity. +func (r *Cache) Capacity() int { + if r.cacher == nil { + return 0 + } + return r.cacher.Capacity() +} + +// SetCapacity sets cache capacity. +func (r *Cache) SetCapacity(capacity int) { + if r.cacher != nil { + r.cacher.SetCapacity(capacity) + } +} + +// Get gets 'cache node' with the given namespace and key. +// If cache node is not found and setFunc is not nil, Get will atomically creates +// the 'cache node' by calling setFunc. Otherwise Get will returns nil. +// +// The returned 'cache handle' should be released after use by calling Release +// method. +func (r *Cache) Get(ns, key uint64, setFunc func() (size int, value Value)) *Handle { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return nil + } + + hash := murmur32(ns, key, 0xf00) + for { + h, b := r.getBucket(hash) + done, _, n := b.get(r, h, hash, ns, key, setFunc == nil) + if done { + if n != nil { + n.mu.Lock() + if n.value == nil { + if setFunc == nil { + n.mu.Unlock() + n.unref() + return nil + } + + n.size, n.value = setFunc() + if n.value == nil { + n.size = 0 + n.mu.Unlock() + n.unref() + return nil + } + atomic.AddInt32(&r.size, int32(n.size)) + } + n.mu.Unlock() + if r.cacher != nil { + r.cacher.Promote(n) + } + return &Handle{unsafe.Pointer(n)} + } + + break + } } return nil } -func (o *fakeObject) Release() { - if !atomic.CompareAndSwapUint32(&o.once, 0, 1) { +// Delete removes and ban 'cache node' with the given namespace and key. +// A banned 'cache node' will never inserted into the 'cache tree'. Ban +// only attributed to the particular 'cache node', so when a 'cache node' +// is recreated it will not be banned. +// +// If onDel is not nil, then it will be executed if such 'cache node' +// doesn't exist or once the 'cache node' is released. +// +// Delete return true is such 'cache node' exist. +func (r *Cache) Delete(ns, key uint64, onDel func()) bool { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return false + } + + hash := murmur32(ns, key, 0xf00) + for { + h, b := r.getBucket(hash) + done, _, n := b.get(r, h, hash, ns, key, true) + if done { + if n != nil { + if onDel != nil { + n.mu.Lock() + n.onDel = append(n.onDel, onDel) + n.mu.Unlock() + } + if r.cacher != nil { + r.cacher.Ban(n) + } + n.unref() + return true + } + + break + } + } + + if onDel != nil { + onDel() + } + + return false +} + +// Evict evicts 'cache node' with the given namespace and key. This will +// simply call Cacher.Evict. +// +// Evict return true is such 'cache node' exist. +func (r *Cache) Evict(ns, key uint64) bool { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return false + } + + hash := murmur32(ns, key, 0xf00) + for { + h, b := r.getBucket(hash) + done, _, n := b.get(r, h, hash, ns, key, true) + if done { + if n != nil { + if r.cacher != nil { + r.cacher.Evict(n) + } + n.unref() + return true + } + + break + } + } + + return false +} + +// EvictNS evicts 'cache node' with the given namespace. This will +// simply call Cacher.EvictNS. +func (r *Cache) EvictNS(ns uint64) { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return + } + + if r.cacher != nil { + r.cacher.EvictNS(ns) + } +} + +// EvictAll evicts all 'cache node'. This will simply call Cacher.EvictAll. +func (r *Cache) EvictAll() { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { return } - if o.fin != nil { - o.fin() - o.fin = nil + + if r.cacher != nil { + r.cacher.EvictAll() + } +} + +// Close closes the 'cache map' and releases all 'cache node'. +func (r *Cache) Close() error { + r.mu.Lock() + if !r.closed { + r.closed = true + + if r.cacher != nil { + if err := r.cacher.Close(); err != nil { + return err + } + } + + h := (*mNode)(r.mHead) + h.initBuckets() + + for i := range h.buckets { + b := (*mBucket)(h.buckets[i]) + for _, n := range b.node { + // Call releaser. + if n.value != nil { + if r, ok := n.value.(util.Releaser); ok { + r.Release() + } + n.value = nil + } + + // Call OnDel. + for _, f := range n.onDel { + f() + } + } + } } + r.mu.Unlock() + return nil +} + +// Node is a 'cache node'. +type Node struct { + r *Cache + + hash uint32 + ns, key uint64 + + mu sync.Mutex + size int + value Value + + ref int32 + onDel []func() + + CacheData unsafe.Pointer +} + +// NS returns this 'cache node' namespace. +func (n *Node) NS() uint64 { + return n.ns +} + +// Key returns this 'cache node' key. +func (n *Node) Key() uint64 { + return n.key +} + +// Size returns this 'cache node' size. +func (n *Node) Size() int { + return n.size +} + +// Value returns this 'cache node' value. +func (n *Node) Value() Value { + return n.value +} + +// Ref returns this 'cache node' ref counter. +func (n *Node) Ref() int32 { + return atomic.LoadInt32(&n.ref) +} + +// GetHandle returns an handle for this 'cache node'. +func (n *Node) GetHandle() *Handle { + if atomic.AddInt32(&n.ref, 1) <= 1 { + panic("BUG: Node.GetHandle on zero ref") + } + return &Handle{unsafe.Pointer(n)} +} + +func (n *Node) unref() { + if atomic.AddInt32(&n.ref, -1) == 0 { + n.r.delete(n) + } +} + +func (n *Node) unrefLocked() { + if atomic.AddInt32(&n.ref, -1) == 0 { + n.r.mu.RLock() + if !n.r.closed { + n.r.delete(n) + } + n.r.mu.RUnlock() + } +} + +type Handle struct { + n unsafe.Pointer // *Node +} + +func (h *Handle) Value() Value { + n := (*Node)(atomic.LoadPointer(&h.n)) + if n != nil { + return n.value + } + return nil +} + +func (h *Handle) Release() { + nPtr := atomic.LoadPointer(&h.n) + if nPtr != nil && atomic.CompareAndSwapPointer(&h.n, nPtr, nil) { + n := (*Node)(nPtr) + n.unrefLocked() + } +} + +func murmur32(ns, key uint64, seed uint32) uint32 { + const ( + m = uint32(0x5bd1e995) + r = 24 + ) + + k1 := uint32(ns >> 32) + k2 := uint32(ns) + k3 := uint32(key >> 32) + k4 := uint32(key) + + k1 *= m + k1 ^= k1 >> r + k1 *= m + + k2 *= m + k2 ^= k2 >> r + k2 *= m + + k3 *= m + k3 ^= k3 >> r + k3 *= m + + k4 *= m + k4 ^= k4 >> r + k4 *= m + + h := seed + + h *= m + h ^= k1 + h *= m + h ^= k2 + h *= m + h ^= k3 + h *= m + h ^= k4 + + h ^= h >> 13 + h *= m + h ^= h >> 15 + + return h } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache_test.go index 07a9939b2..c2a50156f 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache_test.go @@ -8,17 +8,289 @@ package cache import ( "math/rand" + "runtime" + "sync" + "sync/atomic" "testing" + "time" + "unsafe" ) -func set(ns Namespace, key uint64, value interface{}, charge int, fin func()) Object { - obj, _ := ns.Get(key, func() (bool, interface{}, int, SetFin) { - return true, value, charge, fin +type int32o int32 + +func (o *int32o) acquire() { + if atomic.AddInt32((*int32)(o), 1) != 1 { + panic("BUG: invalid ref") + } +} + +func (o *int32o) Release() { + if atomic.AddInt32((*int32)(o), -1) != 0 { + panic("BUG: invalid ref") + } +} + +type releaserFunc struct { + fn func() + value Value +} + +func (r releaserFunc) Release() { + if r.fn != nil { + r.fn() + } +} + +func set(c *Cache, ns, key uint64, value Value, charge int, relf func()) *Handle { + return c.Get(ns, key, func() (int, Value) { + if relf != nil { + return charge, releaserFunc{relf, value} + } else { + return charge, value + } + }) +} + +func TestCacheMap(t *testing.T) { + runtime.GOMAXPROCS(runtime.NumCPU()) + + nsx := []struct { + nobjects, nhandles, concurrent, repeat int + }{ + {10000, 400, 50, 3}, + {100000, 1000, 100, 10}, + } + + var ( + objects [][]int32o + handles [][]unsafe.Pointer + ) + + for _, x := range nsx { + objects = append(objects, make([]int32o, x.nobjects)) + handles = append(handles, make([]unsafe.Pointer, x.nhandles)) + } + + c := NewCache(nil) + + wg := new(sync.WaitGroup) + var done int32 + + for ns, x := range nsx { + for i := 0; i < x.concurrent; i++ { + wg.Add(1) + go func(ns, i, repeat int, objects []int32o, handles []unsafe.Pointer) { + defer wg.Done() + r := rand.New(rand.NewSource(time.Now().UnixNano())) + + for j := len(objects) * repeat; j >= 0; j-- { + key := uint64(r.Intn(len(objects))) + h := c.Get(uint64(ns), key, func() (int, Value) { + o := &objects[key] + o.acquire() + return 1, o + }) + if v := h.Value().(*int32o); v != &objects[key] { + t.Fatalf("#%d invalid value: want=%p got=%p", ns, &objects[key], v) + } + if objects[key] != 1 { + t.Fatalf("#%d invalid object %d: %d", ns, key, objects[key]) + } + if !atomic.CompareAndSwapPointer(&handles[r.Intn(len(handles))], nil, unsafe.Pointer(h)) { + h.Release() + } + } + }(ns, i, x.repeat, objects[ns], handles[ns]) + } + + go func(handles []unsafe.Pointer) { + r := rand.New(rand.NewSource(time.Now().UnixNano())) + + for atomic.LoadInt32(&done) == 0 { + i := r.Intn(len(handles)) + h := (*Handle)(atomic.LoadPointer(&handles[i])) + if h != nil && atomic.CompareAndSwapPointer(&handles[i], unsafe.Pointer(h), nil) { + h.Release() + } + time.Sleep(time.Millisecond) + } + }(handles[ns]) + } + + go func() { + handles := make([]*Handle, 100000) + for atomic.LoadInt32(&done) == 0 { + for i := range handles { + handles[i] = c.Get(999999999, uint64(i), func() (int, Value) { + return 1, 1 + }) + } + for _, h := range handles { + h.Release() + } + } + }() + + wg.Wait() + + atomic.StoreInt32(&done, 1) + + for _, handles0 := range handles { + for i := range handles0 { + h := (*Handle)(atomic.LoadPointer(&handles0[i])) + if h != nil && atomic.CompareAndSwapPointer(&handles0[i], unsafe.Pointer(h), nil) { + h.Release() + } + } + } + + for ns, objects0 := range objects { + for i, o := range objects0 { + if o != 0 { + t.Fatalf("invalid object #%d.%d: ref=%d", ns, i, o) + } + } + } +} + +func TestCacheMap_NodesAndSize(t *testing.T) { + c := NewCache(nil) + if c.Nodes() != 0 { + t.Errorf("invalid nodes counter: want=%d got=%d", 0, c.Nodes()) + } + if c.Size() != 0 { + t.Errorf("invalid size counter: want=%d got=%d", 0, c.Size()) + } + set(c, 0, 1, 1, 1, nil) + set(c, 0, 2, 2, 2, nil) + set(c, 1, 1, 3, 3, nil) + set(c, 2, 1, 4, 1, nil) + if c.Nodes() != 4 { + t.Errorf("invalid nodes counter: want=%d got=%d", 4, c.Nodes()) + } + if c.Size() != 7 { + t.Errorf("invalid size counter: want=%d got=%d", 4, c.Size()) + } +} + +func TestLRUCache_Capacity(t *testing.T) { + c := NewCache(NewLRU(10)) + if c.Capacity() != 10 { + t.Errorf("invalid capacity: want=%d got=%d", 10, c.Capacity()) + } + set(c, 0, 1, 1, 1, nil).Release() + set(c, 0, 2, 2, 2, nil).Release() + set(c, 1, 1, 3, 3, nil).Release() + set(c, 2, 1, 4, 1, nil).Release() + set(c, 2, 2, 5, 1, nil).Release() + set(c, 2, 3, 6, 1, nil).Release() + set(c, 2, 4, 7, 1, nil).Release() + set(c, 2, 5, 8, 1, nil).Release() + if c.Nodes() != 7 { + t.Errorf("invalid nodes counter: want=%d got=%d", 7, c.Nodes()) + } + if c.Size() != 10 { + t.Errorf("invalid size counter: want=%d got=%d", 10, c.Size()) + } + c.SetCapacity(9) + if c.Capacity() != 9 { + t.Errorf("invalid capacity: want=%d got=%d", 9, c.Capacity()) + } + if c.Nodes() != 6 { + t.Errorf("invalid nodes counter: want=%d got=%d", 6, c.Nodes()) + } + if c.Size() != 8 { + t.Errorf("invalid size counter: want=%d got=%d", 8, c.Size()) + } +} + +func TestCacheMap_NilValue(t *testing.T) { + c := NewCache(NewLRU(10)) + h := c.Get(0, 0, func() (size int, value Value) { + return 1, nil }) - return obj + if h != nil { + t.Error("cache handle is non-nil") + } + if c.Nodes() != 0 { + t.Errorf("invalid nodes counter: want=%d got=%d", 0, c.Nodes()) + } + if c.Size() != 0 { + t.Errorf("invalid size counter: want=%d got=%d", 0, c.Size()) + } } -func TestCache_HitMiss(t *testing.T) { +func TestLRUCache_GetLatency(t *testing.T) { + runtime.GOMAXPROCS(runtime.NumCPU()) + + const ( + concurrentSet = 30 + concurrentGet = 3 + duration = 3 * time.Second + delay = 3 * time.Millisecond + maxkey = 100000 + ) + + var ( + set, getHit, getAll int32 + getMaxLatency, getDuration int64 + ) + + c := NewCache(NewLRU(5000)) + wg := &sync.WaitGroup{} + until := time.Now().Add(duration) + for i := 0; i < concurrentSet; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + r := rand.New(rand.NewSource(time.Now().UnixNano())) + for time.Now().Before(until) { + c.Get(0, uint64(r.Intn(maxkey)), func() (int, Value) { + time.Sleep(delay) + atomic.AddInt32(&set, 1) + return 1, 1 + }).Release() + } + }(i) + } + for i := 0; i < concurrentGet; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + r := rand.New(rand.NewSource(time.Now().UnixNano())) + for { + mark := time.Now() + if mark.Before(until) { + h := c.Get(0, uint64(r.Intn(maxkey)), nil) + latency := int64(time.Now().Sub(mark)) + m := atomic.LoadInt64(&getMaxLatency) + if latency > m { + atomic.CompareAndSwapInt64(&getMaxLatency, m, latency) + } + atomic.AddInt64(&getDuration, latency) + if h != nil { + atomic.AddInt32(&getHit, 1) + h.Release() + } + atomic.AddInt32(&getAll, 1) + } else { + break + } + } + }(i) + } + + wg.Wait() + getAvglatency := time.Duration(getDuration) / time.Duration(getAll) + t.Logf("set=%d getHit=%d getAll=%d getMaxLatency=%v getAvgLatency=%v", + set, getHit, getAll, time.Duration(getMaxLatency), getAvglatency) + + if getAvglatency > delay/3 { + t.Errorf("get avg latency > %v: got=%v", delay/3, getAvglatency) + } +} + +func TestLRUCache_HitMiss(t *testing.T) { cases := []struct { key uint64 value string @@ -36,36 +308,37 @@ func TestCache_HitMiss(t *testing.T) { } setfin := 0 - c := NewLRUCache(1000) - ns := c.GetNamespace(0) + c := NewCache(NewLRU(1000)) for i, x := range cases { - set(ns, x.key, x.value, len(x.value), func() { + set(c, 0, x.key, x.value, len(x.value), func() { setfin++ }).Release() for j, y := range cases { - r, ok := ns.Get(y.key, nil) + h := c.Get(0, y.key, nil) if j <= i { // should hit - if !ok { + if h == nil { t.Errorf("case '%d' iteration '%d' is miss", i, j) - } else if r.Value().(string) != y.value { - t.Errorf("case '%d' iteration '%d' has invalid value got '%s', want '%s'", i, j, r.Value().(string), y.value) + } else { + if x := h.Value().(releaserFunc).value.(string); x != y.value { + t.Errorf("case '%d' iteration '%d' has invalid value got '%s', want '%s'", i, j, x, y.value) + } } } else { // should miss - if ok { - t.Errorf("case '%d' iteration '%d' is hit , value '%s'", i, j, r.Value().(string)) + if h != nil { + t.Errorf("case '%d' iteration '%d' is hit , value '%s'", i, j, h.Value().(releaserFunc).value.(string)) } } - if ok { - r.Release() + if h != nil { + h.Release() } } } for i, x := range cases { finalizerOk := false - ns.Delete(x.key, func(exist bool) { + c.Delete(0, x.key, func() { finalizerOk = true }) @@ -74,22 +347,24 @@ func TestCache_HitMiss(t *testing.T) { } for j, y := range cases { - r, ok := ns.Get(y.key, nil) + h := c.Get(0, y.key, nil) if j > i { // should hit - if !ok { + if h == nil { t.Errorf("case '%d' iteration '%d' is miss", i, j) - } else if r.Value().(string) != y.value { - t.Errorf("case '%d' iteration '%d' has invalid value got '%s', want '%s'", i, j, r.Value().(string), y.value) + } else { + if x := h.Value().(releaserFunc).value.(string); x != y.value { + t.Errorf("case '%d' iteration '%d' has invalid value got '%s', want '%s'", i, j, x, y.value) + } } } else { // should miss - if ok { - t.Errorf("case '%d' iteration '%d' is hit, value '%s'", i, j, r.Value().(string)) + if h != nil { + t.Errorf("case '%d' iteration '%d' is hit, value '%s'", i, j, h.Value().(releaserFunc).value.(string)) } } - if ok { - r.Release() + if h != nil { + h.Release() } } } @@ -100,137 +375,180 @@ func TestCache_HitMiss(t *testing.T) { } func TestLRUCache_Eviction(t *testing.T) { - c := NewLRUCache(12) - ns := c.GetNamespace(0) - o1 := set(ns, 1, 1, 1, nil) - set(ns, 2, 2, 1, nil).Release() - set(ns, 3, 3, 1, nil).Release() - set(ns, 4, 4, 1, nil).Release() - set(ns, 5, 5, 1, nil).Release() - if r, ok := ns.Get(2, nil); ok { // 1,3,4,5,2 - r.Release() - } - set(ns, 9, 9, 10, nil).Release() // 5,2,9 - - for _, x := range []uint64{9, 2, 5, 1} { - r, ok := ns.Get(x, nil) - if !ok { - t.Errorf("miss for key '%d'", x) + c := NewCache(NewLRU(12)) + o1 := set(c, 0, 1, 1, 1, nil) + set(c, 0, 2, 2, 1, nil).Release() + set(c, 0, 3, 3, 1, nil).Release() + set(c, 0, 4, 4, 1, nil).Release() + set(c, 0, 5, 5, 1, nil).Release() + if h := c.Get(0, 2, nil); h != nil { // 1,3,4,5,2 + h.Release() + } + set(c, 0, 9, 9, 10, nil).Release() // 5,2,9 + + for _, key := range []uint64{9, 2, 5, 1} { + h := c.Get(0, key, nil) + if h == nil { + t.Errorf("miss for key '%d'", key) } else { - if r.Value().(int) != int(x) { - t.Errorf("invalid value for key '%d' want '%d', got '%d'", x, x, r.Value().(int)) + if x := h.Value().(int); x != int(key) { + t.Errorf("invalid value for key '%d' want '%d', got '%d'", key, key, x) } - r.Release() + h.Release() } } o1.Release() - for _, x := range []uint64{1, 2, 5} { - r, ok := ns.Get(x, nil) - if !ok { - t.Errorf("miss for key '%d'", x) + for _, key := range []uint64{1, 2, 5} { + h := c.Get(0, key, nil) + if h == nil { + t.Errorf("miss for key '%d'", key) } else { - if r.Value().(int) != int(x) { - t.Errorf("invalid value for key '%d' want '%d', got '%d'", x, x, r.Value().(int)) + if x := h.Value().(int); x != int(key) { + t.Errorf("invalid value for key '%d' want '%d', got '%d'", key, key, x) } - r.Release() + h.Release() } } - for _, x := range []uint64{3, 4, 9} { - r, ok := ns.Get(x, nil) - if ok { - t.Errorf("hit for key '%d'", x) - if r.Value().(int) != int(x) { - t.Errorf("invalid value for key '%d' want '%d', got '%d'", x, x, r.Value().(int)) + for _, key := range []uint64{3, 4, 9} { + h := c.Get(0, key, nil) + if h != nil { + t.Errorf("hit for key '%d'", key) + if x := h.Value().(int); x != int(key) { + t.Errorf("invalid value for key '%d' want '%d', got '%d'", key, key, x) } - r.Release() + h.Release() } } } -func TestLRUCache_SetGet(t *testing.T) { - c := NewLRUCache(13) - ns := c.GetNamespace(0) - for i := 0; i < 200; i++ { - n := uint64(rand.Intn(99999) % 20) - set(ns, n, n, 1, nil).Release() - if p, ok := ns.Get(n, nil); ok { - if p.Value() == nil { - t.Errorf("key '%d' contains nil value", n) +func TestLRUCache_Evict(t *testing.T) { + c := NewCache(NewLRU(6)) + set(c, 0, 1, 1, 1, nil).Release() + set(c, 0, 2, 2, 1, nil).Release() + set(c, 1, 1, 4, 1, nil).Release() + set(c, 1, 2, 5, 1, nil).Release() + set(c, 2, 1, 6, 1, nil).Release() + set(c, 2, 2, 7, 1, nil).Release() + + for ns := 0; ns < 3; ns++ { + for key := 1; key < 3; key++ { + if h := c.Get(uint64(ns), uint64(key), nil); h != nil { + h.Release() } else { - got := p.Value().(uint64) - if got != n { - t.Errorf("invalid value for key '%d' want '%d', got '%d'", n, n, got) - } + t.Errorf("Cache.Get on #%d.%d return nil", ns, key) } - p.Release() - } else { - t.Errorf("key '%d' doesn't exist", n) } } -} -func TestLRUCache_Purge(t *testing.T) { - c := NewLRUCache(3) - ns1 := c.GetNamespace(0) - o1 := set(ns1, 1, 1, 1, nil) - o2 := set(ns1, 2, 2, 1, nil) - ns1.Purge(nil) - set(ns1, 3, 3, 1, nil).Release() - for _, x := range []uint64{1, 2, 3} { - r, ok := ns1.Get(x, nil) - if !ok { - t.Errorf("miss for key '%d'", x) - } else { - if r.Value().(int) != int(x) { - t.Errorf("invalid value for key '%d' want '%d', got '%d'", x, x, r.Value().(int)) + if ok := c.Evict(0, 1); !ok { + t.Error("first Cache.Evict on #0.1 return false") + } + if ok := c.Evict(0, 1); ok { + t.Error("second Cache.Evict on #0.1 return true") + } + if h := c.Get(0, 1, nil); h != nil { + t.Errorf("Cache.Get on #0.1 return non-nil: %v", h.Value()) + } + + c.EvictNS(1) + if h := c.Get(1, 1, nil); h != nil { + t.Errorf("Cache.Get on #1.1 return non-nil: %v", h.Value()) + } + if h := c.Get(1, 2, nil); h != nil { + t.Errorf("Cache.Get on #1.2 return non-nil: %v", h.Value()) + } + + c.EvictAll() + for ns := 0; ns < 3; ns++ { + for key := 1; key < 3; key++ { + if h := c.Get(uint64(ns), uint64(key), nil); h != nil { + t.Errorf("Cache.Get on #%d.%d return non-nil: %v", ns, key, h.Value()) } - r.Release() } } - o1.Release() - o2.Release() - for _, x := range []uint64{1, 2} { - r, ok := ns1.Get(x, nil) - if ok { - t.Errorf("hit for key '%d'", x) - if r.Value().(int) != int(x) { - t.Errorf("invalid value for key '%d' want '%d', got '%d'", x, x, r.Value().(int)) - } - r.Release() +} + +func TestLRUCache_Delete(t *testing.T) { + delFuncCalled := 0 + delFunc := func() { + delFuncCalled++ + } + + c := NewCache(NewLRU(2)) + set(c, 0, 1, 1, 1, nil).Release() + set(c, 0, 2, 2, 1, nil).Release() + + if ok := c.Delete(0, 1, delFunc); !ok { + t.Error("Cache.Delete on #1 return false") + } + if h := c.Get(0, 1, nil); h != nil { + t.Errorf("Cache.Get on #1 return non-nil: %v", h.Value()) + } + if ok := c.Delete(0, 1, delFunc); ok { + t.Error("Cache.Delete on #1 return true") + } + + h2 := c.Get(0, 2, nil) + if h2 == nil { + t.Error("Cache.Get on #2 return nil") + } + if ok := c.Delete(0, 2, delFunc); !ok { + t.Error("(1) Cache.Delete on #2 return false") + } + if ok := c.Delete(0, 2, delFunc); !ok { + t.Error("(2) Cache.Delete on #2 return false") + } + + set(c, 0, 3, 3, 1, nil).Release() + set(c, 0, 4, 4, 1, nil).Release() + c.Get(0, 2, nil).Release() + + for key := 2; key <= 4; key++ { + if h := c.Get(0, uint64(key), nil); h != nil { + h.Release() + } else { + t.Errorf("Cache.Get on #%d return nil", key) } } -} -func BenchmarkLRUCache_SetRelease(b *testing.B) { - capacity := b.N / 100 - if capacity <= 0 { - capacity = 10 + h2.Release() + if h := c.Get(0, 2, nil); h != nil { + t.Errorf("Cache.Get on #2 return non-nil: %v", h.Value()) } - c := NewLRUCache(capacity) - ns := c.GetNamespace(0) - b.ResetTimer() - for i := uint64(0); i < uint64(b.N); i++ { - set(ns, i, nil, 1, nil).Release() + + if delFuncCalled != 4 { + t.Errorf("delFunc isn't called 4 times: got=%d", delFuncCalled) } } -func BenchmarkLRUCache_SetReleaseTwice(b *testing.B) { - capacity := b.N / 100 - if capacity <= 0 { - capacity = 10 +func TestLRUCache_Close(t *testing.T) { + relFuncCalled := 0 + relFunc := func() { + relFuncCalled++ + } + delFuncCalled := 0 + delFunc := func() { + delFuncCalled++ } - c := NewLRUCache(capacity) - ns := c.GetNamespace(0) - b.ResetTimer() - na := b.N / 2 - nb := b.N - na + c := NewCache(NewLRU(2)) + set(c, 0, 1, 1, 1, relFunc).Release() + set(c, 0, 2, 2, 1, relFunc).Release() - for i := uint64(0); i < uint64(na); i++ { - set(ns, i, nil, 1, nil).Release() + h3 := set(c, 0, 3, 3, 1, relFunc) + if h3 == nil { + t.Error("Cache.Get on #3 return nil") } + if ok := c.Delete(0, 3, delFunc); !ok { + t.Error("Cache.Delete on #3 return false") + } + + c.Close() - for i := uint64(0); i < uint64(nb); i++ { - set(ns, i, nil, 1, nil).Release() + if relFuncCalled != 3 { + t.Errorf("relFunc isn't called 3 times: got=%d", relFuncCalled) + } + if delFuncCalled != 1 { + t.Errorf("delFunc isn't called 1 times: got=%d", delFuncCalled) } } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/empty_cache.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/empty_cache.go deleted file mode 100644 index 1fbf81459..000000000 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/empty_cache.go +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright (c) 2013, Suryandaru Triandana -// All rights reserved. -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -package cache - -import ( - "sync" - "sync/atomic" -) - -type emptyCache struct { - sync.Mutex - table map[uint64]*emptyNS -} - -// NewEmptyCache creates a new initialized empty cache. -func NewEmptyCache() Cache { - return &emptyCache{ - table: make(map[uint64]*emptyNS), - } -} - -func (c *emptyCache) GetNamespace(id uint64) Namespace { - c.Lock() - defer c.Unlock() - - if ns, ok := c.table[id]; ok { - return ns - } - - ns := &emptyNS{ - cache: c, - id: id, - table: make(map[uint64]*emptyNode), - } - c.table[id] = ns - return ns -} - -func (c *emptyCache) Purge(fin PurgeFin) { - c.Lock() - for _, ns := range c.table { - ns.purgeNB(fin) - } - c.Unlock() -} - -func (c *emptyCache) Zap(closed bool) { - c.Lock() - for _, ns := range c.table { - ns.zapNB(closed) - } - c.table = make(map[uint64]*emptyNS) - c.Unlock() -} - -func (*emptyCache) SetCapacity(capacity int) {} - -type emptyNS struct { - cache *emptyCache - id uint64 - table map[uint64]*emptyNode - state nsState -} - -func (ns *emptyNS) Get(key uint64, setf SetFunc) (o Object, ok bool) { - ns.cache.Lock() - - switch ns.state { - case nsZapped: - ns.cache.Unlock() - if setf == nil { - return - } - - var value interface{} - var fin func() - ok, value, _, fin = setf() - if ok { - o = &fakeObject{ - value: value, - fin: fin, - } - } - return - case nsClosed: - ns.cache.Unlock() - return - } - - n, ok := ns.table[key] - if ok { - n.ref++ - } else { - if setf == nil { - ns.cache.Unlock() - return - } - - var value interface{} - var fin func() - ok, value, _, fin = setf() - if !ok { - ns.cache.Unlock() - return - } - - n = &emptyNode{ - ns: ns, - key: key, - value: value, - setfin: fin, - ref: 1, - } - ns.table[key] = n - } - - ns.cache.Unlock() - o = &emptyObject{node: n} - return -} - -func (ns *emptyNS) Delete(key uint64, fin DelFin) bool { - ns.cache.Lock() - - if ns.state != nsEffective { - ns.cache.Unlock() - if fin != nil { - fin(false) - } - return false - } - - n, ok := ns.table[key] - if !ok { - ns.cache.Unlock() - if fin != nil { - fin(false) - } - return false - } - n.delfin = fin - ns.cache.Unlock() - return true -} - -func (ns *emptyNS) purgeNB(fin PurgeFin) { - if ns.state != nsEffective { - return - } - for _, n := range ns.table { - n.purgefin = fin - } -} - -func (ns *emptyNS) Purge(fin PurgeFin) { - ns.cache.Lock() - ns.purgeNB(fin) - ns.cache.Unlock() -} - -func (ns *emptyNS) zapNB(closed bool) { - if ns.state != nsEffective { - return - } - for _, n := range ns.table { - n.execFin() - } - if closed { - ns.state = nsClosed - } else { - ns.state = nsZapped - } - ns.table = nil -} - -func (ns *emptyNS) Zap(closed bool) { - ns.cache.Lock() - ns.zapNB(closed) - delete(ns.cache.table, ns.id) - ns.cache.Unlock() -} - -type emptyNode struct { - ns *emptyNS - key uint64 - value interface{} - ref int - setfin SetFin - delfin DelFin - purgefin PurgeFin -} - -func (n *emptyNode) execFin() { - if n.setfin != nil { - n.setfin() - n.setfin = nil - } - if n.purgefin != nil { - n.purgefin(n.ns.id, n.key, n.delfin) - n.delfin = nil - n.purgefin = nil - } else if n.delfin != nil { - n.delfin(true) - n.delfin = nil - } -} - -func (n *emptyNode) evict() { - n.ns.cache.Lock() - n.ref-- - if n.ref == 0 { - if n.ns.state == nsEffective { - // Remove elem. - delete(n.ns.table, n.key) - // Execute finalizer. - n.execFin() - } - } else if n.ref < 0 { - panic("leveldb/cache: emptyNode: negative node reference") - } - n.ns.cache.Unlock() -} - -type emptyObject struct { - node *emptyNode - once uint32 -} - -func (o *emptyObject) Value() interface{} { - if atomic.LoadUint32(&o.once) == 0 { - return o.node.value - } - return nil -} - -func (o *emptyObject) Release() { - if !atomic.CompareAndSwapUint32(&o.once, 0, 1) { - return - } - o.node.evict() - o.node = nil -} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/lru.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/lru.go new file mode 100644 index 000000000..d9a84cde1 --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/lru.go @@ -0,0 +1,195 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package cache + +import ( + "sync" + "unsafe" +) + +type lruNode struct { + n *Node + h *Handle + ban bool + + next, prev *lruNode +} + +func (n *lruNode) insert(at *lruNode) { + x := at.next + at.next = n + n.prev = at + n.next = x + x.prev = n +} + +func (n *lruNode) remove() { + if n.prev != nil { + n.prev.next = n.next + n.next.prev = n.prev + n.prev = nil + n.next = nil + } else { + panic("BUG: removing removed node") + } +} + +type lru struct { + mu sync.Mutex + capacity int + used int + recent lruNode +} + +func (r *lru) reset() { + r.recent.next = &r.recent + r.recent.prev = &r.recent + r.used = 0 +} + +func (r *lru) Capacity() int { + r.mu.Lock() + defer r.mu.Unlock() + return r.capacity +} + +func (r *lru) SetCapacity(capacity int) { + var evicted []*lruNode + + r.mu.Lock() + r.capacity = capacity + for r.used > r.capacity { + rn := r.recent.prev + if rn == nil { + panic("BUG: invalid LRU used or capacity counter") + } + rn.remove() + rn.n.CacheData = nil + r.used -= rn.n.Size() + evicted = append(evicted, rn) + } + r.mu.Unlock() + + for _, rn := range evicted { + rn.h.Release() + } +} + +func (r *lru) Promote(n *Node) { + var evicted []*lruNode + + r.mu.Lock() + if n.CacheData == nil { + if n.Size() <= r.capacity { + rn := &lruNode{n: n, h: n.GetHandle()} + rn.insert(&r.recent) + n.CacheData = unsafe.Pointer(rn) + r.used += n.Size() + + for r.used > r.capacity { + rn := r.recent.prev + if rn == nil { + panic("BUG: invalid LRU used or capacity counter") + } + rn.remove() + rn.n.CacheData = nil + r.used -= rn.n.Size() + evicted = append(evicted, rn) + } + } + } else { + rn := (*lruNode)(n.CacheData) + if !rn.ban { + rn.remove() + rn.insert(&r.recent) + } + } + r.mu.Unlock() + + for _, rn := range evicted { + rn.h.Release() + } +} + +func (r *lru) Ban(n *Node) { + r.mu.Lock() + if n.CacheData == nil { + n.CacheData = unsafe.Pointer(&lruNode{n: n, ban: true}) + } else { + rn := (*lruNode)(n.CacheData) + if !rn.ban { + rn.remove() + rn.ban = true + r.used -= rn.n.Size() + r.mu.Unlock() + + rn.h.Release() + rn.h = nil + return + } + } + r.mu.Unlock() +} + +func (r *lru) Evict(n *Node) { + r.mu.Lock() + rn := (*lruNode)(n.CacheData) + if rn == nil || rn.ban { + r.mu.Unlock() + return + } + n.CacheData = nil + r.mu.Unlock() + + rn.h.Release() +} + +func (r *lru) EvictNS(ns uint64) { + var evicted []*lruNode + + r.mu.Lock() + for e := r.recent.prev; e != &r.recent; { + rn := e + e = e.prev + if rn.n.NS() == ns { + rn.remove() + rn.n.CacheData = nil + r.used -= rn.n.Size() + evicted = append(evicted, rn) + } + } + r.mu.Unlock() + + for _, rn := range evicted { + rn.h.Release() + } +} + +func (r *lru) EvictAll() { + r.mu.Lock() + back := r.recent.prev + for rn := back; rn != &r.recent; rn = rn.prev { + rn.n.CacheData = nil + } + r.reset() + r.mu.Unlock() + + for rn := back; rn != &r.recent; rn = rn.prev { + rn.h.Release() + } +} + +func (r *lru) Close() error { + return nil +} + +// NewLRU create a new LRU-cache. +func NewLRU(capacity int) Cacher { + r := &lru{capacity: capacity} + r.reset() + return r +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/lru_cache.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/lru_cache.go deleted file mode 100644 index 3c98e076b..000000000 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/lru_cache.go +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright (c) 2012, Suryandaru Triandana -// All rights reserved. -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -package cache - -import ( - "sync" - "sync/atomic" -) - -// lruCache represent a LRU cache state. -type lruCache struct { - sync.Mutex - - recent lruNode - table map[uint64]*lruNs - capacity int - size int -} - -// NewLRUCache creates a new initialized LRU cache with the given capacity. -func NewLRUCache(capacity int) Cache { - c := &lruCache{ - table: make(map[uint64]*lruNs), - capacity: capacity, - } - c.recent.rNext = &c.recent - c.recent.rPrev = &c.recent - return c -} - -// SetCapacity set cache capacity. -func (c *lruCache) SetCapacity(capacity int) { - c.Lock() - c.capacity = capacity - c.evict() - c.Unlock() -} - -// GetNamespace return namespace object for given id. -func (c *lruCache) GetNamespace(id uint64) Namespace { - c.Lock() - defer c.Unlock() - - if p, ok := c.table[id]; ok { - return p - } - - p := &lruNs{ - lru: c, - id: id, - table: make(map[uint64]*lruNode), - } - c.table[id] = p - return p -} - -// Purge purge entire cache. -func (c *lruCache) Purge(fin PurgeFin) { - c.Lock() - for _, ns := range c.table { - ns.purgeNB(fin) - } - c.Unlock() -} - -func (c *lruCache) Zap(closed bool) { - c.Lock() - for _, ns := range c.table { - ns.zapNB(closed) - } - c.table = make(map[uint64]*lruNs) - c.Unlock() -} - -func (c *lruCache) evict() { - top := &c.recent - for n := c.recent.rPrev; c.size > c.capacity && n != top; { - n.state = nodeEvicted - n.rRemove() - n.evictNB() - c.size -= n.charge - n = c.recent.rPrev - } -} - -type lruNs struct { - lru *lruCache - id uint64 - table map[uint64]*lruNode - state nsState -} - -func (ns *lruNs) Get(key uint64, setf SetFunc) (o Object, ok bool) { - lru := ns.lru - lru.Lock() - - switch ns.state { - case nsZapped: - lru.Unlock() - if setf == nil { - return - } - - var value interface{} - var fin func() - ok, value, _, fin = setf() - if ok { - o = &fakeObject{ - value: value, - fin: fin, - } - } - return - case nsClosed: - lru.Unlock() - return - } - - n, ok := ns.table[key] - if ok { - switch n.state { - case nodeEvicted: - // Insert to recent list. - n.state = nodeEffective - n.ref++ - lru.size += n.charge - lru.evict() - fallthrough - case nodeEffective: - // Bump to front - n.rRemove() - n.rInsert(&lru.recent) - } - n.ref++ - } else { - if setf == nil { - lru.Unlock() - return - } - - var value interface{} - var charge int - var fin func() - ok, value, charge, fin = setf() - if !ok { - lru.Unlock() - return - } - - n = &lruNode{ - ns: ns, - key: key, - value: value, - charge: charge, - setfin: fin, - ref: 2, - } - ns.table[key] = n - n.rInsert(&lru.recent) - - lru.size += charge - lru.evict() - } - - lru.Unlock() - o = &lruObject{node: n} - return -} - -func (ns *lruNs) Delete(key uint64, fin DelFin) bool { - lru := ns.lru - lru.Lock() - - if ns.state != nsEffective { - lru.Unlock() - if fin != nil { - fin(false) - } - return false - } - - n, ok := ns.table[key] - if !ok { - lru.Unlock() - if fin != nil { - fin(false) - } - return false - } - - n.delfin = fin - switch n.state { - case nodeRemoved: - lru.Unlock() - return false - case nodeEffective: - lru.size -= n.charge - n.rRemove() - n.evictNB() - } - n.state = nodeRemoved - - lru.Unlock() - return true -} - -func (ns *lruNs) purgeNB(fin PurgeFin) { - lru := ns.lru - if ns.state != nsEffective { - return - } - - for _, n := range ns.table { - n.purgefin = fin - if n.state == nodeEffective { - lru.size -= n.charge - n.rRemove() - n.evictNB() - } - n.state = nodeRemoved - } -} - -func (ns *lruNs) Purge(fin PurgeFin) { - ns.lru.Lock() - ns.purgeNB(fin) - ns.lru.Unlock() -} - -func (ns *lruNs) zapNB(closed bool) { - lru := ns.lru - if ns.state != nsEffective { - return - } - - if closed { - ns.state = nsClosed - } else { - ns.state = nsZapped - } - for _, n := range ns.table { - if n.state == nodeEffective { - lru.size -= n.charge - n.rRemove() - } - n.state = nodeRemoved - n.execFin() - } - ns.table = nil -} - -func (ns *lruNs) Zap(closed bool) { - ns.lru.Lock() - ns.zapNB(closed) - delete(ns.lru.table, ns.id) - ns.lru.Unlock() -} - -type lruNode struct { - ns *lruNs - - rNext, rPrev *lruNode - - key uint64 - value interface{} - charge int - ref int - state nodeState - setfin SetFin - delfin DelFin - purgefin PurgeFin -} - -func (n *lruNode) rInsert(at *lruNode) { - x := at.rNext - at.rNext = n - n.rPrev = at - n.rNext = x - x.rPrev = n -} - -func (n *lruNode) rRemove() bool { - // only remove if not already removed - if n.rPrev == nil { - return false - } - - n.rPrev.rNext = n.rNext - n.rNext.rPrev = n.rPrev - n.rPrev = nil - n.rNext = nil - - return true -} - -func (n *lruNode) execFin() { - if n.setfin != nil { - n.setfin() - n.setfin = nil - } - if n.purgefin != nil { - n.purgefin(n.ns.id, n.key, n.delfin) - n.delfin = nil - n.purgefin = nil - } else if n.delfin != nil { - n.delfin(true) - n.delfin = nil - } -} - -func (n *lruNode) evictNB() { - n.ref-- - if n.ref == 0 { - if n.ns.state == nsEffective { - // remove elem - delete(n.ns.table, n.key) - // execute finalizer - n.execFin() - } - } else if n.ref < 0 { - panic("leveldb/cache: lruCache: negative node reference") - } -} - -func (n *lruNode) evict() { - n.ns.lru.Lock() - n.evictNB() - n.ns.lru.Unlock() -} - -type lruObject struct { - node *lruNode - once uint32 -} - -func (o *lruObject) Value() interface{} { - if atomic.LoadUint32(&o.once) == 0 { - return o.node.value - } - return nil -} - -func (o *lruObject) Release() { - if !atomic.CompareAndSwapUint32(&o.once, 0, 1) { - return - } - - o.node.evict() - o.node = nil -} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/config.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/config.go deleted file mode 100644 index 511058897..000000000 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/config.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2012, Suryandaru Triandana -// All rights reserved. -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -package leveldb - -const ( - kNumLevels = 7 - - // Level-0 compaction is started when we hit this many files. - kL0_CompactionTrigger float64 = 4 - - // Soft limit on number of level-0 files. We slow down writes at this point. - kL0_SlowdownWritesTrigger = 8 - - // Maximum number of level-0 files. We stop writes at this point. - kL0_StopWritesTrigger = 12 - - // Maximum level to which a new compacted memdb is pushed if it - // does not create overlap. We try to push to level 2 to avoid the - // relatively expensive level 0=>1 compactions and to avoid some - // expensive manifest file operations. We do not push all the way to - // the largest level since that can generate a lot of wasted disk - // space if the same key space is being repeatedly overwritten. - kMaxMemCompactLevel = 2 - - // Maximum size of a table. - kMaxTableSize = 2 * 1048576 - - // Maximum bytes of overlaps in grandparent (i.e., level+2) before we - // stop building a single file in a level->level+1 compaction. - kMaxGrandParentOverlapBytes = 10 * kMaxTableSize - - // Maximum number of bytes in all compacted files. We avoid expanding - // the lower level file set of a compaction if it would make the - // total compaction cover more than this many bytes. - kExpCompactionMaxBytes = 25 * kMaxTableSize -) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/corrupt_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/corrupt_test.go index a036e0893..a351874ed 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/corrupt_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/corrupt_test.go @@ -9,13 +9,12 @@ package leveldb import ( "bytes" "fmt" + "github.com/syndtr/goleveldb/leveldb/filter" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" "io" "math/rand" "testing" - - "github.com/syndtr/goleveldb/leveldb/cache" - "github.com/syndtr/goleveldb/leveldb/opt" - "github.com/syndtr/goleveldb/leveldb/storage" ) const ctValSize = 1000 @@ -32,8 +31,8 @@ func newDbCorruptHarnessWopt(t *testing.T, o *opt.Options) *dbCorruptHarness { func newDbCorruptHarness(t *testing.T) *dbCorruptHarness { return newDbCorruptHarnessWopt(t, &opt.Options{ - BlockCache: cache.NewLRUCache(100), - Strict: opt.StrictJournalChecksum, + BlockCacheCapacity: 100, + Strict: opt.StrictJournalChecksum, }) } @@ -96,21 +95,22 @@ func (h *dbCorruptHarness) deleteRand(n, max int, rnd *rand.Rand) { } } -func (h *dbCorruptHarness) corrupt(ft storage.FileType, offset, n int) { +func (h *dbCorruptHarness) corrupt(ft storage.FileType, fi, offset, n int) { p := &h.dbHarness t := p.t - var file storage.File ff, _ := p.stor.GetFiles(ft) - for _, f := range ff { - if file == nil || f.Num() > file.Num() { - file = f - } + sff := files(ff) + sff.sort() + if fi < 0 { + fi = len(sff) - 1 } - if file == nil { - t.Fatalf("no such file with type %q", ft) + if fi >= len(sff) { + t.Fatalf("no such file with type %q with index %d", ft, fi) } + file := sff[fi] + r, err := file.Open() if err != nil { t.Fatal("cannot open file: ", err) @@ -225,8 +225,8 @@ func TestCorruptDB_Journal(t *testing.T) { h.build(100) h.check(100, 100) h.closeDB() - h.corrupt(storage.TypeJournal, 19, 1) - h.corrupt(storage.TypeJournal, 32*1024+1000, 1) + h.corrupt(storage.TypeJournal, -1, 19, 1) + h.corrupt(storage.TypeJournal, -1, 32*1024+1000, 1) h.openDB() h.check(36, 36) @@ -242,7 +242,7 @@ func TestCorruptDB_Table(t *testing.T) { h.compactRangeAt(0, "", "") h.compactRangeAt(1, "", "") h.closeDB() - h.corrupt(storage.TypeTable, 100, 1) + h.corrupt(storage.TypeTable, -1, 100, 1) h.openDB() h.check(99, 99) @@ -256,7 +256,7 @@ func TestCorruptDB_TableIndex(t *testing.T) { h.build(10000) h.compactMem() h.closeDB() - h.corrupt(storage.TypeTable, -2000, 500) + h.corrupt(storage.TypeTable, -1, -2000, 500) h.openDB() h.check(5000, 9999) @@ -267,9 +267,9 @@ func TestCorruptDB_TableIndex(t *testing.T) { func TestCorruptDB_MissingManifest(t *testing.T) { rnd := rand.New(rand.NewSource(0x0badda7a)) h := newDbCorruptHarnessWopt(t, &opt.Options{ - BlockCache: cache.NewLRUCache(100), - Strict: opt.StrictJournalChecksum, - WriteBuffer: 1000 * 60, + BlockCacheCapacity: 100, + Strict: opt.StrictJournalChecksum, + WriteBuffer: 1000 * 60, }) h.build(1000) @@ -355,7 +355,7 @@ func TestCorruptDB_CorruptedManifest(t *testing.T) { h.compactMem() h.compactRange("", "") h.closeDB() - h.corrupt(storage.TypeManifest, 0, 1000) + h.corrupt(storage.TypeManifest, -1, 0, 1000) h.openAssert(false) h.recover() @@ -370,7 +370,7 @@ func TestCorruptDB_CompactionInputError(t *testing.T) { h.build(10) h.compactMem() h.closeDB() - h.corrupt(storage.TypeTable, 100, 1) + h.corrupt(storage.TypeTable, -1, 100, 1) h.openDB() h.check(9, 9) @@ -387,7 +387,7 @@ func TestCorruptDB_UnrelatedKeys(t *testing.T) { h.build(10) h.compactMem() h.closeDB() - h.corrupt(storage.TypeTable, 100, 1) + h.corrupt(storage.TypeTable, -1, 100, 1) h.openDB() h.put(string(tkey(1000)), string(tval(1000, ctValSize))) @@ -470,3 +470,31 @@ func TestCorruptDB_MissingTableFiles(t *testing.T) { h.close() } + +func TestCorruptDB_RecoverTable(t *testing.T) { + h := newDbCorruptHarnessWopt(t, &opt.Options{ + WriteBuffer: 112 * opt.KiB, + CompactionTableSize: 90 * opt.KiB, + Filter: filter.NewBloomFilter(10), + }) + + h.build(1000) + h.compactMem() + h.compactRangeAt(0, "", "") + h.compactRangeAt(1, "", "") + seq := h.db.seq + h.closeDB() + h.corrupt(storage.TypeTable, 0, 1000, 1) + h.corrupt(storage.TypeTable, 3, 10000, 1) + // Corrupted filter shouldn't affect recovery. + h.corrupt(storage.TypeTable, 3, 113888, 10) + h.corrupt(storage.TypeTable, -1, 20000, 1) + + h.recover() + if h.db.seq != seq { + t.Errorf("invalid seq, want=%d got=%d", seq, h.db.seq) + } + h.check(985, 985) + + h.close() +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db.go index 8e975dc63..323353b2a 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db.go @@ -7,15 +7,17 @@ package leveldb import ( - "errors" + "container/list" "fmt" "io" "os" "runtime" "strings" "sync" + "sync/atomic" "time" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/journal" "github.com/syndtr/goleveldb/leveldb/memdb" @@ -30,41 +32,46 @@ type DB struct { // Need 64-bit alignment. seq uint64 + // Session. s *session - // MemDB + // MemDB. memMu sync.RWMutex - mem *memdb.DB - frozenMem *memdb.DB + memPool chan *memdb.DB + mem, frozenMem *memDB journal *journal.Writer journalWriter storage.Writer journalFile storage.File frozenJournalFile storage.File frozenSeq uint64 - // Snapshot + // Snapshot. snapsMu sync.Mutex - snapsRoot snapshotElement + snapsList *list.List - // Write + // Stats. + aliveSnaps, aliveIters int32 + + // Write. writeC chan *Batch writeMergedC chan bool writeLockC chan struct{} writeAckC chan error + writeDelay time.Duration + writeDelayN int journalC chan *Batch journalAckC chan error - // Compaction - tcompCmdC chan cCmd - tcompPauseC chan chan<- struct{} - tcompTriggerC chan struct{} - mcompCmdC chan cCmd - mcompTriggerC chan struct{} - compErrC chan error - compErrSetC chan error - compStats [kNumLevels]cStats - - // Close + // Compaction. + tcompCmdC chan cCmd + tcompPauseC chan chan<- struct{} + mcompCmdC chan cCmd + compErrC chan error + compPerErrC chan error + compErrSetC chan error + compStats []cStats + + // Close. closeW sync.WaitGroup closeC chan struct{} closed uint32 @@ -77,7 +84,11 @@ func openDB(s *session) (*DB, error) { db := &DB{ s: s, // Initial sequence - seq: s.stSeq, + seq: s.stSeqNum, + // MemDB + memPool: make(chan *memdb.DB, 1), + // Snapshot + snapsList: list.New(), // Write writeC: make(chan *Batch), writeMergedC: make(chan bool), @@ -86,17 +97,16 @@ func openDB(s *session) (*DB, error) { journalC: make(chan *Batch), journalAckC: make(chan error), // Compaction - tcompCmdC: make(chan cCmd), - tcompPauseC: make(chan chan<- struct{}), - tcompTriggerC: make(chan struct{}, 1), - mcompCmdC: make(chan cCmd), - mcompTriggerC: make(chan struct{}, 1), - compErrC: make(chan error), - compErrSetC: make(chan error), + tcompCmdC: make(chan cCmd), + tcompPauseC: make(chan chan<- struct{}), + mcompCmdC: make(chan cCmd), + compErrC: make(chan error), + compPerErrC: make(chan error), + compErrSetC: make(chan error), + compStats: make([]cStats, s.o.GetNumLevel()), // Close closeC: make(chan struct{}), } - db.initSnapshot() if err := db.recoverJournal(); err != nil { return nil, err @@ -112,8 +122,9 @@ func openDB(s *session) (*DB, error) { return nil, err } - // Don't include compaction error goroutine into wait group. + // Doesn't need to be included in the wait group. go db.compactionError() + go db.mpoolDrain() db.closeW.Add(3) go db.tCompaction() @@ -135,9 +146,10 @@ func openDB(s *session) (*DB, error) { // detected in the DB. Corrupted DB can be recovered with Recover // function. // +// The returned DB instance is goroutine-safe. // The DB must be closed after use, by calling Close method. -func Open(p storage.Storage, o *opt.Options) (db *DB, err error) { - s, err := newSession(p, o) +func Open(stor storage.Storage, o *opt.Options) (db *DB, err error) { + s, err := newSession(stor, o) if err != nil { return } @@ -177,6 +189,7 @@ func Open(p storage.Storage, o *opt.Options) (db *DB, err error) { // detected in the DB. Corrupted DB can be recovered with Recover // function. // +// The returned DB instance is goroutine-safe. // The DB must be closed after use, by calling Close method. func OpenFile(path string, o *opt.Options) (db *DB, err error) { stor, err := storage.OpenFile(path) @@ -197,9 +210,10 @@ func OpenFile(path string, o *opt.Options) (db *DB, err error) { // The DB must already exist or it will returns an error. // Also, Recover will ignore ErrorIfMissing and ErrorIfExist options. // +// The returned DB instance is goroutine-safe. // The DB must be closed after use, by calling Close method. -func Recover(p storage.Storage, o *opt.Options) (db *DB, err error) { - s, err := newSession(p, o) +func Recover(stor storage.Storage, o *opt.Options) (db *DB, err error) { + s, err := newSession(stor, o) if err != nil { return } @@ -225,6 +239,7 @@ func Recover(p storage.Storage, o *opt.Options) (db *DB, err error) { // RecoverFile uses standard file-system backed storage implementation as desribed // in the leveldb/storage package. // +// The returned DB instance is goroutine-safe. // The DB must be closed after use, by calling Close method. func RecoverFile(path string, o *opt.Options) (db *DB, err error) { stor, err := storage.OpenFile(path) @@ -241,16 +256,28 @@ func RecoverFile(path string, o *opt.Options) (db *DB, err error) { } func recoverTable(s *session, o *opt.Options) error { - ff0, err := s.getFiles(storage.TypeTable) + o = dupOptions(o) + // Mask StrictReader, lets StrictRecovery doing its job. + o.Strict &= ^opt.StrictReader + + // Get all tables and sort it by file number. + tableFiles_, err := s.getFiles(storage.TypeTable) if err != nil { return err } - ff1 := files(ff0) - ff1.sort() + tableFiles := files(tableFiles_) + tableFiles.sort() - var mSeq uint64 - var good, corrupted int - rec := new(sessionRecord) + var ( + maxSeq uint64 + recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int + + // We will drop corrupted table. + strict = o.GetStrict(opt.StrictRecovery) + + rec = &sessionRecord{numLevel: o.GetNumLevel()} + bpool = util.NewBufferPool(o.GetBlockSize() + 5) + ) buildTable := func(iter iterator.Iterator) (tmp storage.File, size int64, err error) { tmp = s.newTemp() writer, err := tmp.Create() @@ -264,8 +291,9 @@ func recoverTable(s *session, o *opt.Options) error { tmp = nil } }() + + // Copy entries. tw := table.NewWriter(writer, o) - // Copy records. for iter.Next() { key := iter.Key() if validIkey(key) { @@ -296,45 +324,73 @@ func recoverTable(s *session, o *opt.Options) error { if err != nil { return err } - defer reader.Close() + var closed bool + defer func() { + if !closed { + reader.Close() + } + }() + // Get file size. size, err := reader.Seek(0, 2) if err != nil { return err } - var tSeq uint64 - var tgood, tcorrupted, blockerr int - var min, max []byte - tr := table.NewReader(reader, size, nil, o) + + var ( + tSeq uint64 + tgoodKey, tcorruptedKey, tcorruptedBlock int + imin, imax []byte + ) + tr, err := table.NewReader(reader, size, storage.NewFileInfo(file), nil, bpool, o) + if err != nil { + return err + } iter := tr.NewIterator(nil, nil) - iter.(iterator.ErrorCallbackSetter).SetErrorCallback(func(err error) { - s.logf("table@recovery found error @%d %q", file.Num(), err) - blockerr++ - }) + if itererr, ok := iter.(iterator.ErrorCallbackSetter); ok { + itererr.SetErrorCallback(func(err error) { + if errors.IsCorrupted(err) { + s.logf("table@recovery block corruption @%d %q", file.Num(), err) + tcorruptedBlock++ + } + }) + } + // Scan the table. for iter.Next() { key := iter.Key() - _, seq, _, ok := parseIkey(key) - if !ok { - tcorrupted++ + _, seq, _, kerr := parseIkey(key) + if kerr != nil { + tcorruptedKey++ continue } - tgood++ + tgoodKey++ if seq > tSeq { tSeq = seq } - if min == nil { - min = append([]byte{}, key...) + if imin == nil { + imin = append([]byte{}, key...) } - max = append(max[:0], key...) + imax = append(imax[:0], key...) } if err := iter.Error(); err != nil { iter.Release() return err } iter.Release() - if tgood > 0 { - if tcorrupted > 0 || blockerr > 0 { + + goodKey += tgoodKey + corruptedKey += tcorruptedKey + corruptedBlock += tcorruptedBlock + + if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) { + droppedTable++ + s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq) + return nil + } + + if tgoodKey > 0 { + if tcorruptedKey > 0 || tcorruptedBlock > 0 { // Rebuild the table. s.logf("table@recovery rebuilding @%d", file.Num()) iter := tr.NewIterator(nil, nil) @@ -343,62 +399,77 @@ func recoverTable(s *session, o *opt.Options) error { if err != nil { return err } + closed = true reader.Close() if err := file.Replace(tmp); err != nil { return err } size = newSize } - if tSeq > mSeq { - mSeq = tSeq + if tSeq > maxSeq { + maxSeq = tSeq } + recoveredKey += tgoodKey // Add table to level 0. - rec.addTable(0, file.Num(), uint64(size), min, max) - s.logf("table@recovery recovered @%d N·%d C·%d B·%d S·%d Q·%d", file.Num(), tgood, tcorrupted, blockerr, size, tSeq) + rec.addTable(0, file.Num(), uint64(size), imin, imax) + s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq) } else { - s.logf("table@recovery unrecoverable @%d C·%d B·%d S·%d", file.Num(), tcorrupted, blockerr, size) + droppedTable++ + s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", file.Num(), tcorruptedKey, tcorruptedBlock, size) } - good += tgood - corrupted += tcorrupted - return nil } + // Recover all tables. - if len(ff1) > 0 { - s.logf("table@recovery F·%d", len(ff1)) - s.markFileNum(ff1[len(ff1)-1].Num()) - for _, file := range ff1 { + if len(tableFiles) > 0 { + s.logf("table@recovery F·%d", len(tableFiles)) + + // Mark file number as used. + s.markFileNum(tableFiles[len(tableFiles)-1].Num()) + + for _, file := range tableFiles { if err := recoverTable(file); err != nil { return err } } - s.logf("table@recovery recovered F·%d N·%d C·%d Q·%d", len(ff1), good, corrupted, mSeq) + + s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(tableFiles), recoveredKey, goodKey, corruptedKey, maxSeq) } + // Set sequence number. - rec.setSeq(mSeq + 1) + rec.setSeqNum(maxSeq) + // Create new manifest. if err := s.create(); err != nil { return err } + // Commit. return s.commit(rec) } -func (d *DB) recoverJournal() error { - s := d.s - - ff0, err := s.getFiles(storage.TypeJournal) +func (db *DB) recoverJournal() error { + // Get all tables and sort it by file number. + journalFiles_, err := db.s.getFiles(storage.TypeJournal) if err != nil { return err } - ff1 := files(ff0) - ff1.sort() - ff2 := make([]storage.File, 0, len(ff1)) - for _, file := range ff1 { - if file.Num() >= s.stJournalNum || file.Num() == s.stPrevJournalNum { - s.markFileNum(file.Num()) - ff2 = append(ff2, file) + journalFiles := files(journalFiles_) + journalFiles.sort() + + // Discard older journal. + prev := -1 + for i, file := range journalFiles { + if file.Num() >= db.s.stJournalNum { + if prev >= 0 { + i-- + journalFiles[i] = journalFiles[prev] + } + journalFiles = journalFiles[i:] + break + } else if file.Num() == db.s.stPrevJournalNum { + prev = i } } @@ -406,38 +477,43 @@ func (d *DB) recoverJournal() error { var of storage.File var mem *memdb.DB batch := new(Batch) - cm := newCMem(s) + cm := newCMem(db.s) buf := new(util.Buffer) // Options. - strict := s.o.GetStrict(opt.StrictJournal) - checksum := s.o.GetStrict(opt.StrictJournalChecksum) - writeBuffer := s.o.GetWriteBuffer() + strict := db.s.o.GetStrict(opt.StrictJournal) + checksum := db.s.o.GetStrict(opt.StrictJournalChecksum) + writeBuffer := db.s.o.GetWriteBuffer() recoverJournal := func(file storage.File) error { - s.logf("journal@recovery recovering @%d", file.Num()) + db.logf("journal@recovery recovering @%d", file.Num()) reader, err := file.Open() if err != nil { return err } defer reader.Close() + + // Create/reset journal reader instance. if jr == nil { - jr = journal.NewReader(reader, dropper{s, file}, strict, checksum) + jr = journal.NewReader(reader, dropper{db.s, file}, strict, checksum) } else { - jr.Reset(reader, dropper{s, file}, strict, checksum) + jr.Reset(reader, dropper{db.s, file}, strict, checksum) } + + // Flush memdb and remove obsolete journal file. if of != nil { if mem.Len() > 0 { if err := cm.flush(mem, 0); err != nil { return err } } - if err := cm.commit(file.Num(), d.seq); err != nil { + if err := cm.commit(file.Num(), db.seq); err != nil { return err } cm.reset() of.Remove() of = nil } - // Reset memdb. + + // Replay journal to memdb. mem.Reset() for { r, err := jr.Next() @@ -445,43 +521,58 @@ func (d *DB) recoverJournal() error { if err == io.EOF { break } - return err + return errors.SetFile(err, file) } + buf.Reset() if _, err := buf.ReadFrom(r); err != nil { - if strict { - return err + if err == io.ErrUnexpectedEOF { + // This is error returned due to corruption, with strict == false. + continue + } else { + return errors.SetFile(err, file) } - continue } - if err := batch.decode(buf.Bytes()); err != nil { - return err - } - if err := batch.memReplay(mem); err != nil { - return err + if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mem); err != nil { + if strict || !errors.IsCorrupted(err) { + return errors.SetFile(err, file) + } else { + db.s.logf("journal error: %v (skipped)", err) + // We won't apply sequence number as it might be corrupted. + continue + } } - d.seq = batch.seq + uint64(batch.len()) + + // Save sequence number. + db.seq = batch.seq + uint64(batch.Len()) + + // Flush it if large enough. if mem.Size() >= writeBuffer { - // Large enough, flush it. if err := cm.flush(mem, 0); err != nil { return err } - // Reset memdb. mem.Reset() } } + of = file return nil } + // Recover all journals. - if len(ff2) > 0 { - s.logf("journal@recovery F·%d", len(ff2)) - mem = memdb.New(s.icmp, writeBuffer) - for _, file := range ff2 { + if len(journalFiles) > 0 { + db.logf("journal@recovery F·%d", len(journalFiles)) + + // Mark file number as used. + db.s.markFileNum(journalFiles[len(journalFiles)-1].Num()) + + mem = memdb.New(db.s.icmp, writeBuffer) + for _, file := range journalFiles { if err := recoverJournal(file); err != nil { return err } } + // Flush the last journal. if mem.Len() > 0 { if err := cm.flush(mem, 0); err != nil { @@ -489,72 +580,140 @@ func (d *DB) recoverJournal() error { } } } + // Create a new journal. - if _, err := d.newMem(0); err != nil { + if _, err := db.newMem(0); err != nil { return err } + // Commit. - if err := cm.commit(d.journalFile.Num(), d.seq); err != nil { + if err := cm.commit(db.journalFile.Num(), db.seq); err != nil { // Close journal. - if d.journal != nil { - d.journal.Close() - d.journalWriter.Close() + if db.journal != nil { + db.journal.Close() + db.journalWriter.Close() } return err } - // Remove the last journal. + + // Remove the last obsolete journal file. if of != nil { of.Remove() } + return nil } -func (d *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) { - s := d.s +func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) { + ikey := newIkey(key, seq, ktSeek) - ikey := newIKey(key, seq, tSeek) - - em, fm := d.getMems() - for _, m := range [...]*memdb.DB{em, fm} { + em, fm := db.getMems() + for _, m := range [...]*memDB{em, fm} { if m == nil { continue } - mk, mv, me := m.Find(ikey) + defer m.decref() + + mk, mv, me := m.mdb.Find(ikey) if me == nil { - ukey, _, t, ok := parseIkey(mk) - if ok && s.icmp.uCompare(ukey, key) == 0 { - if t == tDel { + ukey, _, kt, kerr := parseIkey(mk) + if kerr != nil { + // Shouldn't have had happen. + panic(kerr) + } + if db.s.icmp.uCompare(ukey, key) == 0 { + if kt == ktDel { return nil, ErrNotFound } - return mv, nil + return append([]byte{}, mv...), nil } } else if me != ErrNotFound { return nil, me } } - v := s.version() - value, cSched, err := v.get(ikey, ro) + v := db.s.version() + value, cSched, err := v.get(ikey, ro, false) v.release() if cSched { // Trigger table compaction. - d.compTrigger(d.tcompTriggerC) + db.compSendTrigger(db.tcompCmdC) + } + return +} + +func (db *DB) has(key []byte, seq uint64, ro *opt.ReadOptions) (ret bool, err error) { + ikey := newIkey(key, seq, ktSeek) + + em, fm := db.getMems() + for _, m := range [...]*memDB{em, fm} { + if m == nil { + continue + } + defer m.decref() + + mk, _, me := m.mdb.Find(ikey) + if me == nil { + ukey, _, kt, kerr := parseIkey(mk) + if kerr != nil { + // Shouldn't have had happen. + panic(kerr) + } + if db.s.icmp.uCompare(ukey, key) == 0 { + if kt == ktDel { + return false, nil + } + return true, nil + } + } else if me != ErrNotFound { + return false, me + } + } + + v := db.s.version() + _, cSched, err := v.get(ikey, ro, true) + v.release() + if cSched { + // Trigger table compaction. + db.compSendTrigger(db.tcompCmdC) + } + if err == nil { + ret = true + } else if err == ErrNotFound { + err = nil } return } // Get gets the value for the given key. It returns ErrNotFound if the -// DB does not contain the key. +// DB does not contains the key. // -// The caller should not modify the contents of the returned slice, but -// it is safe to modify the contents of the argument after Get returns. -func (d *DB) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { - err = d.ok() +// The returned slice is its own copy, it is safe to modify the contents +// of the returned slice. +// It is safe to modify the contents of the argument after Get returns. +func (db *DB) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { + err = db.ok() if err != nil { return } - return d.get(key, d.getSeq(), ro) + se := db.acquireSnapshot() + defer db.releaseSnapshot(se) + return db.get(key, se.seq, ro) +} + +// Has returns true if the DB does contains the given key. +// +// It is safe to modify the contents of the argument after Get returns. +func (db *DB) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error) { + err = db.ok() + if err != nil { + return + } + + se := db.acquireSnapshot() + defer db.releaseSnapshot(se) + return db.has(key, se.seq, ro) } // NewIterator returns an iterator for the latest snapshot of the @@ -573,14 +732,16 @@ func (d *DB) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { // The iterator must be released after use, by calling Release method. // // Also read Iterator documentation of the leveldb/iterator package. -func (d *DB) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { - if err := d.ok(); err != nil { +func (db *DB) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + if err := db.ok(); err != nil { return iterator.NewEmptyIterator(err) } - p := d.newSnapshot() - defer p.Release() - return p.NewIterator(slice, ro) + se := db.acquireSnapshot() + defer db.releaseSnapshot(se) + // Iterator holds 'version' lock, 'version' is immutable so snapshot + // can be released after iterator created. + return db.newIterator(se.seq, slice, ro) } // GetSnapshot returns a latest snapshot of the underlying DB. A snapshot @@ -588,25 +749,35 @@ func (d *DB) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterat // content of snapshot are guaranteed to be consistent. // // The snapshot must be released after use, by calling Release method. -func (d *DB) GetSnapshot() (*Snapshot, error) { - if err := d.ok(); err != nil { +func (db *DB) GetSnapshot() (*Snapshot, error) { + if err := db.ok(); err != nil { return nil, err } - return d.newSnapshot(), nil + return db.newSnapshot(), nil } // GetProperty returns value of the given property name. // // Property names: // leveldb.num-files-at-level{n} -// Returns the number of filer at level 'n'. +// Returns the number of files at level 'n'. // leveldb.stats // Returns statistics of the underlying DB. // leveldb.sstables // Returns sstables list for each level. -func (d *DB) GetProperty(name string) (value string, err error) { - err = d.ok() +// leveldb.blockpool +// Returns block pool stats. +// leveldb.cachedblock +// Returns size of cached block. +// leveldb.openedtables +// Returns number of opened tables. +// leveldb.alivesnaps +// Returns number of alive snapshots. +// leveldb.aliveiters +// Returns number of alive iterators. +func (db *DB) GetProperty(name string) (value string, err error) { + err = db.ok() if err != nil { return } @@ -615,19 +786,18 @@ func (d *DB) GetProperty(name string) (value string, err error) { if !strings.HasPrefix(name, prefix) { return "", errors.New("leveldb: GetProperty: unknown property: " + name) } - p := name[len(prefix):] - s := d.s - v := s.version() + v := db.s.version() defer v.release() + numFilesPrefix := "num-files-at-level" switch { - case strings.HasPrefix(p, "num-files-at-level"): + case strings.HasPrefix(p, numFilesPrefix): var level uint var rest string - n, _ := fmt.Scanf("%d%s", &level, &rest) - if n != 1 || level >= kNumLevels { + n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest) + if n != 1 || int(level) >= db.s.o.GetNumLevel() { err = errors.New("leveldb: GetProperty: invalid property: " + name) } else { value = fmt.Sprint(v.tLen(int(level))) @@ -636,22 +806,36 @@ func (d *DB) GetProperty(name string) (value string, err error) { value = "Compactions\n" + " Level | Tables | Size(MB) | Time(sec) | Read(MB) | Write(MB)\n" + "-------+------------+---------------+---------------+---------------+---------------\n" - for level, tt := range v.tables { - duration, read, write := d.compStats[level].get() - if len(tt) == 0 && duration == 0 { + for level, tables := range v.tables { + duration, read, write := db.compStats[level].get() + if len(tables) == 0 && duration == 0 { continue } value += fmt.Sprintf(" %3d | %10d | %13.5f | %13.5f | %13.5f | %13.5f\n", - level, len(tt), float64(tt.size())/1048576.0, duration.Seconds(), + level, len(tables), float64(tables.size())/1048576.0, duration.Seconds(), float64(read)/1048576.0, float64(write)/1048576.0) } case p == "sstables": - for level, tt := range v.tables { + for level, tables := range v.tables { value += fmt.Sprintf("--- level %d ---\n", level) - for _, t := range tt { - value += fmt.Sprintf("%d:%d[%q .. %q]\n", t.file.Num(), t.size, t.min, t.max) + for _, t := range tables { + value += fmt.Sprintf("%d:%d[%q .. %q]\n", t.file.Num(), t.size, t.imin, t.imax) } } + case p == "blockpool": + value = fmt.Sprintf("%v", db.s.tops.bpool) + case p == "cachedblock": + if db.s.tops.bcache != nil { + value = fmt.Sprintf("%d", db.s.tops.bcache.Size()) + } else { + value = "" + } + case p == "openedtables": + value = fmt.Sprintf("%d", db.s.tops.cache.Size()) + case p == "alivesnaps": + value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveSnaps)) + case p == "aliveiters": + value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveIters)) default: err = errors.New("leveldb: GetProperty: unknown property: " + name) } @@ -665,23 +849,23 @@ func (d *DB) GetProperty(name string) (value string, err error) { // data compresses by a factor of ten, the returned sizes will be one-tenth // the size of the corresponding user data size. // The results may not include the sizes of recently written data. -func (d *DB) SizeOf(ranges []util.Range) (Sizes, error) { - if err := d.ok(); err != nil { +func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) { + if err := db.ok(); err != nil { return nil, err } - v := d.s.version() + v := db.s.version() defer v.release() sizes := make(Sizes, 0, len(ranges)) for _, r := range ranges { - min := newIKey(r.Start, kMaxSeq, tSeek) - max := newIKey(r.Limit, kMaxSeq, tSeek) - start, err := v.offsetOf(min) + imin := newIkey(r.Start, kMaxSeq, ktSeek) + imax := newIkey(r.Limit, kMaxSeq, ktSeek) + start, err := v.offsetOf(imin) if err != nil { return nil, err } - limit, err := v.offsetOf(max) + limit, err := v.offsetOf(imax) if err != nil { return nil, err } @@ -695,61 +879,67 @@ func (d *DB) SizeOf(ranges []util.Range) (Sizes, error) { return sizes, nil } -// Close closes the DB. This will also releases any outstanding snapshot. +// Close closes the DB. This will also releases any outstanding snapshot and +// abort any in-flight compaction. // // It is not safe to close a DB until all outstanding iterators are released. // It is valid to call Close multiple times. Other methods should not be // called after the DB has been closed. -func (d *DB) Close() error { - if !d.setClosed() { +func (db *DB) Close() error { + if !db.setClosed() { return ErrClosed } - s := d.s start := time.Now() - s.log("db@close closing") + db.log("db@close closing") // Clear the finalizer. - runtime.SetFinalizer(d, nil) + runtime.SetFinalizer(db, nil) // Get compaction error. var err error select { - case err = <-d.compErrC: + case err = <-db.compErrC: default: } - close(d.closeC) + // Signal all goroutines. + close(db.closeC) - // Wait for the close WaitGroup. - d.closeW.Wait() + // Wait for all gorotines to exit. + db.closeW.Wait() + + // Lock writer and closes journal. + db.writeLockC <- struct{}{} + if db.journal != nil { + db.journal.Close() + db.journalWriter.Close() + } - // Close journal. - if d.journal != nil { - d.journal.Close() - d.journalWriter.Close() + if db.writeDelayN > 0 { + db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay) } // Close session. - s.close() - s.logf("db@close done T·%v", time.Since(start)) - s.release() + db.s.close() + db.logf("db@close done T·%v", time.Since(start)) + db.s.release() - if d.closer != nil { - if err1 := d.closer.Close(); err == nil { + if db.closer != nil { + if err1 := db.closer.Close(); err == nil { err = err1 } } - d.s = nil - d.mem = nil - d.frozenMem = nil - d.journal = nil - d.journalWriter = nil - d.journalFile = nil - d.frozenJournalFile = nil - d.snapsRoot = snapshotElement{} - d.closer = nil + // NIL'ing pointers. + db.s = nil + db.mem = nil + db.frozenMem = nil + db.journal = nil + db.journalWriter = nil + db.journalFile = nil + db.frozenJournalFile = nil + db.closer = nil return err } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_compaction.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_compaction.go index c82bd9f28..447407aba 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_compaction.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_compaction.go @@ -7,11 +7,12 @@ package leveldb import ( - "errors" "sync" "time" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/opt" ) var ( @@ -68,13 +69,13 @@ type cMem struct { } func newCMem(s *session) *cMem { - return &cMem{s: s, rec: new(sessionRecord)} + return &cMem{s: s, rec: &sessionRecord{numLevel: s.o.GetNumLevel()}} } func (c *cMem) flush(mem *memdb.DB, level int) error { s := c.s - // Write memdb to table + // Write memdb to table. iter := mem.NewIterator(nil) defer iter.Release() t, n, err := s.tops.createFrom(iter) @@ -82,51 +83,85 @@ func (c *cMem) flush(mem *memdb.DB, level int) error { return err } + // Pick level. if level < 0 { - level = s.version_NB().pickLevel(t.min.ukey(), t.max.ukey()) + v := s.version() + level = v.pickLevel(t.imin.ukey(), t.imax.ukey()) + v.release() } c.rec.addTableFile(level, t) - s.logf("mem@flush created L%d@%d N·%d S·%s %q:%q", level, t.file.Num(), n, shortenb(int(t.size)), t.min, t.max) + s.logf("mem@flush created L%d@%d N·%d S·%s %q:%q", level, t.file.Num(), n, shortenb(int(t.size)), t.imin, t.imax) c.level = level return nil } func (c *cMem) reset() { - c.rec = new(sessionRecord) + c.rec = &sessionRecord{numLevel: c.s.o.GetNumLevel()} } func (c *cMem) commit(journal, seq uint64) error { c.rec.setJournalNum(journal) - c.rec.setSeq(seq) - // Commit changes + c.rec.setSeqNum(seq) + + // Commit changes. return c.s.commit(c.rec) } -func (d *DB) compactionError() { - var err error +func (db *DB) compactionError() { + var ( + err error + wlocked bool + ) noerr: + // No error. for { select { - case _, _ = <-d.closeC: - return - case err = <-d.compErrSetC: - if err != nil { + case err = <-db.compErrSetC: + switch { + case err == nil: + case errors.IsCorrupted(err): + goto hasperr + default: goto haserr } + case _, _ = <-db.closeC: + return } } haserr: + // Transient error. for { select { - case _, _ = <-d.closeC: - return - case err = <-d.compErrSetC: - if err == nil { + case db.compErrC <- err: + case err = <-db.compErrSetC: + switch { + case err == nil: goto noerr + case errors.IsCorrupted(err): + goto hasperr + default: } - case d.compErrC <- err: + case _, _ = <-db.closeC: + return + } + } +hasperr: + // Persistent error. + for { + select { + case db.compErrC <- err: + case db.compPerErrC <- err: + case db.writeLockC <- struct{}{}: + // Hold write lock, so that write won't pass-through. + wlocked = true + case _, _ = <-db.closeC: + if wlocked { + // We should release the lock or Close will hang. + <-db.writeLockC + } + return } } } @@ -137,114 +172,159 @@ func (cnt *compactionTransactCounter) incr() { *cnt++ } -func (d *DB) compactionTransact(name string, exec func(cnt *compactionTransactCounter) error, rollback func() error) { - s := d.s +type compactionTransactInterface interface { + run(cnt *compactionTransactCounter) error + revert() error +} + +func (db *DB) compactionTransact(name string, t compactionTransactInterface) { defer func() { if x := recover(); x != nil { - if x == errCompactionTransactExiting && rollback != nil { - if err := rollback(); err != nil { - s.logf("%s rollback error %q", name, err) + if x == errCompactionTransactExiting { + if err := t.revert(); err != nil { + db.logf("%s revert error %q", name, err) } } panic(x) } }() + const ( backoffMin = 1 * time.Second backoffMax = 8 * time.Second backoffMul = 2 * time.Second ) - backoff := backoffMin - backoffT := time.NewTimer(backoff) - lastCnt := compactionTransactCounter(0) + var ( + backoff = backoffMin + backoffT = time.NewTimer(backoff) + lastCnt = compactionTransactCounter(0) + + disableBackoff = db.s.o.GetDisableCompactionBackoff() + ) for n := 0; ; n++ { // Check wether the DB is closed. - if d.isClosed() { - s.logf("%s exiting", name) - d.compactionExitTransact() + if db.isClosed() { + db.logf("%s exiting", name) + db.compactionExitTransact() } else if n > 0 { - s.logf("%s retrying N·%d", name, n) + db.logf("%s retrying N·%d", name, n) } // Execute. cnt := compactionTransactCounter(0) - err := exec(&cnt) + err := t.run(&cnt) + if err != nil { + db.logf("%s error I·%d %q", name, cnt, err) + } // Set compaction error status. select { - case d.compErrSetC <- err: - case _, _ = <-d.closeC: - s.logf("%s exiting", name) - d.compactionExitTransact() + case db.compErrSetC <- err: + case perr := <-db.compPerErrC: + if err != nil { + db.logf("%s exiting (persistent error %q)", name, perr) + db.compactionExitTransact() + } + case _, _ = <-db.closeC: + db.logf("%s exiting", name) + db.compactionExitTransact() } if err == nil { return } - s.logf("%s error I·%d %q", name, cnt, err) - - // Reset backoff duration if counter is advancing. - if cnt > lastCnt { - backoff = backoffMin - lastCnt = cnt + if errors.IsCorrupted(err) { + db.logf("%s exiting (corruption detected)", name) + db.compactionExitTransact() } - // Backoff. - backoffT.Reset(backoff) - if backoff < backoffMax { - backoff *= backoffMul - if backoff > backoffMax { - backoff = backoffMax + if !disableBackoff { + // Reset backoff duration if counter is advancing. + if cnt > lastCnt { + backoff = backoffMin + lastCnt = cnt + } + + // Backoff. + backoffT.Reset(backoff) + if backoff < backoffMax { + backoff *= backoffMul + if backoff > backoffMax { + backoff = backoffMax + } + } + select { + case <-backoffT.C: + case _, _ = <-db.closeC: + db.logf("%s exiting", name) + db.compactionExitTransact() } } - select { - case <-backoffT.C: - case _, _ = <-d.closeC: - s.logf("%s exiting", name) - d.compactionExitTransact() - } } } -func (d *DB) compactionExitTransact() { +type compactionTransactFunc struct { + runFunc func(cnt *compactionTransactCounter) error + revertFunc func() error +} + +func (t *compactionTransactFunc) run(cnt *compactionTransactCounter) error { + return t.runFunc(cnt) +} + +func (t *compactionTransactFunc) revert() error { + if t.revertFunc != nil { + return t.revertFunc() + } + return nil +} + +func (db *DB) compactionTransactFunc(name string, run func(cnt *compactionTransactCounter) error, revert func() error) { + db.compactionTransact(name, &compactionTransactFunc{run, revert}) +} + +func (db *DB) compactionExitTransact() { panic(errCompactionTransactExiting) } -func (d *DB) memCompaction() { - mem := d.getFrozenMem() +func (db *DB) memCompaction() { + mem := db.getFrozenMem() if mem == nil { return } + defer mem.decref() - s := d.s - c := newCMem(s) + c := newCMem(db.s) stats := new(cStatsStaging) - s.logf("mem@flush N·%d S·%s", mem.Len(), shortenb(mem.Size())) + db.logf("mem@flush N·%d S·%s", mem.mdb.Len(), shortenb(mem.mdb.Size())) // Don't compact empty memdb. - if mem.Len() == 0 { - s.logf("mem@flush skipping") + if mem.mdb.Len() == 0 { + db.logf("mem@flush skipping") // drop frozen mem - d.dropFrozenMem() + db.dropFrozenMem() return } // Pause table compaction. - ch := make(chan struct{}) + resumeC := make(chan struct{}) select { - case d.tcompPauseC <- (chan<- struct{})(ch): - case _, _ = <-d.closeC: + case db.tcompPauseC <- (chan<- struct{})(resumeC): + case <-db.compPerErrC: + close(resumeC) + resumeC = nil + case _, _ = <-db.closeC: return } - d.compactionTransact("mem@flush", func(cnt *compactionTransactCounter) (err error) { + db.compactionTransactFunc("mem@flush", func(cnt *compactionTransactCounter) (err error) { stats.startTimer() defer stats.stopTimer() - return c.flush(mem, -1) + return c.flush(mem.mdb, -1) }, func() error { for _, r := range c.rec.addedTables { - s.logf("mem@flush rollback @%d", r.num) - f := s.getTableFile(r.num) + db.logf("mem@flush revert @%d", r.num) + f := db.s.getTableFile(r.num) if err := f.Remove(); err != nil { return err } @@ -252,279 +332,327 @@ func (d *DB) memCompaction() { return nil }) - d.compactionTransact("mem@commit", func(cnt *compactionTransactCounter) (err error) { + db.compactionTransactFunc("mem@commit", func(cnt *compactionTransactCounter) (err error) { stats.startTimer() defer stats.stopTimer() - return c.commit(d.journalFile.Num(), d.frozenSeq) + return c.commit(db.journalFile.Num(), db.frozenSeq) }, nil) - s.logf("mem@flush commited F·%d T·%v", len(c.rec.addedTables), stats.duration) + db.logf("mem@flush committed F·%d T·%v", len(c.rec.addedTables), stats.duration) for _, r := range c.rec.addedTables { stats.write += r.size } - d.compStats[c.level].add(stats) + db.compStats[c.level].add(stats) // Drop frozen mem. - d.dropFrozenMem() + db.dropFrozenMem() // Resume table compaction. - select { - case <-ch: - case _, _ = <-d.closeC: - return + if resumeC != nil { + select { + case <-resumeC: + close(resumeC) + case _, _ = <-db.closeC: + return + } } // Trigger table compaction. - d.compTrigger(d.mcompTriggerC) + db.compSendTrigger(db.tcompCmdC) } -func (d *DB) tableCompaction(c *compaction, noTrivial bool) { - s := d.s +type tableCompactionBuilder struct { + db *DB + s *session + c *compaction + rec *sessionRecord + stat0, stat1 *cStatsStaging - rec := new(sessionRecord) - rec.addCompactionPointer(c.level, c.max) + snapHasLastUkey bool + snapLastUkey []byte + snapLastSeq uint64 + snapIter int + snapKerrCnt int + snapDropCnt int - if !noTrivial && c.trivial() { - t := c.tables[0][0] - s.logf("table@move L%d@%d -> L%d", c.level, t.file.Num(), c.level+1) - rec.deleteTable(c.level, t.file.Num()) - rec.addTableFile(c.level+1, t) - d.compactionTransact("table@move", func(cnt *compactionTransactCounter) (err error) { - return s.commit(rec) - }, nil) - return - } + kerrCnt int + dropCnt int - var stats [2]cStatsStaging - for i, tt := range c.tables { - for _, t := range tt { - stats[i].read += t.size - // Insert deleted tables into record - rec.deleteTable(c.level+i, t.file.Num()) - } - } - sourceSize := int(stats[0].read + stats[1].read) - minSeq := d.minSeq() - s.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.level, len(c.tables[0]), c.level+1, len(c.tables[1]), shortenb(sourceSize), minSeq) - - var snapUkey []byte - var snapHasUkey bool - var snapSeq uint64 - var snapIter int - var snapDropCnt int - var dropCnt int - d.compactionTransact("table@build", func(cnt *compactionTransactCounter) (err error) { - ukey := append([]byte{}, snapUkey...) - hasUkey := snapHasUkey - lseq := snapSeq - dropCnt = snapDropCnt - snapSched := snapIter == 0 - - var tw *tWriter - finish := func() error { - t, err := tw.finish() - if err != nil { - return err + minSeq uint64 + strict bool + tableSize int + + tw *tWriter +} + +func (b *tableCompactionBuilder) appendKV(key, value []byte) error { + // Create new table if not already. + if b.tw == nil { + // Check for pause event. + if b.db != nil { + select { + case ch := <-b.db.tcompPauseC: + b.db.pauseCompaction(ch) + case _, _ = <-b.db.closeC: + b.db.compactionExitTransact() + default: } - rec.addTableFile(c.level+1, t) - stats[1].write += t.size - s.logf("table@build created L%d@%d N·%d S·%s %q:%q", c.level+1, t.file.Num(), tw.tw.EntriesLen(), shortenb(int(t.size)), t.min, t.max) - return nil } - defer func() { - stats[1].stopTimer() - if tw != nil { - tw.drop() - tw = nil - } - }() + // Create new table. + var err error + b.tw, err = b.s.tops.create() + if err != nil { + return err + } + } - stats[1].startTimer() - iter := c.newIterator() - defer iter.Release() - for i := 0; iter.Next(); i++ { - // Incr transact counter. - cnt.incr() - - // Skip until last state. - if i < snapIter { - continue - } + // Write key/value into table. + return b.tw.append(key, value) +} - key := iKey(iter.Key()) +func (b *tableCompactionBuilder) needFlush() bool { + return b.tw.tw.BytesLen() >= b.tableSize +} - if c.shouldStopBefore(key) && tw != nil { - err = finish() - if err != nil { - return - } - snapSched = true - tw = nil - } +func (b *tableCompactionBuilder) flush() error { + t, err := b.tw.finish() + if err != nil { + return err + } + b.rec.addTableFile(b.c.level+1, t) + b.stat1.write += t.size + b.s.logf("table@build created L%d@%d N·%d S·%s %q:%q", b.c.level+1, t.file.Num(), b.tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax) + b.tw = nil + return nil +} - // Scheduled for snapshot, snapshot will used to retry compaction - // if error occured. - if snapSched { - snapUkey = append(snapUkey[:0], ukey...) - snapHasUkey = hasUkey - snapSeq = lseq - snapIter = i - snapDropCnt = dropCnt - snapSched = false - } +func (b *tableCompactionBuilder) cleanup() { + if b.tw != nil { + b.tw.drop() + b.tw = nil + } +} - if seq, t, ok := key.parseNum(); !ok { - // Don't drop error keys - ukey = ukey[:0] - hasUkey = false - lseq = kMaxSeq - } else { - if !hasUkey || s.icmp.uCompare(key.ukey(), ukey) != 0 { - // First occurrence of this user key - ukey = append(ukey[:0], key.ukey()...) - hasUkey = true - lseq = kMaxSeq - } +func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error { + snapResumed := b.snapIter > 0 + hasLastUkey := b.snapHasLastUkey // The key might has zero length, so this is necessary. + lastUkey := append([]byte{}, b.snapLastUkey...) + lastSeq := b.snapLastSeq + b.kerrCnt = b.snapKerrCnt + b.dropCnt = b.snapDropCnt + // Restore compaction state. + b.c.restore() - drop := false - if lseq <= minSeq { - // Dropped because newer entry for same user key exist - drop = true // (A) - } else if t == tDel && seq <= minSeq && c.isBaseLevelForKey(ukey) { - // For this user key: - // (1) there is no data in higher levels - // (2) data in lower levels will have larger seq numbers - // (3) data in layers that are being compacted here and have - // smaller seq numbers will be dropped in the next - // few iterations of this loop (by rule (A) above). - // Therefore this deletion marker is obsolete and can be dropped. - drop = true - } + defer b.cleanup() - lseq = seq - if drop { - dropCnt++ - continue - } - } + b.stat1.startTimer() + defer b.stat1.stopTimer() - // Create new table if not already - if tw == nil { - // Check for pause event. - select { - case ch := <-d.tcompPauseC: - d.pauseCompaction(ch) - case _, _ = <-d.closeC: - d.compactionExitTransact() - default: - } + iter := b.c.newIterator() + defer iter.Release() + for i := 0; iter.Next(); i++ { + // Incr transact counter. + cnt.incr() + + // Skip until last state. + if i < b.snapIter { + continue + } - // Create new table. - tw, err = s.tops.create() - if err != nil { - return + resumed := false + if snapResumed { + resumed = true + snapResumed = false + } + + ikey := iter.Key() + ukey, seq, kt, kerr := parseIkey(ikey) + + if kerr == nil { + shouldStop := !resumed && b.c.shouldStopBefore(ikey) + + if !hasLastUkey || b.s.icmp.uCompare(lastUkey, ukey) != 0 { + // First occurrence of this user key. + + // Only rotate tables if ukey doesn't hop across. + if b.tw != nil && (shouldStop || b.needFlush()) { + if err := b.flush(); err != nil { + return err + } + + // Creates snapshot of the state. + b.c.save() + b.snapHasLastUkey = hasLastUkey + b.snapLastUkey = append(b.snapLastUkey[:0], lastUkey...) + b.snapLastSeq = lastSeq + b.snapIter = i + b.snapKerrCnt = b.kerrCnt + b.snapDropCnt = b.dropCnt } - } - // Write key/value into table - err = tw.add(key, iter.Value()) - if err != nil { - return + hasLastUkey = true + lastUkey = append(lastUkey[:0], ukey...) + lastSeq = kMaxSeq } - // Finish table if it is big enough - if tw.tw.BytesLen() >= kMaxTableSize { - err = finish() - if err != nil { - return - } - snapSched = true - tw = nil + switch { + case lastSeq <= b.minSeq: + // Dropped because newer entry for same user key exist + fallthrough // (A) + case kt == ktDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey): + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger seq numbers + // (3) data in layers that are being compacted here and have + // smaller seq numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + lastSeq = seq + b.dropCnt++ + continue + default: + lastSeq = seq + } + } else { + if b.strict { + return kerr } + + // Don't drop corrupted keys. + hasLastUkey = false + lastUkey = lastUkey[:0] + lastSeq = kMaxSeq + b.kerrCnt++ } - err = iter.Error() - if err != nil { - return + if err := b.appendKV(ikey, iter.Value()); err != nil { + return err } + } - // Finish last table - if tw != nil && !tw.empty() { - err = finish() - if err != nil { - return - } - tw = nil + if err := iter.Error(); err != nil { + return err + } + + // Finish last table. + if b.tw != nil && !b.tw.empty() { + return b.flush() + } + return nil +} + +func (b *tableCompactionBuilder) revert() error { + for _, at := range b.rec.addedTables { + b.s.logf("table@build revert @%d", at.num) + f := b.s.getTableFile(at.num) + if err := f.Remove(); err != nil { + return err } + } + return nil +} + +func (db *DB) tableCompaction(c *compaction, noTrivial bool) { + defer c.release() + + rec := &sessionRecord{numLevel: db.s.o.GetNumLevel()} + rec.addCompPtr(c.level, c.imax) + + if !noTrivial && c.trivial() { + t := c.tables[0][0] + db.logf("table@move L%d@%d -> L%d", c.level, t.file.Num(), c.level+1) + rec.delTable(c.level, t.file.Num()) + rec.addTableFile(c.level+1, t) + db.compactionTransactFunc("table@move", func(cnt *compactionTransactCounter) (err error) { + return db.s.commit(rec) + }, nil) return - }, func() error { - for _, r := range rec.addedTables { - s.logf("table@build rollback @%d", r.num) - f := s.getTableFile(r.num) - if err := f.Remove(); err != nil { - return err - } + } + + var stats [2]cStatsStaging + for i, tables := range c.tables { + for _, t := range tables { + stats[i].read += t.size + // Insert deleted tables into record + rec.delTable(c.level+i, t.file.Num()) } - return nil - }) + } + sourceSize := int(stats[0].read + stats[1].read) + minSeq := db.minSeq() + db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.level, len(c.tables[0]), c.level+1, len(c.tables[1]), shortenb(sourceSize), minSeq) + + b := &tableCompactionBuilder{ + db: db, + s: db.s, + c: c, + rec: rec, + stat1: &stats[1], + minSeq: minSeq, + strict: db.s.o.GetStrict(opt.StrictCompaction), + tableSize: db.s.o.GetCompactionTableSize(c.level + 1), + } + db.compactionTransact("table@build", b) // Commit changes - d.compactionTransact("table@commit", func(cnt *compactionTransactCounter) (err error) { + db.compactionTransactFunc("table@commit", func(cnt *compactionTransactCounter) (err error) { stats[1].startTimer() defer stats[1].stopTimer() - return s.commit(rec) + return db.s.commit(rec) }, nil) - resultSize := int(int(stats[1].write)) - s.logf("table@compaction commited F%s S%s D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), dropCnt, stats[1].duration) + resultSize := int(stats[1].write) + db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration) // Save compaction stats for i := range stats { - d.compStats[c.level+1].add(&stats[i]) + db.compStats[c.level+1].add(&stats[i]) } } -func (d *DB) tableRangeCompaction(level int, min, max []byte) { - s := d.s - s.logf("table@compaction range L%d %q:%q", level, min, max) +func (db *DB) tableRangeCompaction(level int, umin, umax []byte) { + db.logf("table@compaction range L%d %q:%q", level, umin, umax) if level >= 0 { - if c := s.getCompactionRange(level, min, max); c != nil { - d.tableCompaction(c, true) + if c := db.s.getCompactionRange(level, umin, umax); c != nil { + db.tableCompaction(c, true) } } else { - v := s.version_NB() + v := db.s.version() m := 1 for i, t := range v.tables[1:] { - if t.isOverlaps(min, max, true, s.icmp) { + if t.overlaps(db.s.icmp, umin, umax, false) { m = i + 1 } } + v.release() + for level := 0; level < m; level++ { - if c := s.getCompactionRange(level, min, max); c != nil { - d.tableCompaction(c, true) + if c := db.s.getCompactionRange(level, umin, umax); c != nil { + db.tableCompaction(c, true) } } } } -func (d *DB) tableAutoCompaction() { - if c := d.s.pickCompaction(); c != nil { - d.tableCompaction(c, false) +func (db *DB) tableAutoCompaction() { + if c := db.s.pickCompaction(); c != nil { + db.tableCompaction(c, false) } } -func (d *DB) tableNeedCompaction() bool { - return d.s.version_NB().needCompaction() +func (db *DB) tableNeedCompaction() bool { + v := db.s.version() + defer v.release() + return v.needCompaction() } -func (d *DB) pauseCompaction(ch chan<- struct{}) { +func (db *DB) pauseCompaction(ch chan<- struct{}) { select { case ch <- struct{}{}: - case _, _ = <-d.closeC: - d.compactionExitTransact() + case _, _ = <-db.closeC: + db.compactionExitTransact() } } @@ -537,7 +665,12 @@ type cIdle struct { } func (r cIdle) ack(err error) { - r.ackC <- err + if r.ackC != nil { + defer func() { + recover() + }() + r.ackC <- err + } } type cRange struct { @@ -547,56 +680,67 @@ type cRange struct { } func (r cRange) ack(err error) { - defer func() { - recover() - }() if r.ackC != nil { + defer func() { + recover() + }() r.ackC <- err } } -func (d *DB) compSendIdle(compC chan<- cCmd) error { +// This will trigger auto compation and/or wait for all compaction to be done. +func (db *DB) compSendIdle(compC chan<- cCmd) (err error) { ch := make(chan error) defer close(ch) // Send cmd. select { case compC <- cIdle{ch}: - case err := <-d.compErrC: - return err - case _, _ = <-d.closeC: + case err = <-db.compErrC: + return + case _, _ = <-db.closeC: return ErrClosed } // Wait cmd. - return <-ch + select { + case err = <-ch: + case err = <-db.compErrC: + case _, _ = <-db.closeC: + return ErrClosed + } + return err } -func (d *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err error) { +// This will trigger auto compaction but will not wait for it. +func (db *DB) compSendTrigger(compC chan<- cCmd) { + select { + case compC <- cIdle{}: + default: + } +} + +// Send range compaction request. +func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err error) { ch := make(chan error) defer close(ch) // Send cmd. select { case compC <- cRange{level, min, max, ch}: - case err := <-d.compErrC: + case err := <-db.compErrC: return err - case _, _ = <-d.closeC: + case _, _ = <-db.closeC: return ErrClosed } // Wait cmd. select { - case err = <-d.compErrC: case err = <-ch: + case err = <-db.compErrC: + case _, _ = <-db.closeC: + return ErrClosed } return err } -func (d *DB) compTrigger(compTriggerC chan struct{}) { - select { - case compTriggerC <- struct{}{}: - default: - } -} - -func (d *DB) mCompaction() { +func (db *DB) mCompaction() { var x cCmd defer func() { @@ -608,24 +752,27 @@ func (d *DB) mCompaction() { if x != nil { x.ack(ErrClosed) } - d.closeW.Done() + db.closeW.Done() }() for { select { - case _, _ = <-d.closeC: + case x = <-db.mcompCmdC: + switch x.(type) { + case cIdle: + db.memCompaction() + x.ack(nil) + x = nil + default: + panic("leveldb: unknown command") + } + case _, _ = <-db.closeC: return - case x = <-d.mcompCmdC: - d.memCompaction() - x.ack(nil) - x = nil - case <-d.mcompTriggerC: - d.memCompaction() } } } -func (d *DB) tCompaction() { +func (db *DB) tCompaction() { var x cCmd var ackQ []cCmd @@ -642,19 +789,18 @@ func (d *DB) tCompaction() { if x != nil { x.ack(ErrClosed) } - d.closeW.Done() + db.closeW.Done() }() for { - if d.tableNeedCompaction() { + if db.tableNeedCompaction() { select { - case x = <-d.tcompCmdC: - case <-d.tcompTriggerC: - case _, _ = <-d.closeC: - return - case ch := <-d.tcompPauseC: - d.pauseCompaction(ch) + case x = <-db.tcompCmdC: + case ch := <-db.tcompPauseC: + db.pauseCompaction(ch) continue + case _, _ = <-db.closeC: + return default: } } else { @@ -664,12 +810,11 @@ func (d *DB) tCompaction() { } ackQ = ackQ[:0] select { - case x = <-d.tcompCmdC: - case <-d.tcompTriggerC: - case ch := <-d.tcompPauseC: - d.pauseCompaction(ch) + case x = <-db.tcompCmdC: + case ch := <-db.tcompPauseC: + db.pauseCompaction(ch) continue - case _, _ = <-d.closeC: + case _, _ = <-db.closeC: return } } @@ -678,11 +823,13 @@ func (d *DB) tCompaction() { case cIdle: ackQ = append(ackQ, x) case cRange: - d.tableRangeCompaction(cmd.level, cmd.min, cmd.max) + db.tableRangeCompaction(cmd.level, cmd.min, cmd.max) x.ack(nil) + default: + panic("leveldb: unknown command") } x = nil } - d.tableAutoCompaction() + db.tableAutoCompaction() } } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_iter.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_iter.go index 9973a8fef..011a94a35 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_iter.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_iter.go @@ -8,7 +8,10 @@ package leveldb import ( "errors" + "math/rand" "runtime" + "sync" + "sync/atomic" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/opt" @@ -19,50 +22,69 @@ var ( errInvalidIkey = errors.New("leveldb: Iterator: invalid internal key") ) -func (db *DB) newRawIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { - s := db.s +type memdbReleaser struct { + once sync.Once + m *memDB +} + +func (mr *memdbReleaser) Release() { + mr.once.Do(func() { + mr.m.decref() + }) +} +func (db *DB) newRawIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { em, fm := db.getMems() - v := s.version() + v := db.s.version() ti := v.getIterators(slice, ro) n := len(ti) + 2 i := make([]iterator.Iterator, 0, n) - i = append(i, em.NewIterator(slice)) + emi := em.mdb.NewIterator(slice) + emi.SetReleaser(&memdbReleaser{m: em}) + i = append(i, emi) if fm != nil { - i = append(i, fm.NewIterator(slice)) + fmi := fm.mdb.NewIterator(slice) + fmi.SetReleaser(&memdbReleaser{m: fm}) + i = append(i, fmi) } i = append(i, ti...) - strict := s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator) - mi := iterator.NewMergedIterator(i, s.icmp, strict) + strict := opt.GetStrict(db.s.o.Options, ro, opt.StrictReader) + mi := iterator.NewMergedIterator(i, db.s.icmp, strict) mi.SetReleaser(&versionReleaser{v: v}) return mi } func (db *DB) newIterator(seq uint64, slice *util.Range, ro *opt.ReadOptions) *dbIter { - var slice_ *util.Range + var islice *util.Range if slice != nil { - slice_ = &util.Range{} + islice = &util.Range{} if slice.Start != nil { - slice_.Start = newIKey(slice.Start, kMaxSeq, tSeek) + islice.Start = newIkey(slice.Start, kMaxSeq, ktSeek) } if slice.Limit != nil { - slice_.Limit = newIKey(slice.Limit, kMaxSeq, tSeek) + islice.Limit = newIkey(slice.Limit, kMaxSeq, ktSeek) } } - rawIter := db.newRawIterator(slice_, ro) + rawIter := db.newRawIterator(islice, ro) iter := &dbIter{ + db: db, icmp: db.s.icmp, iter: rawIter, seq: seq, - strict: db.s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator), + strict: opt.GetStrict(db.s.o.Options, ro, opt.StrictReader), key: make([]byte, 0), value: make([]byte, 0), } + atomic.AddInt32(&db.aliveIters, 1) runtime.SetFinalizer(iter, (*dbIter).Release) return iter } +func (db *DB) iterSamplingRate() int { + return rand.Intn(2 * db.s.o.GetIteratorSamplingRate()) +} + type dir int const ( @@ -75,16 +97,27 @@ const ( // dbIter represent an interator states over a database session. type dbIter struct { + db *DB icmp *iComparer iter iterator.Iterator seq uint64 strict bool - dir dir - key []byte - value []byte - err error - releaser util.Releaser + smaplingGap int + dir dir + key []byte + value []byte + err error + releaser util.Releaser +} + +func (i *dbIter) sampleSeek() { + ikey := i.iter.Key() + i.smaplingGap -= len(ikey) + len(i.iter.Value()) + for i.smaplingGap < 0 { + i.smaplingGap += i.db.iterSamplingRate() + i.db.sampleSeek(ikey) + } } func (i *dbIter) setErr(err error) { @@ -144,7 +177,7 @@ func (i *dbIter) Seek(key []byte) bool { return false } - ikey := newIKey(key, i.seq, tSeek) + ikey := newIkey(key, i.seq, ktSeek) if i.iter.Seek(ikey) { i.dir = dirSOI return i.next() @@ -156,15 +189,15 @@ func (i *dbIter) Seek(key []byte) bool { func (i *dbIter) next() bool { for { - ukey, seq, t, ok := parseIkey(i.iter.Key()) - if ok { + if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil { + i.sampleSeek() if seq <= i.seq { - switch t { - case tDel: + switch kt { + case ktDel: // Skip deleted key. i.key = append(i.key[:0], ukey...) i.dir = dirForward - case tVal: + case ktVal: if i.dir == dirSOI || i.icmp.uCompare(ukey, i.key) > 0 { i.key = append(i.key[:0], ukey...) i.value = append(i.value[:0], i.iter.Value()...) @@ -174,7 +207,7 @@ func (i *dbIter) next() bool { } } } else if i.strict { - i.setErr(errInvalidIkey) + i.setErr(kerr) break } if !i.iter.Next() { @@ -207,20 +240,20 @@ func (i *dbIter) prev() bool { del := true if i.iter.Valid() { for { - ukey, seq, t, ok := parseIkey(i.iter.Key()) - if ok { + if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil { + i.sampleSeek() if seq <= i.seq { if !del && i.icmp.uCompare(ukey, i.key) < 0 { return true } - del = (t == tDel) + del = (kt == ktDel) if !del { i.key = append(i.key[:0], ukey...) i.value = append(i.value[:0], i.iter.Value()...) } } } else if i.strict { - i.setErr(errInvalidIkey) + i.setErr(kerr) return false } if !i.iter.Prev() { @@ -249,13 +282,13 @@ func (i *dbIter) Prev() bool { return i.Last() case dirForward: for i.iter.Prev() { - ukey, _, _, ok := parseIkey(i.iter.Key()) - if ok { + if ukey, _, _, kerr := parseIkey(i.iter.Key()); kerr == nil { + i.sampleSeek() if i.icmp.uCompare(ukey, i.key) < 0 { goto cont } } else if i.strict { - i.setErr(errInvalidIkey) + i.setErr(kerr) return false } } @@ -289,6 +322,7 @@ func (i *dbIter) Release() { if i.releaser != nil { i.releaser.Release() + i.releaser = nil } i.dir = dirReleased @@ -296,13 +330,19 @@ func (i *dbIter) Release() { i.value = nil i.iter.Release() i.iter = nil + atomic.AddInt32(&i.db.aliveIters, -1) + i.db = nil } } func (i *dbIter) SetReleaser(releaser util.Releaser) { - if i.dir != dirReleased { - i.releaser = releaser + if i.dir == dirReleased { + panic(util.ErrReleased) + } + if i.releaser != nil && releaser != nil { + panic(util.ErrHasReleaser) } + i.releaser = releaser } func (i *dbIter) Error() error { diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_snapshot.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_snapshot.go index 225b7cd5e..0372848ff 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_snapshot.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_snapshot.go @@ -7,8 +7,11 @@ package leveldb import ( + "container/list" + "fmt" "runtime" "sync" + "sync/atomic" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/opt" @@ -18,51 +21,41 @@ import ( type snapshotElement struct { seq uint64 ref int - // Next and previous pointers in the doubly-linked list of elements. - next, prev *snapshotElement -} - -// Initialize the snapshot. -func (db *DB) initSnapshot() { - db.snapsRoot.next = &db.snapsRoot - db.snapsRoot.prev = &db.snapsRoot + e *list.Element } // Acquires a snapshot, based on latest sequence. func (db *DB) acquireSnapshot() *snapshotElement { db.snapsMu.Lock() + defer db.snapsMu.Unlock() + seq := db.getSeq() - elem := db.snapsRoot.prev - if elem == &db.snapsRoot || elem.seq != seq { - at := db.snapsRoot.prev - next := at.next - elem = &snapshotElement{ - seq: seq, - prev: at, - next: next, + + if e := db.snapsList.Back(); e != nil { + se := e.Value.(*snapshotElement) + if se.seq == seq { + se.ref++ + return se + } else if seq < se.seq { + panic("leveldb: sequence number is not increasing") } - at.next = elem - next.prev = elem } - elem.ref++ - db.snapsMu.Unlock() - return elem + se := &snapshotElement{seq: seq, ref: 1} + se.e = db.snapsList.PushBack(se) + return se } // Releases given snapshot element. -func (db *DB) releaseSnapshot(elem *snapshotElement) { - if !db.isClosed() { - db.snapsMu.Lock() - elem.ref-- - if elem.ref == 0 { - elem.prev.next = elem.next - elem.next.prev = elem.prev - elem.next = nil - elem.prev = nil - } else if elem.ref < 0 { - panic("leveldb: Snapshot: negative element reference") - } - db.snapsMu.Unlock() +func (db *DB) releaseSnapshot(se *snapshotElement) { + db.snapsMu.Lock() + defer db.snapsMu.Unlock() + + se.ref-- + if se.ref == 0 { + db.snapsList.Remove(se.e) + se.e = nil + } else if se.ref < 0 { + panic("leveldb: Snapshot: negative element reference") } } @@ -70,10 +63,11 @@ func (db *DB) releaseSnapshot(elem *snapshotElement) { func (db *DB) minSeq() uint64 { db.snapsMu.Lock() defer db.snapsMu.Unlock() - elem := db.snapsRoot.prev - if elem != &db.snapsRoot { - return elem.seq + + if e := db.snapsList.Front(); e != nil { + return e.Value.(*snapshotElement).seq } + return db.getSeq() } @@ -81,38 +75,59 @@ func (db *DB) minSeq() uint64 { type Snapshot struct { db *DB elem *snapshotElement - mu sync.Mutex + mu sync.RWMutex released bool } // Creates new snapshot object. func (db *DB) newSnapshot() *Snapshot { - p := &Snapshot{ + snap := &Snapshot{ db: db, elem: db.acquireSnapshot(), } - runtime.SetFinalizer(p, (*Snapshot).Release) - return p + atomic.AddInt32(&db.aliveSnaps, 1) + runtime.SetFinalizer(snap, (*Snapshot).Release) + return snap +} + +func (snap *Snapshot) String() string { + return fmt.Sprintf("leveldb.Snapshot{%d}", snap.elem.seq) } // Get gets the value for the given key. It returns ErrNotFound if -// the DB does not contain the key. +// the DB does not contains the key. // // The caller should not modify the contents of the returned slice, but // it is safe to modify the contents of the argument after Get returns. -func (p *Snapshot) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { - db := p.db - err = db.ok() +func (snap *Snapshot) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { + err = snap.db.ok() if err != nil { return } - p.mu.Lock() - defer p.mu.Unlock() - if p.released { + snap.mu.RLock() + defer snap.mu.RUnlock() + if snap.released { err = ErrSnapshotReleased return } - return db.get(key, p.elem.seq, ro) + return snap.db.get(key, snap.elem.seq, ro) +} + +// Has returns true if the DB does contains the given key. +// +// It is safe to modify the contents of the argument after Get returns. +func (snap *Snapshot) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error) { + err = snap.db.ok() + if err != nil { + return + } + snap.mu.RLock() + defer snap.mu.RUnlock() + if snap.released { + err = ErrSnapshotReleased + return + } + return snap.db.has(key, snap.elem.seq, ro) } // NewIterator returns an iterator for the snapshot of the uderlying DB. @@ -132,17 +147,18 @@ func (p *Snapshot) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error // iterator would be still valid until released. // // Also read Iterator documentation of the leveldb/iterator package. -func (p *Snapshot) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { - db := p.db - if err := db.ok(); err != nil { +func (snap *Snapshot) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + if err := snap.db.ok(); err != nil { return iterator.NewEmptyIterator(err) } - p.mu.Lock() - defer p.mu.Unlock() - if p.released { + snap.mu.Lock() + defer snap.mu.Unlock() + if snap.released { return iterator.NewEmptyIterator(ErrSnapshotReleased) } - return db.newIterator(p.elem.seq, slice, ro) + // Since iterator already hold version ref, it doesn't need to + // hold snapshot ref. + return snap.db.newIterator(snap.elem.seq, slice, ro) } // Release releases the snapshot. This will not release any returned @@ -150,16 +166,18 @@ func (p *Snapshot) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator. // underlying DB is closed. // // Other methods should not be called after the snapshot has been released. -func (p *Snapshot) Release() { - p.mu.Lock() - if !p.released { +func (snap *Snapshot) Release() { + snap.mu.Lock() + defer snap.mu.Unlock() + + if !snap.released { // Clear the finalizer. - runtime.SetFinalizer(p, nil) + runtime.SetFinalizer(snap, nil) - p.released = true - p.db.releaseSnapshot(p.elem) - p.db = nil - p.elem = nil + snap.released = true + snap.db.releaseSnapshot(snap.elem) + atomic.AddInt32(&snap.db.aliveSnaps, -1) + snap.db = nil + snap.elem = nil } - p.mu.Unlock() } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_state.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_state.go index a13706142..d4db9d6dd 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_state.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_state.go @@ -8,106 +8,203 @@ package leveldb import ( "sync/atomic" + "time" "github.com/syndtr/goleveldb/leveldb/journal" "github.com/syndtr/goleveldb/leveldb/memdb" ) +type memDB struct { + db *DB + mdb *memdb.DB + ref int32 +} + +func (m *memDB) incref() { + atomic.AddInt32(&m.ref, 1) +} + +func (m *memDB) decref() { + if ref := atomic.AddInt32(&m.ref, -1); ref == 0 { + // Only put back memdb with std capacity. + if m.mdb.Capacity() == m.db.s.o.GetWriteBuffer() { + m.mdb.Reset() + m.db.mpoolPut(m.mdb) + } + m.db = nil + m.mdb = nil + } else if ref < 0 { + panic("negative memdb ref") + } +} + // Get latest sequence number. -func (d *DB) getSeq() uint64 { - return atomic.LoadUint64(&d.seq) +func (db *DB) getSeq() uint64 { + return atomic.LoadUint64(&db.seq) } // Atomically adds delta to seq. -func (d *DB) addSeq(delta uint64) { - atomic.AddUint64(&d.seq, delta) +func (db *DB) addSeq(delta uint64) { + atomic.AddUint64(&db.seq, delta) +} + +func (db *DB) sampleSeek(ikey iKey) { + v := db.s.version() + if v.sampleSeek(ikey) { + // Trigger table compaction. + db.compSendTrigger(db.tcompCmdC) + } + v.release() +} + +func (db *DB) mpoolPut(mem *memdb.DB) { + defer func() { + recover() + }() + select { + case db.memPool <- mem: + default: + } +} + +func (db *DB) mpoolGet() *memdb.DB { + select { + case mem := <-db.memPool: + return mem + default: + return nil + } +} + +func (db *DB) mpoolDrain() { + ticker := time.NewTicker(30 * time.Second) + for { + select { + case <-ticker.C: + select { + case <-db.memPool: + default: + } + case _, _ = <-db.closeC: + close(db.memPool) + return + } + } } // Create new memdb and froze the old one; need external synchronization. // newMem only called synchronously by the writer. -func (d *DB) newMem(n int) (mem *memdb.DB, err error) { - s := d.s - - num := s.allocFileNum() - file := s.getJournalFile(num) +func (db *DB) newMem(n int) (mem *memDB, err error) { + num := db.s.allocFileNum() + file := db.s.getJournalFile(num) w, err := file.Create() if err != nil { - s.reuseFileNum(num) + db.s.reuseFileNum(num) return } - d.memMu.Lock() - if d.journal == nil { - d.journal = journal.NewWriter(w) + + db.memMu.Lock() + defer db.memMu.Unlock() + + if db.frozenMem != nil { + panic("still has frozen mem") + } + + if db.journal == nil { + db.journal = journal.NewWriter(w) } else { - d.journal.Reset(w) - d.journalWriter.Close() - d.frozenJournalFile = d.journalFile - } - d.journalWriter = w - d.journalFile = file - d.frozenMem = d.mem - d.mem = memdb.New(s.icmp, maxInt(d.s.o.GetWriteBuffer(), n)) - mem = d.mem - // The seq only incremented by the writer. - d.frozenSeq = d.seq - d.memMu.Unlock() + db.journal.Reset(w) + db.journalWriter.Close() + db.frozenJournalFile = db.journalFile + } + db.journalWriter = w + db.journalFile = file + db.frozenMem = db.mem + mdb := db.mpoolGet() + if mdb == nil || mdb.Capacity() < n { + mdb = memdb.New(db.s.icmp, maxInt(db.s.o.GetWriteBuffer(), n)) + } + mem = &memDB{ + db: db, + mdb: mdb, + ref: 2, + } + db.mem = mem + // The seq only incremented by the writer. And whoever called newMem + // should hold write lock, so no need additional synchronization here. + db.frozenSeq = db.seq return } // Get all memdbs. -func (d *DB) getMems() (e *memdb.DB, f *memdb.DB) { - d.memMu.RLock() - defer d.memMu.RUnlock() - return d.mem, d.frozenMem +func (db *DB) getMems() (e, f *memDB) { + db.memMu.RLock() + defer db.memMu.RUnlock() + if db.mem == nil { + panic("nil effective mem") + } + db.mem.incref() + if db.frozenMem != nil { + db.frozenMem.incref() + } + return db.mem, db.frozenMem } // Get frozen memdb. -func (d *DB) getEffectiveMem() *memdb.DB { - d.memMu.RLock() - defer d.memMu.RUnlock() - return d.mem +func (db *DB) getEffectiveMem() *memDB { + db.memMu.RLock() + defer db.memMu.RUnlock() + if db.mem == nil { + panic("nil effective mem") + } + db.mem.incref() + return db.mem } // Check whether we has frozen memdb. -func (d *DB) hasFrozenMem() bool { - d.memMu.RLock() - defer d.memMu.RUnlock() - return d.frozenMem != nil +func (db *DB) hasFrozenMem() bool { + db.memMu.RLock() + defer db.memMu.RUnlock() + return db.frozenMem != nil } // Get frozen memdb. -func (d *DB) getFrozenMem() *memdb.DB { - d.memMu.RLock() - defer d.memMu.RUnlock() - return d.frozenMem +func (db *DB) getFrozenMem() *memDB { + db.memMu.RLock() + defer db.memMu.RUnlock() + if db.frozenMem != nil { + db.frozenMem.incref() + } + return db.frozenMem } // Drop frozen memdb; assume that frozen memdb isn't nil. -func (d *DB) dropFrozenMem() { - d.memMu.Lock() - if err := d.frozenJournalFile.Remove(); err != nil { - d.s.logf("journal@remove removing @%d %q", d.frozenJournalFile.Num(), err) +func (db *DB) dropFrozenMem() { + db.memMu.Lock() + if err := db.frozenJournalFile.Remove(); err != nil { + db.logf("journal@remove removing @%d %q", db.frozenJournalFile.Num(), err) } else { - d.s.logf("journal@remove removed @%d", d.frozenJournalFile.Num()) + db.logf("journal@remove removed @%d", db.frozenJournalFile.Num()) } - d.frozenJournalFile = nil - d.frozenMem = nil - d.memMu.Unlock() + db.frozenJournalFile = nil + db.frozenMem.decref() + db.frozenMem = nil + db.memMu.Unlock() } // Set closed flag; return true if not already closed. -func (d *DB) setClosed() bool { - return atomic.CompareAndSwapUint32(&d.closed, 0, 1) +func (db *DB) setClosed() bool { + return atomic.CompareAndSwapUint32(&db.closed, 0, 1) } // Check whether DB was closed. -func (d *DB) isClosed() bool { - return atomic.LoadUint32(&d.closed) != 0 +func (db *DB) isClosed() bool { + return atomic.LoadUint32(&db.closed) != 0 } // Check read ok status. -func (d *DB) ok() error { - if d.isClosed() { +func (db *DB) ok() error { + if db.isClosed() { return ErrClosed } return nil diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_test.go index 5de7d9723..38bfbf1ea 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_test.go @@ -7,6 +7,10 @@ package leveldb import ( + "bytes" + "container/list" + crand "crypto/rand" + "encoding/binary" "fmt" "math/rand" "os" @@ -20,6 +24,7 @@ import ( "unsafe" "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/filter" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/opt" @@ -148,25 +153,29 @@ func (h *dbHarness) maxNextLevelOverlappingBytes(want uint64) { t := h.t db := h.db - var res uint64 + var ( + maxOverlaps uint64 + maxLevel int + ) v := db.s.version() for i, tt := range v.tables[1 : len(v.tables)-1] { level := i + 1 next := v.tables[level+1] for _, t := range tt { - var r tFiles - min, max := t.min.ukey(), t.max.ukey() - next.getOverlaps(min, max, &r, true, db.s.icmp.ucmp) + r := next.getOverlaps(nil, db.s.icmp, t.imin.ukey(), t.imax.ukey(), false) sum := r.size() - if sum > res { - res = sum + if sum > maxOverlaps { + maxOverlaps = sum + maxLevel = level } } } v.release() - if res > want { - t.Errorf("next level overlapping bytes is more than %d, got=%d", want, res) + if maxOverlaps > want { + t.Errorf("next level most overlapping bytes is more than %d, got=%d level=%d", want, maxOverlaps, maxLevel) + } else { + t.Logf("next level most overlapping bytes is %d, level=%d want=%d", maxOverlaps, maxLevel, want) } } @@ -239,7 +248,7 @@ func (h *dbHarness) allEntriesFor(key, want string) { db := h.db s := db.s - ikey := newIKey([]byte(key), kMaxSeq, tVal) + ikey := newIkey([]byte(key), kMaxSeq, ktVal) iter := db.newRawIterator(nil, nil) if !iter.Seek(ikey) && iter.Error() != nil { t.Error("AllEntries: error during seek, err: ", iter.Error()) @@ -248,19 +257,18 @@ func (h *dbHarness) allEntriesFor(key, want string) { res := "[ " first := true for iter.Valid() { - rkey := iKey(iter.Key()) - if _, t, ok := rkey.parseNum(); ok { - if s.icmp.uCompare(ikey.ukey(), rkey.ukey()) != 0 { + if ukey, _, kt, kerr := parseIkey(iter.Key()); kerr == nil { + if s.icmp.uCompare(ikey.ukey(), ukey) != 0 { break } if !first { res += ", " } first = false - switch t { - case tVal: + switch kt { + case ktVal: res += string(iter.Value()) - case tDel: + case ktDel: res += "DEL" } } else { @@ -325,6 +333,8 @@ func (h *dbHarness) compactMem() { t := h.t db := h.db + t.Log("starting memdb compaction") + db.writeLockC <- struct{}{} defer func() { <-db.writeLockC @@ -340,6 +350,8 @@ func (h *dbHarness) compactMem() { if h.totalTables() == 0 { t.Error("zero tables after mem compaction") } + + t.Log("memdb compaction done") } func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool) { @@ -354,6 +366,8 @@ func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool) _max = []byte(max) } + t.Logf("starting table range compaction: level=%d, min=%q, max=%q", level, min, max) + if err := db.compSendRange(db.tcompCmdC, level, _min, _max); err != nil { if wanterr { t.Log("CompactRangeAt: got error (expected): ", err) @@ -363,6 +377,8 @@ func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool) } else if wanterr { t.Error("CompactRangeAt: expect error") } + + t.Log("table range compaction done") } func (h *dbHarness) compactRangeAt(level int, min, max string) { @@ -373,6 +389,8 @@ func (h *dbHarness) compactRange(min, max string) { t := h.t db := h.db + t.Logf("starting DB range compaction: min=%q, max=%q", min, max) + var r util.Range if min != "" { r.Start = []byte(min) @@ -383,21 +401,25 @@ func (h *dbHarness) compactRange(min, max string) { if err := db.CompactRange(r); err != nil { t.Error("CompactRange: got error: ", err) } -} -func (h *dbHarness) sizeAssert(start, limit string, low, hi uint64) { - t := h.t - db := h.db + t.Log("DB range compaction done") +} - s, err := db.SizeOf([]util.Range{ +func (h *dbHarness) sizeOf(start, limit string) uint64 { + sz, err := h.db.SizeOf([]util.Range{ {[]byte(start), []byte(limit)}, }) if err != nil { - t.Error("SizeOf: got error: ", err) + h.t.Error("SizeOf: got error: ", err) } - if s.Sum() < low || s.Sum() > hi { - t.Errorf("sizeof %q to %q not in range, want %d - %d, got %d", - shorten(start), shorten(limit), low, hi, s.Sum()) + return sz.Sum() +} + +func (h *dbHarness) sizeAssert(start, limit string, low, hi uint64) { + sz := h.sizeOf(start, limit) + if sz < low || sz > hi { + h.t.Errorf("sizeOf %q to %q not in range, want %d - %d, got %d", + shorten(start), shorten(limit), low, hi, sz) } } @@ -504,13 +526,13 @@ func Test_FieldsAligned(t *testing.T) { p1 := new(DB) testAligned(t, "DB.seq", unsafe.Offsetof(p1.seq)) p2 := new(session) - testAligned(t, "session.stFileNum", unsafe.Offsetof(p2.stFileNum)) + testAligned(t, "session.stNextFileNum", unsafe.Offsetof(p2.stNextFileNum)) testAligned(t, "session.stJournalNum", unsafe.Offsetof(p2.stJournalNum)) testAligned(t, "session.stPrevJournalNum", unsafe.Offsetof(p2.stPrevJournalNum)) - testAligned(t, "session.stSeq", unsafe.Offsetof(p2.stSeq)) + testAligned(t, "session.stSeqNum", unsafe.Offsetof(p2.stSeqNum)) } -func TestDb_Locking(t *testing.T) { +func TestDB_Locking(t *testing.T) { h := newDbHarness(t) defer h.stor.Close() h.openAssert(false) @@ -518,7 +540,7 @@ func TestDb_Locking(t *testing.T) { h.openAssert(true) } -func TestDb_Empty(t *testing.T) { +func TestDB_Empty(t *testing.T) { trun(t, func(h *dbHarness) { h.get("foo", false) @@ -527,7 +549,7 @@ func TestDb_Empty(t *testing.T) { }) } -func TestDb_ReadWrite(t *testing.T) { +func TestDB_ReadWrite(t *testing.T) { trun(t, func(h *dbHarness) { h.put("foo", "v1") h.getVal("foo", "v1") @@ -542,7 +564,7 @@ func TestDb_ReadWrite(t *testing.T) { }) } -func TestDb_PutDeleteGet(t *testing.T) { +func TestDB_PutDeleteGet(t *testing.T) { trun(t, func(h *dbHarness) { h.put("foo", "v1") h.getVal("foo", "v1") @@ -556,7 +578,7 @@ func TestDb_PutDeleteGet(t *testing.T) { }) } -func TestDb_EmptyBatch(t *testing.T) { +func TestDB_EmptyBatch(t *testing.T) { h := newDbHarness(t) defer h.close() @@ -568,7 +590,7 @@ func TestDb_EmptyBatch(t *testing.T) { h.get("foo", false) } -func TestDb_GetFromFrozen(t *testing.T) { +func TestDB_GetFromFrozen(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{WriteBuffer: 100100}) defer h.close() @@ -594,7 +616,7 @@ func TestDb_GetFromFrozen(t *testing.T) { h.get("k2", true) } -func TestDb_GetFromTable(t *testing.T) { +func TestDB_GetFromTable(t *testing.T) { trun(t, func(h *dbHarness) { h.put("foo", "v1") h.compactMem() @@ -602,7 +624,7 @@ func TestDb_GetFromTable(t *testing.T) { }) } -func TestDb_GetSnapshot(t *testing.T) { +func TestDB_GetSnapshot(t *testing.T) { trun(t, func(h *dbHarness) { bar := strings.Repeat("b", 200) h.put("foo", "v1") @@ -636,7 +658,7 @@ func TestDb_GetSnapshot(t *testing.T) { }) } -func TestDb_GetLevel0Ordering(t *testing.T) { +func TestDB_GetLevel0Ordering(t *testing.T) { trun(t, func(h *dbHarness) { for i := 0; i < 4; i++ { h.put("bar", fmt.Sprintf("b%d", i)) @@ -659,7 +681,7 @@ func TestDb_GetLevel0Ordering(t *testing.T) { }) } -func TestDb_GetOrderedByLevels(t *testing.T) { +func TestDB_GetOrderedByLevels(t *testing.T) { trun(t, func(h *dbHarness) { h.put("foo", "v1") h.compactMem() @@ -671,7 +693,7 @@ func TestDb_GetOrderedByLevels(t *testing.T) { }) } -func TestDb_GetPicksCorrectFile(t *testing.T) { +func TestDB_GetPicksCorrectFile(t *testing.T) { trun(t, func(h *dbHarness) { // Arrange to have multiple files in a non-level-0 level. h.put("a", "va") @@ -695,7 +717,7 @@ func TestDb_GetPicksCorrectFile(t *testing.T) { }) } -func TestDb_GetEncountersEmptyLevel(t *testing.T) { +func TestDB_GetEncountersEmptyLevel(t *testing.T) { trun(t, func(h *dbHarness) { // Arrange for the following to happen: // * sstable A in level 0 @@ -750,7 +772,7 @@ func TestDb_GetEncountersEmptyLevel(t *testing.T) { }) } -func TestDb_IterMultiWithDelete(t *testing.T) { +func TestDB_IterMultiWithDelete(t *testing.T) { trun(t, func(h *dbHarness) { h.put("a", "va") h.put("b", "vb") @@ -776,7 +798,7 @@ func TestDb_IterMultiWithDelete(t *testing.T) { }) } -func TestDb_IteratorPinsRef(t *testing.T) { +func TestDB_IteratorPinsRef(t *testing.T) { h := newDbHarness(t) defer h.close() @@ -800,7 +822,7 @@ func TestDb_IteratorPinsRef(t *testing.T) { iter.Release() } -func TestDb_Recover(t *testing.T) { +func TestDB_Recover(t *testing.T) { trun(t, func(h *dbHarness) { h.put("foo", "v1") h.put("baz", "v5") @@ -822,7 +844,7 @@ func TestDb_Recover(t *testing.T) { }) } -func TestDb_RecoverWithEmptyJournal(t *testing.T) { +func TestDB_RecoverWithEmptyJournal(t *testing.T) { trun(t, func(h *dbHarness) { h.put("foo", "v1") h.put("foo", "v2") @@ -836,7 +858,7 @@ func TestDb_RecoverWithEmptyJournal(t *testing.T) { }) } -func TestDb_RecoverDuringMemtableCompaction(t *testing.T) { +func TestDB_RecoverDuringMemtableCompaction(t *testing.T) { truno(t, &opt.Options{WriteBuffer: 1000000}, func(h *dbHarness) { h.stor.DelaySync(storage.TypeTable) @@ -852,7 +874,7 @@ func TestDb_RecoverDuringMemtableCompaction(t *testing.T) { }) } -func TestDb_MinorCompactionsHappen(t *testing.T) { +func TestDB_MinorCompactionsHappen(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{WriteBuffer: 10000}) defer h.close() @@ -876,7 +898,7 @@ func TestDb_MinorCompactionsHappen(t *testing.T) { } } -func TestDb_RecoverWithLargeJournal(t *testing.T) { +func TestDB_RecoverWithLargeJournal(t *testing.T) { h := newDbHarness(t) defer h.close() @@ -901,7 +923,7 @@ func TestDb_RecoverWithLargeJournal(t *testing.T) { v.release() } -func TestDb_CompactionsGenerateMultipleFiles(t *testing.T) { +func TestDB_CompactionsGenerateMultipleFiles(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{ WriteBuffer: 10000000, Compression: opt.NoCompression, @@ -939,11 +961,11 @@ func TestDb_CompactionsGenerateMultipleFiles(t *testing.T) { } } -func TestDb_RepeatedWritesToSameKey(t *testing.T) { +func TestDB_RepeatedWritesToSameKey(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{WriteBuffer: 100000}) defer h.close() - maxTables := kNumLevels + kL0_StopWritesTrigger + maxTables := h.o.GetNumLevel() + h.o.GetWriteL0PauseTrigger() value := strings.Repeat("v", 2*h.o.GetWriteBuffer()) for i := 0; i < 5*maxTables; i++ { @@ -955,13 +977,13 @@ func TestDb_RepeatedWritesToSameKey(t *testing.T) { } } -func TestDb_RepeatedWritesToSameKeyAfterReopen(t *testing.T) { +func TestDB_RepeatedWritesToSameKeyAfterReopen(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{WriteBuffer: 100000}) defer h.close() h.reopenDB() - maxTables := kNumLevels + kL0_StopWritesTrigger + maxTables := h.o.GetNumLevel() + h.o.GetWriteL0PauseTrigger() value := strings.Repeat("v", 2*h.o.GetWriteBuffer()) for i := 0; i < 5*maxTables; i++ { @@ -973,11 +995,11 @@ func TestDb_RepeatedWritesToSameKeyAfterReopen(t *testing.T) { } } -func TestDb_SparseMerge(t *testing.T) { +func TestDB_SparseMerge(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{Compression: opt.NoCompression}) defer h.close() - h.putMulti(kNumLevels, "A", "Z") + h.putMulti(h.o.GetNumLevel(), "A", "Z") // Suppose there is: // small amount of data with prefix A @@ -1001,6 +1023,7 @@ func TestDb_SparseMerge(t *testing.T) { h.put("C", "vc2") h.compactMem() + h.waitCompaction() h.maxNextLevelOverlappingBytes(20 * 1048576) h.compactRangeAt(0, "", "") h.waitCompaction() @@ -1010,7 +1033,7 @@ func TestDb_SparseMerge(t *testing.T) { h.maxNextLevelOverlappingBytes(20 * 1048576) } -func TestDb_SizeOf(t *testing.T) { +func TestDB_SizeOf(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{ Compression: opt.NoCompression, WriteBuffer: 10000000, @@ -1060,7 +1083,7 @@ func TestDb_SizeOf(t *testing.T) { } } -func TestDb_SizeOf_MixOfSmallAndLarge(t *testing.T) { +func TestDB_SizeOf_MixOfSmallAndLarge(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{Compression: opt.NoCompression}) defer h.close() @@ -1098,7 +1121,7 @@ func TestDb_SizeOf_MixOfSmallAndLarge(t *testing.T) { } } -func TestDb_Snapshot(t *testing.T) { +func TestDB_Snapshot(t *testing.T) { trun(t, func(h *dbHarness) { h.put("foo", "v1") s1 := h.getSnapshot() @@ -1127,13 +1150,51 @@ func TestDb_Snapshot(t *testing.T) { }) } -func TestDb_HiddenValuesAreRemoved(t *testing.T) { +func TestDB_SnapshotList(t *testing.T) { + db := &DB{snapsList: list.New()} + e0a := db.acquireSnapshot() + e0b := db.acquireSnapshot() + db.seq = 1 + e1 := db.acquireSnapshot() + db.seq = 2 + e2 := db.acquireSnapshot() + + if db.minSeq() != 0 { + t.Fatalf("invalid sequence number, got=%d", db.minSeq()) + } + db.releaseSnapshot(e0a) + if db.minSeq() != 0 { + t.Fatalf("invalid sequence number, got=%d", db.minSeq()) + } + db.releaseSnapshot(e2) + if db.minSeq() != 0 { + t.Fatalf("invalid sequence number, got=%d", db.minSeq()) + } + db.releaseSnapshot(e0b) + if db.minSeq() != 1 { + t.Fatalf("invalid sequence number, got=%d", db.minSeq()) + } + e2 = db.acquireSnapshot() + if db.minSeq() != 1 { + t.Fatalf("invalid sequence number, got=%d", db.minSeq()) + } + db.releaseSnapshot(e1) + if db.minSeq() != 2 { + t.Fatalf("invalid sequence number, got=%d", db.minSeq()) + } + db.releaseSnapshot(e2) + if db.minSeq() != 2 { + t.Fatalf("invalid sequence number, got=%d", db.minSeq()) + } +} + +func TestDB_HiddenValuesAreRemoved(t *testing.T) { trun(t, func(h *dbHarness) { s := h.db.s h.put("foo", "v1") h.compactMem() - m := kMaxMemCompactLevel + m := h.o.GetMaxMemCompationLevel() v := s.version() num := v.tLen(m) v.release() @@ -1170,14 +1231,14 @@ func TestDb_HiddenValuesAreRemoved(t *testing.T) { }) } -func TestDb_DeletionMarkers2(t *testing.T) { +func TestDB_DeletionMarkers2(t *testing.T) { h := newDbHarness(t) defer h.close() s := h.db.s h.put("foo", "v1") h.compactMem() - m := kMaxMemCompactLevel + m := h.o.GetMaxMemCompationLevel() v := s.version() num := v.tLen(m) v.release() @@ -1211,8 +1272,8 @@ func TestDb_DeletionMarkers2(t *testing.T) { h.allEntriesFor("foo", "[ ]") } -func TestDb_CompactionTableOpenError(t *testing.T) { - h := newDbHarnessWopt(t, &opt.Options{MaxOpenFiles: 0}) +func TestDB_CompactionTableOpenError(t *testing.T) { + h := newDbHarnessWopt(t, &opt.Options{OpenFilesCacheCapacity: -1}) defer h.close() im := 10 @@ -1230,14 +1291,14 @@ func TestDb_CompactionTableOpenError(t *testing.T) { t.Errorf("total tables is %d, want %d", n, im) } - h.stor.SetOpenErr(storage.TypeTable) + h.stor.SetEmuErr(storage.TypeTable, tsOpOpen) go h.db.CompactRange(util.Range{}) if err := h.db.compSendIdle(h.db.tcompCmdC); err != nil { t.Log("compaction error: ", err) } h.closeDB0() h.openDB() - h.stor.SetOpenErr(0) + h.stor.SetEmuErr(0, tsOpOpen) for i := 0; i < im; i++ { for j := 0; j < jm; j++ { @@ -1246,9 +1307,9 @@ func TestDb_CompactionTableOpenError(t *testing.T) { } } -func TestDb_OverlapInLevel0(t *testing.T) { +func TestDB_OverlapInLevel0(t *testing.T) { trun(t, func(h *dbHarness) { - if kMaxMemCompactLevel != 2 { + if h.o.GetMaxMemCompationLevel() != 2 { t.Fatal("fix test to reflect the config") } @@ -1289,7 +1350,7 @@ func TestDb_OverlapInLevel0(t *testing.T) { }) } -func TestDb_L0_CompactionBug_Issue44_a(t *testing.T) { +func TestDB_L0_CompactionBug_Issue44_a(t *testing.T) { h := newDbHarness(t) defer h.close() @@ -1309,7 +1370,7 @@ func TestDb_L0_CompactionBug_Issue44_a(t *testing.T) { h.getKeyVal("(a->v)") } -func TestDb_L0_CompactionBug_Issue44_b(t *testing.T) { +func TestDB_L0_CompactionBug_Issue44_b(t *testing.T) { h := newDbHarness(t) defer h.close() @@ -1338,7 +1399,7 @@ func TestDb_L0_CompactionBug_Issue44_b(t *testing.T) { h.getKeyVal("(->)(c->cv)") } -func TestDb_SingleEntryMemCompaction(t *testing.T) { +func TestDB_SingleEntryMemCompaction(t *testing.T) { trun(t, func(h *dbHarness) { for i := 0; i < 10; i++ { h.put("big", strings.Repeat("v", opt.DefaultWriteBuffer)) @@ -1355,7 +1416,7 @@ func TestDb_SingleEntryMemCompaction(t *testing.T) { }) } -func TestDb_ManifestWriteError(t *testing.T) { +func TestDB_ManifestWriteError(t *testing.T) { for i := 0; i < 2; i++ { func() { h := newDbHarness(t) @@ -1368,23 +1429,23 @@ func TestDb_ManifestWriteError(t *testing.T) { h.compactMem() h.getVal("foo", "bar") v := h.db.s.version() - if n := v.tLen(kMaxMemCompactLevel); n != 1 { + if n := v.tLen(h.o.GetMaxMemCompationLevel()); n != 1 { t.Errorf("invalid total tables, want=1 got=%d", n) } v.release() if i == 0 { - h.stor.SetWriteErr(storage.TypeManifest) + h.stor.SetEmuErr(storage.TypeManifest, tsOpWrite) } else { - h.stor.SetSyncErr(storage.TypeManifest) + h.stor.SetEmuErr(storage.TypeManifest, tsOpSync) } // Merging compaction (will fail) - h.compactRangeAtErr(kMaxMemCompactLevel, "", "", true) + h.compactRangeAtErr(h.o.GetMaxMemCompationLevel(), "", "", true) h.db.Close() - h.stor.SetWriteErr(0) - h.stor.SetSyncErr(0) + h.stor.SetEmuErr(0, tsOpWrite) + h.stor.SetEmuErr(0, tsOpSync) // Should not lose data h.openDB() @@ -1405,7 +1466,7 @@ func assertErr(t *testing.T, err error, wanterr bool) { } } -func TestDb_ClosedIsClosed(t *testing.T) { +func TestDB_ClosedIsClosed(t *testing.T) { h := newDbHarness(t) db := h.db @@ -1500,7 +1561,7 @@ func (p numberComparer) Compare(a, b []byte) int { func (numberComparer) Separator(dst, a, b []byte) []byte { return nil } func (numberComparer) Successor(dst, b []byte) []byte { return nil } -func TestDb_CustomComparer(t *testing.T) { +func TestDB_CustomComparer(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{ Comparer: numberComparer{}, WriteBuffer: 1000, @@ -1530,11 +1591,11 @@ func TestDb_CustomComparer(t *testing.T) { } } -func TestDb_ManualCompaction(t *testing.T) { +func TestDB_ManualCompaction(t *testing.T) { h := newDbHarness(t) defer h.close() - if kMaxMemCompactLevel != 2 { + if h.o.GetMaxMemCompationLevel() != 2 { t.Fatal("fix test to reflect the config") } @@ -1568,10 +1629,10 @@ func TestDb_ManualCompaction(t *testing.T) { h.tablesPerLevel("0,0,1") } -func TestDb_BloomFilter(t *testing.T) { +func TestDB_BloomFilter(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{ - BlockCache: opt.NoCache, - Filter: filter.NewBloomFilter(10), + DisableBlockCache: true, + Filter: filter.NewBloomFilter(10), }) defer h.close() @@ -1579,7 +1640,7 @@ func TestDb_BloomFilter(t *testing.T) { return fmt.Sprintf("key%06d", i) } - n := 10000 + const n = 10000 // Populate multiple layers for i := 0; i < n; i++ { @@ -1621,7 +1682,7 @@ func TestDb_BloomFilter(t *testing.T) { h.stor.ReleaseSync(storage.TypeTable) } -func TestDb_Concurrent(t *testing.T) { +func TestDB_Concurrent(t *testing.T) { const n, secs, maxkey = 4, 2, 1000 runtime.GOMAXPROCS(n) @@ -1686,7 +1747,7 @@ func TestDb_Concurrent(t *testing.T) { runtime.GOMAXPROCS(1) } -func TestDb_Concurrent2(t *testing.T) { +func TestDB_Concurrent2(t *testing.T) { const n, n2 = 4, 4000 runtime.GOMAXPROCS(n*2 + 2) @@ -1757,7 +1818,7 @@ func TestDb_Concurrent2(t *testing.T) { runtime.GOMAXPROCS(1) } -func TestDb_CreateReopenDbOnFile(t *testing.T) { +func TestDB_CreateReopenDbOnFile(t *testing.T) { dbpath := filepath.Join(os.TempDir(), fmt.Sprintf("goleveldbtestCreateReopenDbOnFile-%d", os.Getuid())) if err := os.RemoveAll(dbpath); err != nil { t.Fatal("cannot remove old db: ", err) @@ -1785,7 +1846,7 @@ func TestDb_CreateReopenDbOnFile(t *testing.T) { } } -func TestDb_CreateReopenDbOnFile2(t *testing.T) { +func TestDB_CreateReopenDbOnFile2(t *testing.T) { dbpath := filepath.Join(os.TempDir(), fmt.Sprintf("goleveldbtestCreateReopenDbOnFile2-%d", os.Getuid())) if err := os.RemoveAll(dbpath); err != nil { t.Fatal("cannot remove old db: ", err) @@ -1806,7 +1867,7 @@ func TestDb_CreateReopenDbOnFile2(t *testing.T) { } } -func TestDb_DeletionMarkersOnMemdb(t *testing.T) { +func TestDB_DeletionMarkersOnMemdb(t *testing.T) { h := newDbHarness(t) defer h.close() @@ -1817,8 +1878,8 @@ func TestDb_DeletionMarkersOnMemdb(t *testing.T) { h.getKeyVal("") } -func TestDb_LeveldbIssue178(t *testing.T) { - nKeys := (kMaxTableSize / 30) * 5 +func TestDB_LeveldbIssue178(t *testing.T) { + nKeys := (opt.DefaultCompactionTableSize / 30) * 5 key1 := func(i int) string { return fmt.Sprintf("my_key_%d", i) } @@ -1860,7 +1921,7 @@ func TestDb_LeveldbIssue178(t *testing.T) { h.assertNumKeys(nKeys) } -func TestDb_LeveldbIssue200(t *testing.T) { +func TestDB_LeveldbIssue200(t *testing.T) { h := newDbHarness(t) defer h.close() @@ -1886,3 +1947,719 @@ func TestDb_LeveldbIssue200(t *testing.T) { iter.Next() assertBytes(t, []byte("5"), iter.Key()) } + +func TestDB_GoleveldbIssue74(t *testing.T) { + h := newDbHarnessWopt(t, &opt.Options{ + WriteBuffer: 1 * opt.MiB, + }) + defer h.close() + + const n, dur = 10000, 5 * time.Second + + runtime.GOMAXPROCS(runtime.NumCPU()) + + until := time.Now().Add(dur) + wg := new(sync.WaitGroup) + wg.Add(2) + var done uint32 + go func() { + var i int + defer func() { + t.Logf("WRITER DONE #%d", i) + atomic.StoreUint32(&done, 1) + wg.Done() + }() + + b := new(Batch) + for ; time.Now().Before(until) && atomic.LoadUint32(&done) == 0; i++ { + iv := fmt.Sprintf("VAL%010d", i) + for k := 0; k < n; k++ { + key := fmt.Sprintf("KEY%06d", k) + b.Put([]byte(key), []byte(key+iv)) + b.Put([]byte(fmt.Sprintf("PTR%06d", k)), []byte(key)) + } + h.write(b) + + b.Reset() + snap := h.getSnapshot() + iter := snap.NewIterator(util.BytesPrefix([]byte("PTR")), nil) + var k int + for ; iter.Next(); k++ { + ptrKey := iter.Key() + key := iter.Value() + + if _, err := snap.Get(ptrKey, nil); err != nil { + t.Fatalf("WRITER #%d snapshot.Get %q: %v", i, ptrKey, err) + } + if value, err := snap.Get(key, nil); err != nil { + t.Fatalf("WRITER #%d snapshot.Get %q: %v", i, key, err) + } else if string(value) != string(key)+iv { + t.Fatalf("WRITER #%d snapshot.Get %q got invalid value, want %q got %q", i, key, string(key)+iv, value) + } + + b.Delete(key) + b.Delete(ptrKey) + } + h.write(b) + iter.Release() + snap.Release() + if k != n { + t.Fatalf("#%d %d != %d", i, k, n) + } + } + }() + go func() { + var i int + defer func() { + t.Logf("READER DONE #%d", i) + atomic.StoreUint32(&done, 1) + wg.Done() + }() + for ; time.Now().Before(until) && atomic.LoadUint32(&done) == 0; i++ { + snap := h.getSnapshot() + iter := snap.NewIterator(util.BytesPrefix([]byte("PTR")), nil) + var prevValue string + var k int + for ; iter.Next(); k++ { + ptrKey := iter.Key() + key := iter.Value() + + if _, err := snap.Get(ptrKey, nil); err != nil { + t.Fatalf("READER #%d snapshot.Get %q: %v", i, ptrKey, err) + } + + if value, err := snap.Get(key, nil); err != nil { + t.Fatalf("READER #%d snapshot.Get %q: %v", i, key, err) + } else if prevValue != "" && string(value) != string(key)+prevValue { + t.Fatalf("READER #%d snapshot.Get %q got invalid value, want %q got %q", i, key, string(key)+prevValue, value) + } else { + prevValue = string(value[len(key):]) + } + } + iter.Release() + snap.Release() + if k > 0 && k != n { + t.Fatalf("#%d %d != %d", i, k, n) + } + } + }() + wg.Wait() +} + +func TestDB_GetProperties(t *testing.T) { + h := newDbHarness(t) + defer h.close() + + _, err := h.db.GetProperty("leveldb.num-files-at-level") + if err == nil { + t.Error("GetProperty() failed to detect missing level") + } + + _, err = h.db.GetProperty("leveldb.num-files-at-level0") + if err != nil { + t.Error("got unexpected error", err) + } + + _, err = h.db.GetProperty("leveldb.num-files-at-level0x") + if err == nil { + t.Error("GetProperty() failed to detect invalid level") + } +} + +func TestDB_GoleveldbIssue72and83(t *testing.T) { + h := newDbHarnessWopt(t, &opt.Options{ + WriteBuffer: 1 * opt.MiB, + OpenFilesCacheCapacity: 3, + }) + defer h.close() + + const n, wn, dur = 10000, 100, 30 * time.Second + + runtime.GOMAXPROCS(runtime.NumCPU()) + + randomData := func(prefix byte, i int) []byte { + data := make([]byte, 1+4+32+64+32) + _, err := crand.Reader.Read(data[1 : len(data)-8]) + if err != nil { + panic(err) + } + data[0] = prefix + binary.LittleEndian.PutUint32(data[len(data)-8:], uint32(i)) + binary.LittleEndian.PutUint32(data[len(data)-4:], util.NewCRC(data[:len(data)-4]).Value()) + return data + } + + keys := make([][]byte, n) + for i := range keys { + keys[i] = randomData(1, 0) + } + + until := time.Now().Add(dur) + wg := new(sync.WaitGroup) + wg.Add(3) + var done uint32 + go func() { + i := 0 + defer func() { + t.Logf("WRITER DONE #%d", i) + wg.Done() + }() + + b := new(Batch) + for ; i < wn && atomic.LoadUint32(&done) == 0; i++ { + b.Reset() + for _, k1 := range keys { + k2 := randomData(2, i) + b.Put(k2, randomData(42, i)) + b.Put(k1, k2) + } + if err := h.db.Write(b, h.wo); err != nil { + atomic.StoreUint32(&done, 1) + t.Fatalf("WRITER #%d db.Write: %v", i, err) + } + } + }() + go func() { + var i int + defer func() { + t.Logf("READER0 DONE #%d", i) + atomic.StoreUint32(&done, 1) + wg.Done() + }() + for ; time.Now().Before(until) && atomic.LoadUint32(&done) == 0; i++ { + snap := h.getSnapshot() + seq := snap.elem.seq + if seq == 0 { + snap.Release() + continue + } + iter := snap.NewIterator(util.BytesPrefix([]byte{1}), nil) + writei := int(seq/(n*2) - 1) + var k int + for ; iter.Next(); k++ { + k1 := iter.Key() + k2 := iter.Value() + k1checksum0 := binary.LittleEndian.Uint32(k1[len(k1)-4:]) + k1checksum1 := util.NewCRC(k1[:len(k1)-4]).Value() + if k1checksum0 != k1checksum1 { + t.Fatalf("READER0 #%d.%d W#%d invalid K1 checksum: %#x != %#x", i, k, k1checksum0, k1checksum0) + } + k2checksum0 := binary.LittleEndian.Uint32(k2[len(k2)-4:]) + k2checksum1 := util.NewCRC(k2[:len(k2)-4]).Value() + if k2checksum0 != k2checksum1 { + t.Fatalf("READER0 #%d.%d W#%d invalid K2 checksum: %#x != %#x", i, k, k2checksum0, k2checksum1) + } + kwritei := int(binary.LittleEndian.Uint32(k2[len(k2)-8:])) + if writei != kwritei { + t.Fatalf("READER0 #%d.%d W#%d invalid write iteration num: %d", i, k, writei, kwritei) + } + if _, err := snap.Get(k2, nil); err != nil { + t.Fatalf("READER0 #%d.%d W#%d snap.Get: %v\nk1: %x\n -> k2: %x", i, k, writei, err, k1, k2) + } + } + if err := iter.Error(); err != nil { + t.Fatalf("READER0 #%d.%d W#%d snap.Iterator: %v", i, k, writei, err) + } + iter.Release() + snap.Release() + if k > 0 && k != n { + t.Fatalf("READER0 #%d W#%d short read, got=%d want=%d", i, writei, k, n) + } + } + }() + go func() { + var i int + defer func() { + t.Logf("READER1 DONE #%d", i) + atomic.StoreUint32(&done, 1) + wg.Done() + }() + for ; time.Now().Before(until) && atomic.LoadUint32(&done) == 0; i++ { + iter := h.db.NewIterator(nil, nil) + seq := iter.(*dbIter).seq + if seq == 0 { + iter.Release() + continue + } + writei := int(seq/(n*2) - 1) + var k int + for ok := iter.Last(); ok; ok = iter.Prev() { + k++ + } + if err := iter.Error(); err != nil { + t.Fatalf("READER1 #%d.%d W#%d db.Iterator: %v", i, k, writei, err) + } + iter.Release() + if m := (writei+1)*n + n; k != m { + t.Fatalf("READER1 #%d W#%d short read, got=%d want=%d", i, writei, k, m) + } + } + }() + + wg.Wait() +} + +func TestDB_TransientError(t *testing.T) { + h := newDbHarnessWopt(t, &opt.Options{ + WriteBuffer: 128 * opt.KiB, + OpenFilesCacheCapacity: 3, + DisableCompactionBackoff: true, + }) + defer h.close() + + const ( + nSnap = 20 + nKey = 10000 + ) + + var ( + snaps [nSnap]*Snapshot + b = &Batch{} + ) + for i := range snaps { + vtail := fmt.Sprintf("VAL%030d", i) + b.Reset() + for k := 0; k < nKey; k++ { + key := fmt.Sprintf("KEY%8d", k) + b.Put([]byte(key), []byte(key+vtail)) + } + h.stor.SetEmuRandErr(storage.TypeTable, tsOpOpen, tsOpRead, tsOpReadAt) + if err := h.db.Write(b, nil); err != nil { + t.Logf("WRITE #%d error: %v", i, err) + h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt, tsOpWrite) + for { + if err := h.db.Write(b, nil); err == nil { + break + } else if errors.IsCorrupted(err) { + t.Fatalf("WRITE #%d corrupted: %v", i, err) + } + } + } + + snaps[i] = h.db.newSnapshot() + b.Reset() + for k := 0; k < nKey; k++ { + key := fmt.Sprintf("KEY%8d", k) + b.Delete([]byte(key)) + } + h.stor.SetEmuRandErr(storage.TypeTable, tsOpOpen, tsOpRead, tsOpReadAt) + if err := h.db.Write(b, nil); err != nil { + t.Logf("WRITE #%d error: %v", i, err) + h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt) + for { + if err := h.db.Write(b, nil); err == nil { + break + } else if errors.IsCorrupted(err) { + t.Fatalf("WRITE #%d corrupted: %v", i, err) + } + } + } + } + h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt) + + runtime.GOMAXPROCS(runtime.NumCPU()) + + rnd := rand.New(rand.NewSource(0xecafdaed)) + wg := &sync.WaitGroup{} + for i, snap := range snaps { + wg.Add(2) + + go func(i int, snap *Snapshot, sk []int) { + defer wg.Done() + + vtail := fmt.Sprintf("VAL%030d", i) + for _, k := range sk { + key := fmt.Sprintf("KEY%8d", k) + xvalue, err := snap.Get([]byte(key), nil) + if err != nil { + t.Fatalf("READER_GET #%d SEQ=%d K%d error: %v", i, snap.elem.seq, k, err) + } + value := key + vtail + if !bytes.Equal([]byte(value), xvalue) { + t.Fatalf("READER_GET #%d SEQ=%d K%d invalid value: want %q, got %q", i, snap.elem.seq, k, value, xvalue) + } + } + }(i, snap, rnd.Perm(nKey)) + + go func(i int, snap *Snapshot) { + defer wg.Done() + + vtail := fmt.Sprintf("VAL%030d", i) + iter := snap.NewIterator(nil, nil) + defer iter.Release() + for k := 0; k < nKey; k++ { + if !iter.Next() { + if err := iter.Error(); err != nil { + t.Fatalf("READER_ITER #%d K%d error: %v", i, k, err) + } else { + t.Fatalf("READER_ITER #%d K%d eoi", i, k) + } + } + key := fmt.Sprintf("KEY%8d", k) + xkey := iter.Key() + if !bytes.Equal([]byte(key), xkey) { + t.Fatalf("READER_ITER #%d K%d invalid key: want %q, got %q", i, k, key, xkey) + } + value := key + vtail + xvalue := iter.Value() + if !bytes.Equal([]byte(value), xvalue) { + t.Fatalf("READER_ITER #%d K%d invalid value: want %q, got %q", i, k, value, xvalue) + } + } + }(i, snap) + } + + wg.Wait() +} + +func TestDB_UkeyShouldntHopAcrossTable(t *testing.T) { + h := newDbHarnessWopt(t, &opt.Options{ + WriteBuffer: 112 * opt.KiB, + CompactionTableSize: 90 * opt.KiB, + CompactionExpandLimitFactor: 1, + }) + defer h.close() + + const ( + nSnap = 190 + nKey = 140 + ) + + var ( + snaps [nSnap]*Snapshot + b = &Batch{} + ) + for i := range snaps { + vtail := fmt.Sprintf("VAL%030d", i) + b.Reset() + for k := 0; k < nKey; k++ { + key := fmt.Sprintf("KEY%08d", k) + b.Put([]byte(key), []byte(key+vtail)) + } + if err := h.db.Write(b, nil); err != nil { + t.Fatalf("WRITE #%d error: %v", i, err) + } + + snaps[i] = h.db.newSnapshot() + b.Reset() + for k := 0; k < nKey; k++ { + key := fmt.Sprintf("KEY%08d", k) + b.Delete([]byte(key)) + } + if err := h.db.Write(b, nil); err != nil { + t.Fatalf("WRITE #%d error: %v", i, err) + } + } + + h.compactMem() + + h.waitCompaction() + for level, tables := range h.db.s.stVersion.tables { + for _, table := range tables { + t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax) + } + } + + h.compactRangeAt(0, "", "") + h.waitCompaction() + for level, tables := range h.db.s.stVersion.tables { + for _, table := range tables { + t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax) + } + } + h.compactRangeAt(1, "", "") + h.waitCompaction() + for level, tables := range h.db.s.stVersion.tables { + for _, table := range tables { + t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax) + } + } + runtime.GOMAXPROCS(runtime.NumCPU()) + + wg := &sync.WaitGroup{} + for i, snap := range snaps { + wg.Add(1) + + go func(i int, snap *Snapshot) { + defer wg.Done() + + vtail := fmt.Sprintf("VAL%030d", i) + for k := 0; k < nKey; k++ { + key := fmt.Sprintf("KEY%08d", k) + xvalue, err := snap.Get([]byte(key), nil) + if err != nil { + t.Fatalf("READER_GET #%d SEQ=%d K%d error: %v", i, snap.elem.seq, k, err) + } + value := key + vtail + if !bytes.Equal([]byte(value), xvalue) { + t.Fatalf("READER_GET #%d SEQ=%d K%d invalid value: want %q, got %q", i, snap.elem.seq, k, value, xvalue) + } + } + }(i, snap) + } + + wg.Wait() +} + +func TestDB_TableCompactionBuilder(t *testing.T) { + stor := newTestStorage(t) + defer stor.Close() + + const nSeq = 99 + + o := &opt.Options{ + WriteBuffer: 112 * opt.KiB, + CompactionTableSize: 43 * opt.KiB, + CompactionExpandLimitFactor: 1, + CompactionGPOverlapsFactor: 1, + DisableBlockCache: true, + } + s, err := newSession(stor, o) + if err != nil { + t.Fatal(err) + } + if err := s.create(); err != nil { + t.Fatal(err) + } + defer s.close() + var ( + seq uint64 + targetSize = 5 * o.CompactionTableSize + value = bytes.Repeat([]byte{'0'}, 100) + ) + for i := 0; i < 2; i++ { + tw, err := s.tops.create() + if err != nil { + t.Fatal(err) + } + for k := 0; tw.tw.BytesLen() < targetSize; k++ { + key := []byte(fmt.Sprintf("%09d", k)) + seq += nSeq - 1 + for x := uint64(0); x < nSeq; x++ { + if err := tw.append(newIkey(key, seq-x, ktVal), value); err != nil { + t.Fatal(err) + } + } + } + tf, err := tw.finish() + if err != nil { + t.Fatal(err) + } + rec := &sessionRecord{numLevel: s.o.GetNumLevel()} + rec.addTableFile(i, tf) + if err := s.commit(rec); err != nil { + t.Fatal(err) + } + } + + // Build grandparent. + v := s.version() + c := newCompaction(s, v, 1, append(tFiles{}, v.tables[1]...)) + rec := &sessionRecord{numLevel: s.o.GetNumLevel()} + b := &tableCompactionBuilder{ + s: s, + c: c, + rec: rec, + stat1: new(cStatsStaging), + minSeq: 0, + strict: true, + tableSize: o.CompactionTableSize/3 + 961, + } + if err := b.run(new(compactionTransactCounter)); err != nil { + t.Fatal(err) + } + for _, t := range c.tables[0] { + rec.delTable(c.level, t.file.Num()) + } + if err := s.commit(rec); err != nil { + t.Fatal(err) + } + c.release() + + // Build level-1. + v = s.version() + c = newCompaction(s, v, 0, append(tFiles{}, v.tables[0]...)) + rec = &sessionRecord{numLevel: s.o.GetNumLevel()} + b = &tableCompactionBuilder{ + s: s, + c: c, + rec: rec, + stat1: new(cStatsStaging), + minSeq: 0, + strict: true, + tableSize: o.CompactionTableSize, + } + if err := b.run(new(compactionTransactCounter)); err != nil { + t.Fatal(err) + } + for _, t := range c.tables[0] { + rec.delTable(c.level, t.file.Num()) + } + // Move grandparent to level-3 + for _, t := range v.tables[2] { + rec.delTable(2, t.file.Num()) + rec.addTableFile(3, t) + } + if err := s.commit(rec); err != nil { + t.Fatal(err) + } + c.release() + + v = s.version() + for level, want := range []bool{false, true, false, true, false} { + got := len(v.tables[level]) > 0 + if want != got { + t.Fatalf("invalid level-%d tables len: want %v, got %v", level, want, got) + } + } + for i, f := range v.tables[1][:len(v.tables[1])-1] { + nf := v.tables[1][i+1] + if bytes.Equal(f.imax.ukey(), nf.imin.ukey()) { + t.Fatalf("KEY %q hop across table %d .. %d", f.imax.ukey(), f.file.Num(), nf.file.Num()) + } + } + v.release() + + // Compaction with transient error. + v = s.version() + c = newCompaction(s, v, 1, append(tFiles{}, v.tables[1]...)) + rec = &sessionRecord{numLevel: s.o.GetNumLevel()} + b = &tableCompactionBuilder{ + s: s, + c: c, + rec: rec, + stat1: new(cStatsStaging), + minSeq: 0, + strict: true, + tableSize: o.CompactionTableSize, + } + stor.SetEmuErrOnce(storage.TypeTable, tsOpSync) + stor.SetEmuRandErr(storage.TypeTable, tsOpRead, tsOpReadAt, tsOpWrite) + stor.SetEmuRandErrProb(0xf0) + for { + if err := b.run(new(compactionTransactCounter)); err != nil { + t.Logf("(expected) b.run: %v", err) + } else { + break + } + } + if err := s.commit(rec); err != nil { + t.Fatal(err) + } + c.release() + + stor.SetEmuErrOnce(0, tsOpSync) + stor.SetEmuRandErr(0, tsOpRead, tsOpReadAt, tsOpWrite) + + v = s.version() + if len(v.tables[1]) != len(v.tables[2]) { + t.Fatalf("invalid tables length, want %d, got %d", len(v.tables[1]), len(v.tables[2])) + } + for i, f0 := range v.tables[1] { + f1 := v.tables[2][i] + iter0 := s.tops.newIterator(f0, nil, nil) + iter1 := s.tops.newIterator(f1, nil, nil) + for j := 0; true; j++ { + next0 := iter0.Next() + next1 := iter1.Next() + if next0 != next1 { + t.Fatalf("#%d.%d invalid eoi: want %v, got %v", i, j, next0, next1) + } + key0 := iter0.Key() + key1 := iter1.Key() + if !bytes.Equal(key0, key1) { + t.Fatalf("#%d.%d invalid key: want %q, got %q", i, j, key0, key1) + } + if next0 == false { + break + } + } + iter0.Release() + iter1.Release() + } + v.release() +} + +func testDB_IterTriggeredCompaction(t *testing.T, limitDiv int) { + const ( + vSize = 200 * opt.KiB + tSize = 100 * opt.MiB + mIter = 100 + n = tSize / vSize + ) + + h := newDbHarnessWopt(t, &opt.Options{ + Compression: opt.NoCompression, + DisableBlockCache: true, + }) + defer h.close() + + key := func(x int) string { + return fmt.Sprintf("v%06d", x) + } + + // Fill. + value := strings.Repeat("x", vSize) + for i := 0; i < n; i++ { + h.put(key(i), value) + } + h.compactMem() + + // Delete all. + for i := 0; i < n; i++ { + h.delete(key(i)) + } + h.compactMem() + + var ( + limit = n / limitDiv + + startKey = key(0) + limitKey = key(limit) + maxKey = key(n) + slice = &util.Range{Limit: []byte(limitKey)} + + initialSize0 = h.sizeOf(startKey, limitKey) + initialSize1 = h.sizeOf(limitKey, maxKey) + ) + + t.Logf("inital size %s [rest %s]", shortenb(int(initialSize0)), shortenb(int(initialSize1))) + + for r := 0; true; r++ { + if r >= mIter { + t.Fatal("taking too long to compact") + } + + // Iterates. + iter := h.db.NewIterator(slice, h.ro) + for iter.Next() { + } + if err := iter.Error(); err != nil { + t.Fatalf("Iter err: %v", err) + } + iter.Release() + + // Wait compaction. + h.waitCompaction() + + // Check size. + size0 := h.sizeOf(startKey, limitKey) + size1 := h.sizeOf(limitKey, maxKey) + t.Logf("#%03d size %s [rest %s]", r, shortenb(int(size0)), shortenb(int(size1))) + if size0 < initialSize0/10 { + break + } + } + + if initialSize1 > 0 { + h.sizeAssert(limitKey, maxKey, initialSize1/4-opt.MiB, initialSize1+opt.MiB) + } +} + +func TestDB_IterTriggeredCompaction(t *testing.T) { + testDB_IterTriggeredCompaction(t, 1) +} + +func TestDB_IterTriggeredCompactionHalf(t *testing.T) { + testDB_IterTriggeredCompaction(t, 2) +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_util.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_util.go index 2db007e27..a8a2bdf72 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_util.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_util.go @@ -7,8 +7,7 @@ package leveldb import ( - "errors" - + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/opt" "github.com/syndtr/goleveldb/leveldb/storage" @@ -32,40 +31,44 @@ func (p Sizes) Sum() (n uint64) { return n } +// Logging. +func (db *DB) log(v ...interface{}) { db.s.log(v...) } +func (db *DB) logf(format string, v ...interface{}) { db.s.logf(format, v...) } + // Check and clean files. -func (d *DB) checkAndCleanFiles() error { - s := d.s +func (db *DB) checkAndCleanFiles() error { + v := db.s.version() + defer v.release() - v := s.version_NB() - tables := make(map[uint64]bool) - for _, tt := range v.tables { - for _, t := range tt { - tables[t.file.Num()] = false + tablesMap := make(map[uint64]bool) + for _, tables := range v.tables { + for _, t := range tables { + tablesMap[t.file.Num()] = false } } - ff, err := s.getFiles(storage.TypeAll) + files, err := db.s.getFiles(storage.TypeAll) if err != nil { return err } var nTables int var rem []storage.File - for _, f := range ff { + for _, f := range files { keep := true switch f.Type() { case storage.TypeManifest: - keep = f.Num() >= s.manifestFile.Num() + keep = f.Num() >= db.s.manifestFile.Num() case storage.TypeJournal: - if d.frozenJournalFile != nil { - keep = f.Num() >= d.frozenJournalFile.Num() + if db.frozenJournalFile != nil { + keep = f.Num() >= db.frozenJournalFile.Num() } else { - keep = f.Num() >= d.journalFile.Num() + keep = f.Num() >= db.journalFile.Num() } case storage.TypeTable: - _, keep = tables[f.Num()] + _, keep = tablesMap[f.Num()] if keep { - tables[f.Num()] = true + tablesMap[f.Num()] = true nTables++ } } @@ -75,18 +78,20 @@ func (d *DB) checkAndCleanFiles() error { } } - if nTables != len(tables) { - for num, present := range tables { + if nTables != len(tablesMap) { + var missing []*storage.FileInfo + for num, present := range tablesMap { if !present { - s.logf("db@janitor table missing @%d", num) + missing = append(missing, &storage.FileInfo{Type: storage.TypeTable, Num: num}) + db.logf("db@janitor table missing @%d", num) } } - return ErrCorrupted{Type: MissingFiles, Err: errors.New("leveldb: table files missing")} + return errors.NewErrCorrupted(nil, &errors.ErrMissingFiles{Files: missing}) } - s.logf("db@janitor F·%d G·%d", len(ff), len(rem)) + db.logf("db@janitor F·%d G·%d", len(files), len(rem)) for _, f := range rem { - s.logf("db@janitor removing %s-%d", f.Type(), f.Num()) + db.logf("db@janitor removing %s-%d", f.Type(), f.Num()) if err := f.Remove(); err != nil { return err } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_write.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_write.go index 4660e840c..e1cf30c53 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_write.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_write.go @@ -14,84 +14,93 @@ import ( "github.com/syndtr/goleveldb/leveldb/util" ) -func (d *DB) writeJournal(b *Batch) error { - w, err := d.journal.Next() +func (db *DB) writeJournal(b *Batch) error { + w, err := db.journal.Next() if err != nil { return err } if _, err := w.Write(b.encode()); err != nil { return err } - if err := d.journal.Flush(); err != nil { + if err := db.journal.Flush(); err != nil { return err } if b.sync { - return d.journalWriter.Sync() + return db.journalWriter.Sync() } return nil } -func (d *DB) jWriter() { - defer d.closeW.Done() +func (db *DB) jWriter() { + defer db.closeW.Done() for { select { - case b := <-d.journalC: + case b := <-db.journalC: if b != nil { - d.journalAckC <- d.writeJournal(b) + db.journalAckC <- db.writeJournal(b) } - case _, _ = <-d.closeC: + case _, _ = <-db.closeC: return } } } -func (d *DB) rotateMem(n int) (mem *memdb.DB, err error) { +func (db *DB) rotateMem(n int) (mem *memDB, err error) { // Wait for pending memdb compaction. - err = d.compSendIdle(d.mcompCmdC) + err = db.compSendIdle(db.mcompCmdC) if err != nil { return } // Create new memdb and journal. - mem, err = d.newMem(n) + mem, err = db.newMem(n) if err != nil { return } // Schedule memdb compaction. - d.compTrigger(d.mcompTriggerC) + db.compSendTrigger(db.mcompCmdC) return } -func (d *DB) flush(n int) (mem *memdb.DB, nn int, err error) { - s := d.s - +func (db *DB) flush(n int) (mem *memDB, nn int, err error) { delayed := false - flush := func() bool { - v := s.version() + flush := func() (retry bool) { + v := db.s.version() defer v.release() - mem = d.getEffectiveMem() - nn = mem.Free() + mem = db.getEffectiveMem() + defer func() { + if retry { + mem.decref() + mem = nil + } + }() + nn = mem.mdb.Free() switch { - case v.tLen(0) >= kL0_SlowdownWritesTrigger && !delayed: + case v.tLen(0) >= db.s.o.GetWriteL0SlowdownTrigger() && !delayed: delayed = true time.Sleep(time.Millisecond) case nn >= n: return false - case v.tLen(0) >= kL0_StopWritesTrigger: + case v.tLen(0) >= db.s.o.GetWriteL0PauseTrigger(): delayed = true - err = d.compSendIdle(d.tcompCmdC) + err = db.compSendIdle(db.tcompCmdC) if err != nil { return false } default: // Allow memdb to grow if it has no entry. - if mem.Len() == 0 { + if mem.mdb.Len() == 0 { nn = n - return false + } else { + mem.decref() + mem, err = db.rotateMem(n) + if err == nil { + nn = mem.mdb.Free() + } else { + nn = 0 + } } - mem, err = d.rotateMem(n) - nn = mem.Free() return false } return true @@ -100,7 +109,12 @@ func (d *DB) flush(n int) (mem *memdb.DB, nn int, err error) { for flush() { } if delayed { - s.logf("db@write delayed T·%v", time.Since(start)) + db.writeDelay += time.Since(start) + db.writeDelayN++ + } else if db.writeDelayN > 0 { + db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay) + db.writeDelay = 0 + db.writeDelayN = 0 } return } @@ -109,39 +123,45 @@ func (d *DB) flush(n int) (mem *memdb.DB, nn int, err error) { // sequentially. // // It is safe to modify the contents of the arguments after Write returns. -func (d *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) { - err = d.ok() - if err != nil || b == nil || b.len() == 0 { +func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) { + err = db.ok() + if err != nil || b == nil || b.Len() == 0 { return } b.init(wo.GetSync()) // The write happen synchronously. -retry: select { - case d.writeC <- b: - if <-d.writeMergedC { - return <-d.writeAckC + case db.writeC <- b: + if <-db.writeMergedC { + return <-db.writeAckC } - goto retry - case d.writeLockC <- struct{}{}: - case _, _ = <-d.closeC: + case db.writeLockC <- struct{}{}: + case err = <-db.compPerErrC: + return + case _, _ = <-db.closeC: return ErrClosed } merged := 0 + danglingMerge := false defer func() { - <-d.writeLockC + if danglingMerge { + db.writeMergedC <- false + } else { + <-db.writeLockC + } for i := 0; i < merged; i++ { - d.writeAckC <- err + db.writeAckC <- err } }() - mem, memFree, err := d.flush(b.size()) + mem, memFree, err := db.flush(b.size()) if err != nil { return } + defer mem.decref() // Calculate maximum size of the batch. m := 1 << 20 @@ -154,13 +174,13 @@ retry: drain: for b.size() < m && !b.sync { select { - case nb := <-d.writeC: + case nb := <-db.writeC: if b.size()+nb.size() <= m { b.append(nb) - d.writeMergedC <- true + db.writeMergedC <- true merged++ } else { - d.writeMergedC <- false + danglingMerge = true break drain } default: @@ -169,44 +189,52 @@ drain: } // Set batch first seq number relative from last seq. - b.seq = d.seq + 1 + b.seq = db.seq + 1 // Write journal concurrently if it is large enough. if b.size() >= (128 << 10) { // Push the write batch to the journal writer select { - case _, _ = <-d.closeC: + case db.journalC <- b: + // Write into memdb + if berr := b.memReplay(mem.mdb); berr != nil { + panic(berr) + } + case err = <-db.compPerErrC: + return + case _, _ = <-db.closeC: err = ErrClosed return - case d.journalC <- b: - // Write into memdb - b.memReplay(mem) } // Wait for journal writer select { - case _, _ = <-d.closeC: - err = ErrClosed - return - case err = <-d.journalAckC: + case err = <-db.journalAckC: if err != nil { // Revert memdb if error detected - b.revertMemReplay(mem) + if berr := b.revertMemReplay(mem.mdb); berr != nil { + panic(berr) + } return } + case _, _ = <-db.closeC: + err = ErrClosed + return } } else { - err = d.writeJournal(b) + err = db.writeJournal(b) if err != nil { return } - b.memReplay(mem) + if berr := b.memReplay(mem.mdb); berr != nil { + panic(berr) + } } // Set last seq number. - d.addSeq(uint64(b.len())) + db.addSeq(uint64(b.Len())) if b.size() >= memFree { - d.rotateMem(0) + db.rotateMem(0) } return } @@ -215,20 +243,20 @@ drain: // for that key; a DB is not a multi-map. // // It is safe to modify the contents of the arguments after Put returns. -func (d *DB) Put(key, value []byte, wo *opt.WriteOptions) error { +func (db *DB) Put(key, value []byte, wo *opt.WriteOptions) error { b := new(Batch) b.Put(key, value) - return d.Write(b, wo) + return db.Write(b, wo) } // Delete deletes the value for the given key. It returns ErrNotFound if // the DB does not contain the key. // // It is safe to modify the contents of the arguments after Delete returns. -func (d *DB) Delete(key []byte, wo *opt.WriteOptions) error { +func (db *DB) Delete(key []byte, wo *opt.WriteOptions) error { b := new(Batch) b.Delete(key) - return d.Write(b, wo) + return db.Write(b, wo) } func isMemOverlaps(icmp *iComparer, mem *memdb.DB, min, max []byte) bool { @@ -247,33 +275,37 @@ func isMemOverlaps(icmp *iComparer, mem *memdb.DB, min, max []byte) bool { // A nil Range.Start is treated as a key before all keys in the DB. // And a nil Range.Limit is treated as a key after all keys in the DB. // Therefore if both is nil then it will compact entire DB. -func (d *DB) CompactRange(r util.Range) error { - if err := d.ok(); err != nil { +func (db *DB) CompactRange(r util.Range) error { + if err := db.ok(); err != nil { return err } + // Lock writer. select { - case d.writeLockC <- struct{}{}: - case _, _ = <-d.closeC: + case db.writeLockC <- struct{}{}: + case err := <-db.compPerErrC: + return err + case _, _ = <-db.closeC: return ErrClosed } // Check for overlaps in memdb. - mem := d.getEffectiveMem() - if isMemOverlaps(d.s.icmp, mem, r.Start, r.Limit) { + mem := db.getEffectiveMem() + defer mem.decref() + if isMemOverlaps(db.s.icmp, mem.mdb, r.Start, r.Limit) { // Memdb compaction. - if _, err := d.rotateMem(0); err != nil { - <-d.writeLockC + if _, err := db.rotateMem(0); err != nil { + <-db.writeLockC return err } - <-d.writeLockC - if err := d.compSendIdle(d.mcompCmdC); err != nil { + <-db.writeLockC + if err := db.compSendIdle(db.mcompCmdC); err != nil { return err } } else { - <-d.writeLockC + <-db.writeLockC } // Table compaction. - return d.compSendRange(d.tcompCmdC, -1, r.Start, r.Limit) + return db.compSendRange(db.tcompCmdC, -1, r.Start, r.Limit) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/doc.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/doc.go index ac9ea3d0c..53f13bb24 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/doc.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/doc.go @@ -37,6 +37,16 @@ // err = iter.Error() // ... // +// Iterate over subset of database content with a particular prefix: +// iter := db.NewIterator(util.BytesPrefix([]byte("foo-")), nil) +// for iter.Next() { +// // Use key/value. +// ... +// } +// iter.Release() +// err = iter.Error() +// ... +// // Seek-then-Iterate: // // iter := db.NewIterator(nil, nil) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/error.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/error.go deleted file mode 100644 index 8066bd9a4..000000000 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/error.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2014, Suryandaru Triandana -// All rights reserved. -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -package leveldb - -import ( - "errors" - - "github.com/syndtr/goleveldb/leveldb/util" -) - -var ( - ErrNotFound = util.ErrNotFound - ErrSnapshotReleased = errors.New("leveldb: snapshot released") - ErrIterReleased = errors.New("leveldb: iterator released") - ErrClosed = errors.New("leveldb: closed") -) - -type CorruptionType int - -const ( - CorruptedManifest CorruptionType = iota - MissingFiles -) - -// ErrCorrupted is the type that wraps errors that indicate corruption in -// the database. -type ErrCorrupted struct { - Type CorruptionType - Err error -} - -func (e ErrCorrupted) Error() string { - return e.Err.Error() -} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors.go new file mode 100644 index 000000000..29d0d2f27 --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors.go @@ -0,0 +1,18 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/errors" +) + +var ( + ErrNotFound = errors.ErrNotFound + ErrSnapshotReleased = errors.New("leveldb: snapshot released") + ErrIterReleased = errors.New("leveldb: iterator released") + ErrClosed = errors.New("leveldb: closed") +) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors/errors.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors/errors.go new file mode 100644 index 000000000..84b5d6b7b --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors/errors.go @@ -0,0 +1,76 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package errors provides common error types used throughout leveldb. +package errors + +import ( + "errors" + "fmt" + + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" +) + +var ( + ErrNotFound = New("leveldb: not found") + ErrReleased = util.ErrReleased + ErrHasReleaser = util.ErrHasReleaser +) + +// New returns an error that formats as the given text. +func New(text string) error { + return errors.New(text) +} + +// ErrCorrupted is the type that wraps errors that indicate corruption in +// the database. +type ErrCorrupted struct { + File *storage.FileInfo + Err error +} + +func (e *ErrCorrupted) Error() string { + if e.File != nil { + return fmt.Sprintf("%v [file=%v]", e.Err, e.File) + } else { + return e.Err.Error() + } +} + +// NewErrCorrupted creates new ErrCorrupted error. +func NewErrCorrupted(f storage.File, err error) error { + return &ErrCorrupted{storage.NewFileInfo(f), err} +} + +// IsCorrupted returns a boolean indicating whether the error is indicating +// a corruption. +func IsCorrupted(err error) bool { + switch err.(type) { + case *ErrCorrupted: + return true + } + return false +} + +// ErrMissingFiles is the type that indicating a corruption due to missing +// files. +type ErrMissingFiles struct { + Files []*storage.FileInfo +} + +func (e *ErrMissingFiles) Error() string { return "file missing" } + +// SetFile sets 'file info' of the given error with the given file. +// Currently only ErrCorrupted is supported, otherwise will do nothing. +func SetFile(err error, f storage.File) error { + switch x := err.(type) { + case *ErrCorrupted: + x.File = storage.NewFileInfo(f) + return x + } + return err +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/external_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/external_test.go index d7dff04b6..b328ece4e 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/external_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/external_test.go @@ -17,13 +17,14 @@ import ( var _ = testutil.Defer(func() { Describe("Leveldb external", func() { o := &opt.Options{ - BlockCache: opt.NoCache, - BlockRestartInterval: 5, - BlockSize: 50, - Compression: opt.NoCompression, - MaxOpenFiles: 0, - Strict: opt.StrictAll, - WriteBuffer: 1000, + DisableBlockCache: true, + BlockRestartInterval: 5, + BlockSize: 80, + Compression: opt.NoCompression, + OpenFilesCacheCapacity: -1, + Strict: opt.StrictAll, + WriteBuffer: 1000, + CompactionTableSize: 2000, } Describe("write test", func() { @@ -36,22 +37,21 @@ var _ = testutil.Defer(func() { testutil.DoDBTesting(&t) db.TestClose() done <- true - }, 9.0) + }, 20.0) }) Describe("read test", func() { - testutil.AllKeyValueTesting(nil, func(kv testutil.KeyValue) testutil.DB { + testutil.AllKeyValueTesting(nil, nil, func(kv testutil.KeyValue) testutil.DB { // Building the DB. db := newTestingDB(o, nil, nil) kv.IterateShuffled(nil, func(i int, key, value []byte) { err := db.TestPut(key, value) Expect(err).NotTo(HaveOccurred()) }) - testutil.Defer("teardown", func() { - db.TestClose() - }) return db + }, func(db testutil.DB) { + db.(*testingDB).TestClose() }) }) }) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go index 9b4b72741..a23ab05f7 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go @@ -40,13 +40,19 @@ type basicArrayIterator struct { util.BasicReleaser array BasicArray pos int + err error } func (i *basicArrayIterator) Valid() bool { - return i.pos >= 0 && i.pos < i.array.Len() + return i.pos >= 0 && i.pos < i.array.Len() && !i.Released() } func (i *basicArrayIterator) First() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + if i.array.Len() == 0 { i.pos = -1 return false @@ -56,6 +62,11 @@ func (i *basicArrayIterator) First() bool { } func (i *basicArrayIterator) Last() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + n := i.array.Len() if n == 0 { i.pos = 0 @@ -66,6 +77,11 @@ func (i *basicArrayIterator) Last() bool { } func (i *basicArrayIterator) Seek(key []byte) bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + n := i.array.Len() if n == 0 { i.pos = 0 @@ -79,6 +95,11 @@ func (i *basicArrayIterator) Seek(key []byte) bool { } func (i *basicArrayIterator) Next() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + i.pos++ if n := i.array.Len(); i.pos >= n { i.pos = n @@ -88,6 +109,11 @@ func (i *basicArrayIterator) Next() bool { } func (i *basicArrayIterator) Prev() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + i.pos-- if i.pos < 0 { i.pos = -1 @@ -96,7 +122,7 @@ func (i *basicArrayIterator) Prev() bool { return true } -func (i *basicArrayIterator) Error() error { return nil } +func (i *basicArrayIterator) Error() error { return i.err } type arrayIterator struct { basicArrayIterator diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go index 1e99a2bf6..939adbb93 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go @@ -7,6 +7,7 @@ package iterator import ( + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/util" ) @@ -22,13 +23,13 @@ type IteratorIndexer interface { type indexedIterator struct { util.BasicReleaser - index IteratorIndexer - strict bool - strictGet bool + index IteratorIndexer + strict bool - data Iterator - err error - errf func(err error) + data Iterator + err error + errf func(err error) + closed bool } func (i *indexedIterator) setData() { @@ -36,11 +37,6 @@ func (i *indexedIterator) setData() { i.data.Release() } i.data = i.index.Get() - if i.strictGet { - if err := i.data.Error(); err != nil { - i.err = err - } - } } func (i *indexedIterator) clearData() { @@ -50,14 +46,21 @@ func (i *indexedIterator) clearData() { i.data = nil } -func (i *indexedIterator) dataErr() bool { - if i.errf != nil { - if err := i.data.Error(); err != nil { +func (i *indexedIterator) indexErr() { + if err := i.index.Error(); err != nil { + if i.errf != nil { i.errf(err) } + i.err = err } - if i.strict { - if err := i.data.Error(); err != nil { +} + +func (i *indexedIterator) dataErr() bool { + if err := i.data.Error(); err != nil { + if i.errf != nil { + i.errf(err) + } + if i.strict || !errors.IsCorrupted(err) { i.err = err return true } @@ -72,9 +75,13 @@ func (i *indexedIterator) Valid() bool { func (i *indexedIterator) First() bool { if i.err != nil { return false + } else if i.Released() { + i.err = ErrIterReleased + return false } if !i.index.First() { + i.indexErr() i.clearData() return false } @@ -85,9 +92,13 @@ func (i *indexedIterator) First() bool { func (i *indexedIterator) Last() bool { if i.err != nil { return false + } else if i.Released() { + i.err = ErrIterReleased + return false } if !i.index.Last() { + i.indexErr() i.clearData() return false } @@ -105,9 +116,13 @@ func (i *indexedIterator) Last() bool { func (i *indexedIterator) Seek(key []byte) bool { if i.err != nil { return false + } else if i.Released() { + i.err = ErrIterReleased + return false } if !i.index.Seek(key) { + i.indexErr() i.clearData() return false } @@ -125,6 +140,9 @@ func (i *indexedIterator) Seek(key []byte) bool { func (i *indexedIterator) Next() bool { if i.err != nil { return false + } else if i.Released() { + i.err = ErrIterReleased + return false } switch { @@ -136,6 +154,7 @@ func (i *indexedIterator) Next() bool { fallthrough case i.data == nil: if !i.index.Next() { + i.indexErr() return false } i.setData() @@ -147,6 +166,9 @@ func (i *indexedIterator) Next() bool { func (i *indexedIterator) Prev() bool { if i.err != nil { return false + } else if i.Released() { + i.err = ErrIterReleased + return false } switch { @@ -158,6 +180,7 @@ func (i *indexedIterator) Prev() bool { fallthrough case i.data == nil: if !i.index.Prev() { + i.indexErr() return false } i.setData() @@ -206,16 +229,14 @@ func (i *indexedIterator) SetErrorCallback(f func(err error)) { i.errf = f } -// NewIndexedIterator returns an indexed iterator. An index is iterator -// that returns another iterator, a data iterator. A data iterator is the +// NewIndexedIterator returns an 'indexed iterator'. An index is iterator +// that returns another iterator, a 'data iterator'. A 'data iterator' is the // iterator that contains actual key/value pairs. // -// If strict is true then error yield by data iterator will halt the indexed -// iterator, on contrary if strict is false then the indexed iterator will -// ignore those error and move on to the next index. If strictGet is true and -// index.Get() yield an 'error iterator' then the indexed iterator will be halted. -// An 'error iterator' is iterator which its Error() method always return non-nil -// even before any 'seeks method' is called. -func NewIndexedIterator(index IteratorIndexer, strict, strictGet bool) Iterator { - return &indexedIterator{index: index, strict: strict, strictGet: strictGet} +// If strict is true the any 'corruption errors' (i.e errors.IsCorrupted(err) == true) +// won't be ignored and will halt 'indexed iterator', otherwise the iterator will +// continue to the next 'data iterator'. Corruption on 'index iterator' will not be +// ignored and will halt the iterator. +func NewIndexedIterator(index IteratorIndexer, strict bool) Iterator { + return &indexedIterator{index: index, strict: strict} } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter_test.go index 6a89b3830..72a797892 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter_test.go @@ -65,7 +65,7 @@ var _ = testutil.Defer(func() { // Test the iterator. t := testutil.IteratorTesting{ KeyValue: kv.Clone(), - Iter: NewIndexedIterator(NewArrayIndexer(index), true, true), + Iter: NewIndexedIterator(NewArrayIndexer(index), true), } testutil.DoIteratorTesting(&t) done <- true diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/iter.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/iter.go index 1b80184e8..c2522860b 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/iter.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/iter.go @@ -14,6 +14,10 @@ import ( "github.com/syndtr/goleveldb/leveldb/util" ) +var ( + ErrIterReleased = errors.New("leveldb/iterator: iterator released") +) + // IteratorSeeker is the interface that wraps the 'seeks method'. type IteratorSeeker interface { // First moves the iterator to the first key/value pair. If the iterator @@ -100,28 +104,13 @@ type ErrorCallbackSetter interface { } type emptyIterator struct { - releaser util.Releaser - released bool - err error + util.BasicReleaser + err error } func (i *emptyIterator) rErr() { - if i.err == nil && i.released { - i.err = errors.New("leveldb/iterator: iterator released") - } -} - -func (i *emptyIterator) Release() { - if i.releaser != nil { - i.releaser.Release() - i.releaser = nil - } - i.released = true -} - -func (i *emptyIterator) SetReleaser(releaser util.Releaser) { - if !i.released { - i.releaser = releaser + if i.err == nil && i.Released() { + i.err = ErrIterReleased } } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/iter_suite_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/iter_suite_test.go index 7ec2fc6f2..5ef8d5baf 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/iter_suite_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/iter_suite_test.go @@ -3,15 +3,9 @@ package iterator_test import ( "testing" - . "github.com/onsi/ginkgo" - . "github.com/onsi/gomega" - "github.com/syndtr/goleveldb/leveldb/testutil" ) func TestIterator(t *testing.T) { - testutil.RunDefer() - - RegisterFailHandler(Fail) - RunSpecs(t, "Iterator Suite") + testutil.RunSuite(t, "Iterator Suite") } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go index c8314c4e5..1a7e29df8 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go @@ -7,16 +7,11 @@ package iterator import ( - "errors" - "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/util" ) -var ( - ErrIterReleased = errors.New("leveldb/iterator: iterator released") -) - type dir int const ( @@ -48,13 +43,11 @@ func assertKey(key []byte) []byte { } func (i *mergedIterator) iterErr(iter Iterator) bool { - if i.errf != nil { - if err := iter.Error(); err != nil { + if err := iter.Error(); err != nil { + if i.errf != nil { i.errf(err) } - } - if i.strict { - if err := iter.Error(); err != nil { + if i.strict || !errors.IsCorrupted(err) { i.err = err return true } @@ -274,9 +267,13 @@ func (i *mergedIterator) Release() { } func (i *mergedIterator) SetReleaser(releaser util.Releaser) { - if i.dir != dirReleased { - i.releaser = releaser + if i.dir == dirReleased { + panic(util.ErrReleased) + } + if i.releaser != nil && releaser != nil { + panic(util.ErrHasReleaser) } + i.releaser = releaser } func (i *mergedIterator) Error() error { @@ -294,9 +291,9 @@ func (i *mergedIterator) SetErrorCallback(f func(err error)) { // keys: if iters[i] contains a key k then iters[j] will not contain that key k. // None of the iters may be nil. // -// If strict is true then error yield by any iterators will halt the merged -// iterator, on contrary if strict is false then the merged iterator will -// ignore those error and move on to the next iterator. +// If strict is true the any 'corruption errors' (i.e errors.IsCorrupted(err) == true) +// won't be ignored and will halt 'merged iterator', otherwise the iterator will +// continue to the next 'input iterator'. func NewMergedIterator(iters []Iterator, cmp comparer.Comparer, strict bool) Iterator { return &mergedIterator{ iters: iters, diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal.go index b522c76e6..6519ec660 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal.go @@ -79,10 +79,10 @@ package journal import ( "encoding/binary" - "errors" "fmt" "io" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/util" ) @@ -103,18 +103,18 @@ type flusher interface { Flush() error } -// DroppedError is the error type that passed to Dropper.Drop method. -type DroppedError struct { +// ErrCorrupted is the error type that generated by corrupted block or chunk. +type ErrCorrupted struct { Size int Reason string } -func (e DroppedError) Error() string { - return fmt.Sprintf("leveldb/journal: dropped %d bytes: %s", e.Size, e.Reason) +func (e *ErrCorrupted) Error() string { + return fmt.Sprintf("leveldb/journal: block/chunk corrupted: %s (%d bytes)", e.Reason, e.Size) } // Dropper is the interface that wrap simple Drop method. The Drop -// method will be called when the journal reader dropping a chunk. +// method will be called when the journal reader dropping a block or chunk. type Dropper interface { Drop(err error) } @@ -158,76 +158,78 @@ func NewReader(r io.Reader, dropper Dropper, strict, checksum bool) *Reader { } } +var errSkip = errors.New("leveldb/journal: skipped") + +func (r *Reader) corrupt(n int, reason string, skip bool) error { + if r.dropper != nil { + r.dropper.Drop(&ErrCorrupted{n, reason}) + } + if r.strict && !skip { + r.err = errors.NewErrCorrupted(nil, &ErrCorrupted{n, reason}) + return r.err + } + return errSkip +} + // nextChunk sets r.buf[r.i:r.j] to hold the next chunk's payload, reading the // next block into the buffer if necessary. -func (r *Reader) nextChunk(wantFirst, skip bool) error { +func (r *Reader) nextChunk(first bool) error { for { if r.j+headerSize <= r.n { checksum := binary.LittleEndian.Uint32(r.buf[r.j+0 : r.j+4]) length := binary.LittleEndian.Uint16(r.buf[r.j+4 : r.j+6]) chunkType := r.buf[r.j+6] - var err error if checksum == 0 && length == 0 && chunkType == 0 { // Drop entire block. - err = DroppedError{r.n - r.j, "zero header"} + m := r.n - r.j r.i = r.n r.j = r.n + return r.corrupt(m, "zero header", false) } else { m := r.n - r.j r.i = r.j + headerSize r.j = r.j + headerSize + int(length) if r.j > r.n { // Drop entire block. - err = DroppedError{m, "chunk length overflows block"} r.i = r.n r.j = r.n + return r.corrupt(m, "chunk length overflows block", false) } else if r.checksum && checksum != util.NewCRC(r.buf[r.i-1:r.j]).Value() { // Drop entire block. - err = DroppedError{m, "checksum mismatch"} r.i = r.n r.j = r.n + return r.corrupt(m, "checksum mismatch", false) } } - if wantFirst && err == nil && chunkType != fullChunkType && chunkType != firstChunkType { - if skip { - // The chunk are intentionally skipped. - if chunkType == lastChunkType { - skip = false - } - continue - } else { - // Drop the chunk. - err = DroppedError{r.j - r.i + headerSize, "orphan chunk"} - } - } - if err == nil { - r.last = chunkType == fullChunkType || chunkType == lastChunkType - } else { - if r.dropper != nil { - r.dropper.Drop(err) - } - if r.strict { - r.err = err - } + if first && chunkType != fullChunkType && chunkType != firstChunkType { + m := r.j - r.i + r.i = r.j + // Report the error, but skip it. + return r.corrupt(m+headerSize, "orphan chunk", true) } - return err + r.last = chunkType == fullChunkType || chunkType == lastChunkType + return nil } + + // The last block. if r.n < blockSize && r.n > 0 { - // This is the last block. - if r.j != r.n { - r.err = io.ErrUnexpectedEOF - } else { - r.err = io.EOF + if !first { + return r.corrupt(0, "missing chunk part", false) } + r.err = io.EOF return r.err } + + // Read block. n, err := io.ReadFull(r.r, r.buf[:]) - if err != nil && err != io.ErrUnexpectedEOF { - r.err = err - return r.err + if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { + return err } if n == 0 { + if !first { + return r.corrupt(0, "missing chunk part", false) + } r.err = io.EOF return r.err } @@ -237,29 +239,26 @@ func (r *Reader) nextChunk(wantFirst, skip bool) error { // Next returns a reader for the next journal. It returns io.EOF if there are no // more journals. The reader returned becomes stale after the next Next call, -// and should no longer be used. +// and should no longer be used. If strict is false, the reader will returns +// io.ErrUnexpectedEOF error when found corrupted journal. func (r *Reader) Next() (io.Reader, error) { r.seq++ if r.err != nil { return nil, r.err } - skip := !r.last + r.i = r.j for { - r.i = r.j - if r.nextChunk(true, skip) != nil { - // So that 'orphan chunk' drop will be reported. - skip = false - } else { + if err := r.nextChunk(true); err == nil { break - } - if r.err != nil { - return nil, r.err + } else if err != errSkip { + return nil, err } } return &singleReader{r, r.seq, nil}, nil } -// Reset resets the journal reader, allows reuse of the journal reader. +// Reset resets the journal reader, allows reuse of the journal reader. Reset returns +// last accumulated error. func (r *Reader) Reset(reader io.Reader, dropper Dropper, strict, checksum bool) error { r.seq++ err := r.err @@ -296,7 +295,11 @@ func (x *singleReader) Read(p []byte) (int, error) { if r.last { return 0, io.EOF } - if x.err = r.nextChunk(false, false); x.err != nil { + x.err = r.nextChunk(false) + if x.err != nil { + if x.err == errSkip { + x.err = io.ErrUnexpectedEOF + } return 0, x.err } } @@ -320,7 +323,11 @@ func (x *singleReader) ReadByte() (byte, error) { if r.last { return 0, io.EOF } - if x.err = r.nextChunk(false, false); x.err != nil { + x.err = r.nextChunk(false) + if x.err != nil { + if x.err == errSkip { + x.err = io.ErrUnexpectedEOF + } return 0, x.err } } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal_test.go index 5e1193ae2..0fcf22599 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal_test.go @@ -12,6 +12,7 @@ package journal import ( "bytes" + "encoding/binary" "fmt" "io" "io/ioutil" @@ -326,3 +327,492 @@ func TestStaleWriter(t *testing.T) { t.Fatalf("stale write #1: unexpected error: %v", err) } } + +func TestCorrupt_MissingLastBlock(t *testing.T) { + buf := new(bytes.Buffer) + + w := NewWriter(buf) + + // First record. + ww, err := w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize-1024)); err != nil { + t.Fatalf("write #0: unexpected error: %v", err) + } + + // Second record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize-headerSize)); err != nil { + t.Fatalf("write #1: unexpected error: %v", err) + } + + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Cut the last block. + b := buf.Bytes()[:blockSize] + r := NewReader(bytes.NewReader(b), dropper{t}, false, true) + + // First read. + rr, err := r.Next() + if err != nil { + t.Fatal(err) + } + n, err := io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #0: %v", err) + } + if n != blockSize-1024 { + t.Fatalf("read #0: got %d bytes want %d", n, blockSize-1024) + } + + // Second read. + rr, err = r.Next() + if err != nil { + t.Fatal(err) + } + n, err = io.Copy(ioutil.Discard, rr) + if err != io.ErrUnexpectedEOF { + t.Fatalf("read #1: unexpected error: %v", err) + } + + if _, err := r.Next(); err != io.EOF { + t.Fatalf("last next: unexpected error: %v", err) + } +} + +func TestCorrupt_CorruptedFirstBlock(t *testing.T) { + buf := new(bytes.Buffer) + + w := NewWriter(buf) + + // First record. + ww, err := w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize/2)); err != nil { + t.Fatalf("write #0: unexpected error: %v", err) + } + + // Second record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize-headerSize)); err != nil { + t.Fatalf("write #1: unexpected error: %v", err) + } + + // Third record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), (blockSize-headerSize)+1)); err != nil { + t.Fatalf("write #2: unexpected error: %v", err) + } + + // Fourth record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), (blockSize-headerSize)+2)); err != nil { + t.Fatalf("write #3: unexpected error: %v", err) + } + + if err := w.Close(); err != nil { + t.Fatal(err) + } + + b := buf.Bytes() + // Corrupting block #0. + for i := 0; i < 1024; i++ { + b[i] = '1' + } + + r := NewReader(bytes.NewReader(b), dropper{t}, false, true) + + // First read (third record). + rr, err := r.Next() + if err != nil { + t.Fatal(err) + } + n, err := io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #0: %v", err) + } + if want := int64(blockSize-headerSize) + 1; n != want { + t.Fatalf("read #0: got %d bytes want %d", n, want) + } + + // Second read (fourth record). + rr, err = r.Next() + if err != nil { + t.Fatal(err) + } + n, err = io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #1: %v", err) + } + if want := int64(blockSize-headerSize) + 2; n != want { + t.Fatalf("read #1: got %d bytes want %d", n, want) + } + + if _, err := r.Next(); err != io.EOF { + t.Fatalf("last next: unexpected error: %v", err) + } +} + +func TestCorrupt_CorruptedMiddleBlock(t *testing.T) { + buf := new(bytes.Buffer) + + w := NewWriter(buf) + + // First record. + ww, err := w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize/2)); err != nil { + t.Fatalf("write #0: unexpected error: %v", err) + } + + // Second record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize-headerSize)); err != nil { + t.Fatalf("write #1: unexpected error: %v", err) + } + + // Third record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), (blockSize-headerSize)+1)); err != nil { + t.Fatalf("write #2: unexpected error: %v", err) + } + + // Fourth record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), (blockSize-headerSize)+2)); err != nil { + t.Fatalf("write #3: unexpected error: %v", err) + } + + if err := w.Close(); err != nil { + t.Fatal(err) + } + + b := buf.Bytes() + // Corrupting block #1. + for i := 0; i < 1024; i++ { + b[blockSize+i] = '1' + } + + r := NewReader(bytes.NewReader(b), dropper{t}, false, true) + + // First read (first record). + rr, err := r.Next() + if err != nil { + t.Fatal(err) + } + n, err := io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #0: %v", err) + } + if want := int64(blockSize / 2); n != want { + t.Fatalf("read #0: got %d bytes want %d", n, want) + } + + // Second read (second record). + rr, err = r.Next() + if err != nil { + t.Fatal(err) + } + n, err = io.Copy(ioutil.Discard, rr) + if err != io.ErrUnexpectedEOF { + t.Fatalf("read #1: unexpected error: %v", err) + } + + // Third read (fourth record). + rr, err = r.Next() + if err != nil { + t.Fatal(err) + } + n, err = io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #2: %v", err) + } + if want := int64(blockSize-headerSize) + 2; n != want { + t.Fatalf("read #2: got %d bytes want %d", n, want) + } + + if _, err := r.Next(); err != io.EOF { + t.Fatalf("last next: unexpected error: %v", err) + } +} + +func TestCorrupt_CorruptedLastBlock(t *testing.T) { + buf := new(bytes.Buffer) + + w := NewWriter(buf) + + // First record. + ww, err := w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize/2)); err != nil { + t.Fatalf("write #0: unexpected error: %v", err) + } + + // Second record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize-headerSize)); err != nil { + t.Fatalf("write #1: unexpected error: %v", err) + } + + // Third record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), (blockSize-headerSize)+1)); err != nil { + t.Fatalf("write #2: unexpected error: %v", err) + } + + // Fourth record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), (blockSize-headerSize)+2)); err != nil { + t.Fatalf("write #3: unexpected error: %v", err) + } + + if err := w.Close(); err != nil { + t.Fatal(err) + } + + b := buf.Bytes() + // Corrupting block #3. + for i := len(b) - 1; i > len(b)-1024; i-- { + b[i] = '1' + } + + r := NewReader(bytes.NewReader(b), dropper{t}, false, true) + + // First read (first record). + rr, err := r.Next() + if err != nil { + t.Fatal(err) + } + n, err := io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #0: %v", err) + } + if want := int64(blockSize / 2); n != want { + t.Fatalf("read #0: got %d bytes want %d", n, want) + } + + // Second read (second record). + rr, err = r.Next() + if err != nil { + t.Fatal(err) + } + n, err = io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #1: %v", err) + } + if want := int64(blockSize - headerSize); n != want { + t.Fatalf("read #1: got %d bytes want %d", n, want) + } + + // Third read (third record). + rr, err = r.Next() + if err != nil { + t.Fatal(err) + } + n, err = io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #2: %v", err) + } + if want := int64(blockSize-headerSize) + 1; n != want { + t.Fatalf("read #2: got %d bytes want %d", n, want) + } + + // Fourth read (fourth record). + rr, err = r.Next() + if err != nil { + t.Fatal(err) + } + n, err = io.Copy(ioutil.Discard, rr) + if err != io.ErrUnexpectedEOF { + t.Fatalf("read #3: unexpected error: %v", err) + } + + if _, err := r.Next(); err != io.EOF { + t.Fatalf("last next: unexpected error: %v", err) + } +} + +func TestCorrupt_FirstChuckLengthOverflow(t *testing.T) { + buf := new(bytes.Buffer) + + w := NewWriter(buf) + + // First record. + ww, err := w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize/2)); err != nil { + t.Fatalf("write #0: unexpected error: %v", err) + } + + // Second record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize-headerSize)); err != nil { + t.Fatalf("write #1: unexpected error: %v", err) + } + + // Third record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), (blockSize-headerSize)+1)); err != nil { + t.Fatalf("write #2: unexpected error: %v", err) + } + + if err := w.Close(); err != nil { + t.Fatal(err) + } + + b := buf.Bytes() + // Corrupting record #1. + x := blockSize + binary.LittleEndian.PutUint16(b[x+4:], 0xffff) + + r := NewReader(bytes.NewReader(b), dropper{t}, false, true) + + // First read (first record). + rr, err := r.Next() + if err != nil { + t.Fatal(err) + } + n, err := io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #0: %v", err) + } + if want := int64(blockSize / 2); n != want { + t.Fatalf("read #0: got %d bytes want %d", n, want) + } + + // Second read (second record). + rr, err = r.Next() + if err != nil { + t.Fatal(err) + } + n, err = io.Copy(ioutil.Discard, rr) + if err != io.ErrUnexpectedEOF { + t.Fatalf("read #1: unexpected error: %v", err) + } + + if _, err := r.Next(); err != io.EOF { + t.Fatalf("last next: unexpected error: %v", err) + } +} + +func TestCorrupt_MiddleChuckLengthOverflow(t *testing.T) { + buf := new(bytes.Buffer) + + w := NewWriter(buf) + + // First record. + ww, err := w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize/2)); err != nil { + t.Fatalf("write #0: unexpected error: %v", err) + } + + // Second record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), blockSize-headerSize)); err != nil { + t.Fatalf("write #1: unexpected error: %v", err) + } + + // Third record. + ww, err = w.Next() + if err != nil { + t.Fatal(err) + } + if _, err := ww.Write(bytes.Repeat([]byte("0"), (blockSize-headerSize)+1)); err != nil { + t.Fatalf("write #2: unexpected error: %v", err) + } + + if err := w.Close(); err != nil { + t.Fatal(err) + } + + b := buf.Bytes() + // Corrupting record #1. + x := blockSize/2 + headerSize + binary.LittleEndian.PutUint16(b[x+4:], 0xffff) + + r := NewReader(bytes.NewReader(b), dropper{t}, false, true) + + // First read (first record). + rr, err := r.Next() + if err != nil { + t.Fatal(err) + } + n, err := io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #0: %v", err) + } + if want := int64(blockSize / 2); n != want { + t.Fatalf("read #0: got %d bytes want %d", n, want) + } + + // Second read (third record). + rr, err = r.Next() + if err != nil { + t.Fatal(err) + } + n, err = io.Copy(ioutil.Discard, rr) + if err != nil { + t.Fatalf("read #1: %v", err) + } + if want := int64(blockSize-headerSize) + 1; n != want { + t.Fatalf("read #1: got %d bytes want %d", n, want) + } + + if _, err := r.Next(); err != io.EOF { + t.Fatalf("last next: unexpected error: %v", err) + } +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key.go index b9acf932d..572ae8150 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key.go @@ -9,15 +9,30 @@ package leveldb import ( "encoding/binary" "fmt" + + "github.com/syndtr/goleveldb/leveldb/errors" ) -type vType int +type ErrIkeyCorrupted struct { + Ikey []byte + Reason string +} + +func (e *ErrIkeyCorrupted) Error() string { + return fmt.Sprintf("leveldb: iKey %q corrupted: %s", e.Ikey, e.Reason) +} + +func newErrIkeyCorrupted(ikey []byte, reason string) error { + return errors.NewErrCorrupted(nil, &ErrIkeyCorrupted{append([]byte{}, ikey...), reason}) +} + +type kType int -func (t vType) String() string { - switch t { - case tDel: +func (kt kType) String() string { + switch kt { + case ktDel: return "d" - case tVal: + case ktVal: return "v" } return "x" @@ -26,16 +41,16 @@ func (t vType) String() string { // Value types encoded as the last component of internal keys. // Don't modify; this value are saved to disk. const ( - tDel vType = iota - tVal + ktDel kType = iota + ktVal ) -// tSeek defines the vType that should be passed when constructing an +// ktSeek defines the kType that should be passed when constructing an // internal key for seeking to a particular sequence number (since we // sort sequence numbers in decreasing order and the value type is // embedded as the low 8 bits in the sequence number in internal keys, // we need to use the highest-numbered ValueType, not the lowest). -const tSeek = tVal +const ktSeek = ktVal const ( // Maximum value possible for sequence number; the 8-bits are @@ -43,7 +58,7 @@ const ( // 64-bit integer. kMaxSeq uint64 = (uint64(1) << 56) - 1 // Maximum value possible for packed sequence number and type. - kMaxNum uint64 = (kMaxSeq << 8) | uint64(tSeek) + kMaxNum uint64 = (kMaxSeq << 8) | uint64(ktSeek) ) // Maximum number encoded in bytes. @@ -55,85 +70,73 @@ func init() { type iKey []byte -func newIKey(ukey []byte, seq uint64, t vType) iKey { - if seq > kMaxSeq || t > tVal { - panic("invalid seq number or value type") +func newIkey(ukey []byte, seq uint64, kt kType) iKey { + if seq > kMaxSeq { + panic("leveldb: invalid sequence number") + } else if kt > ktVal { + panic("leveldb: invalid type") } - b := make(iKey, len(ukey)+8) - copy(b, ukey) - binary.LittleEndian.PutUint64(b[len(ukey):], (seq<<8)|uint64(t)) - return b + ik := make(iKey, len(ukey)+8) + copy(ik, ukey) + binary.LittleEndian.PutUint64(ik[len(ukey):], (seq<<8)|uint64(kt)) + return ik } -func parseIkey(p []byte) (ukey []byte, seq uint64, t vType, ok bool) { - if len(p) < 8 { - return +func parseIkey(ik []byte) (ukey []byte, seq uint64, kt kType, err error) { + if len(ik) < 8 { + return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid length") } - num := binary.LittleEndian.Uint64(p[len(p)-8:]) - seq, t = uint64(num>>8), vType(num&0xff) - if t > tVal { - return + num := binary.LittleEndian.Uint64(ik[len(ik)-8:]) + seq, kt = uint64(num>>8), kType(num&0xff) + if kt > ktVal { + return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid type") } - ukey = p[:len(p)-8] - ok = true + ukey = ik[:len(ik)-8] return } -func validIkey(p []byte) bool { - _, _, _, ok := parseIkey(p) - return ok +func validIkey(ik []byte) bool { + _, _, _, err := parseIkey(ik) + return err == nil } -func (p iKey) assert() { - if p == nil { - panic("nil iKey") +func (ik iKey) assert() { + if ik == nil { + panic("leveldb: nil iKey") } - if len(p) < 8 { - panic(fmt.Sprintf("invalid iKey %q, len=%d", []byte(p), len(p))) + if len(ik) < 8 { + panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid length", []byte(ik), len(ik))) } } -func (p iKey) ok() bool { - if len(p) < 8 { - return false - } - _, _, ok := p.parseNum() - return ok -} - -func (p iKey) ukey() []byte { - p.assert() - return p[:len(p)-8] +func (ik iKey) ukey() []byte { + ik.assert() + return ik[:len(ik)-8] } -func (p iKey) num() uint64 { - p.assert() - return binary.LittleEndian.Uint64(p[len(p)-8:]) +func (ik iKey) num() uint64 { + ik.assert() + return binary.LittleEndian.Uint64(ik[len(ik)-8:]) } -func (p iKey) parseNum() (seq uint64, t vType, ok bool) { - if p == nil { - panic("nil iKey") +func (ik iKey) parseNum() (seq uint64, kt kType) { + num := ik.num() + seq, kt = uint64(num>>8), kType(num&0xff) + if kt > ktVal { + panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid type %#x", []byte(ik), len(ik), kt)) } - if len(p) < 8 { - return - } - num := p.num() - seq, t = uint64(num>>8), vType(num&0xff) - if t > tVal { - return 0, 0, false - } - ok = true return } -func (p iKey) String() string { - if len(p) == 0 { +func (ik iKey) String() string { + if ik == nil { return "" } - if seq, t, ok := p.parseNum(); ok { - return fmt.Sprintf("%s,%s%d", shorten(string(p.ukey())), t, seq) + + if ukey, seq, kt, err := parseIkey(ik); err == nil { + return fmt.Sprintf("%s,%s%d", shorten(string(ukey)), kt, seq) + } else { + return "" } - return "" } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key_test.go index e307cfc1d..30eadf784 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key_test.go @@ -15,8 +15,8 @@ import ( var defaultIComparer = &iComparer{comparer.DefaultComparer} -func ikey(key string, seq uint64, t vType) iKey { - return newIKey([]byte(key), uint64(seq), t) +func ikey(key string, seq uint64, kt kType) iKey { + return newIkey([]byte(key), uint64(seq), kt) } func shortSep(a, b []byte) []byte { @@ -37,27 +37,37 @@ func shortSuccessor(b []byte) []byte { return dst } -func testSingleKey(t *testing.T, key string, seq uint64, vt vType) { - ik := ikey(key, seq, vt) +func testSingleKey(t *testing.T, key string, seq uint64, kt kType) { + ik := ikey(key, seq, kt) if !bytes.Equal(ik.ukey(), []byte(key)) { t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key) } - if rseq, rt, ok := ik.parseNum(); ok { + rseq, rt := ik.parseNum() + if rseq != seq { + t.Errorf("seq number does not equal, got %v, want %v", rseq, seq) + } + if rt != kt { + t.Errorf("type does not equal, got %v, want %v", rt, kt) + } + + if rukey, rseq, rt, kerr := parseIkey(ik); kerr == nil { + if !bytes.Equal(rukey, []byte(key)) { + t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key) + } if rseq != seq { t.Errorf("seq number does not equal, got %v, want %v", rseq, seq) } - - if rt != vt { - t.Errorf("type does not equal, got %v, want %v", rt, vt) + if rt != kt { + t.Errorf("type does not equal, got %v, want %v", rt, kt) } } else { - t.Error("cannot parse seq and type") + t.Errorf("key error: %v", kerr) } } -func TestIKey_EncodeDecode(t *testing.T) { +func TestIkey_EncodeDecode(t *testing.T) { keys := []string{"", "k", "hello", "longggggggggggggggggggggg"} seqs := []uint64{ 1, 2, 3, @@ -67,8 +77,8 @@ func TestIKey_EncodeDecode(t *testing.T) { } for _, key := range keys { for _, seq := range seqs { - testSingleKey(t, key, seq, tVal) - testSingleKey(t, "hello", 1, tDel) + testSingleKey(t, key, seq, ktVal) + testSingleKey(t, "hello", 1, ktDel) } } } @@ -79,45 +89,45 @@ func assertBytes(t *testing.T, want, got []byte) { } } -func TestIKeyShortSeparator(t *testing.T) { +func TestIkeyShortSeparator(t *testing.T) { // When user keys are same - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("foo", 99, tVal))) - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("foo", 101, tVal))) - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("foo", 100, tVal))) - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("foo", 100, tDel))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("foo", 99, ktVal))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("foo", 101, ktVal))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("foo", 100, ktVal))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("foo", 100, ktDel))) // When user keys are misordered - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("bar", 99, tVal))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("bar", 99, ktVal))) // When user keys are different, but correctly ordered - assertBytes(t, ikey("g", uint64(kMaxSeq), tSeek), - shortSep(ikey("foo", 100, tVal), - ikey("hello", 200, tVal))) + assertBytes(t, ikey("g", uint64(kMaxSeq), ktSeek), + shortSep(ikey("foo", 100, ktVal), + ikey("hello", 200, ktVal))) // When start user key is prefix of limit user key - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("foobar", 200, tVal))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("foobar", 200, ktVal))) // When limit user key is prefix of start user key - assertBytes(t, ikey("foobar", 100, tVal), - shortSep(ikey("foobar", 100, tVal), - ikey("foo", 200, tVal))) + assertBytes(t, ikey("foobar", 100, ktVal), + shortSep(ikey("foobar", 100, ktVal), + ikey("foo", 200, ktVal))) } -func TestIKeyShortestSuccessor(t *testing.T) { - assertBytes(t, ikey("g", uint64(kMaxSeq), tSeek), - shortSuccessor(ikey("foo", 100, tVal))) - assertBytes(t, ikey("\xff\xff", 100, tVal), - shortSuccessor(ikey("\xff\xff", 100, tVal))) +func TestIkeyShortestSuccessor(t *testing.T) { + assertBytes(t, ikey("g", uint64(kMaxSeq), ktSeek), + shortSuccessor(ikey("foo", 100, ktVal))) + assertBytes(t, ikey("\xff\xff", 100, ktVal), + shortSuccessor(ikey("\xff\xff", 100, ktVal))) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/leveldb_suite_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/leveldb_suite_test.go index 245b1fd4d..fefa007a7 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/leveldb_suite_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/leveldb_suite_test.go @@ -3,18 +3,9 @@ package leveldb import ( "testing" - . "github.com/onsi/ginkgo" - . "github.com/onsi/gomega" - "github.com/syndtr/goleveldb/leveldb/testutil" ) -func TestLeveldb(t *testing.T) { - testutil.RunDefer() - - RegisterFailHandler(Fail) - RunSpecs(t, "Leveldb Suite") - - RegisterTestingT(t) - testutil.RunDefer("teardown") +func TestLevelDB(t *testing.T) { + testutil.RunSuite(t, "LevelDB Suite") } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go index 7bcae992a..e5398873b 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go @@ -12,12 +12,14 @@ import ( "sync" "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/util" ) var ( - ErrNotFound = util.ErrNotFound + ErrNotFound = errors.ErrNotFound + ErrIterReleased = errors.New("leveldb/memdb: iterator released") ) const tMaxHeight = 12 @@ -29,6 +31,7 @@ type dbIter struct { node int forward bool key, value []byte + err error } func (i *dbIter) fill(checkStart, checkLimit bool) bool { @@ -59,6 +62,11 @@ func (i *dbIter) Valid() bool { } func (i *dbIter) First() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + i.forward = true i.p.mu.RLock() defer i.p.mu.RUnlock() @@ -71,9 +79,11 @@ func (i *dbIter) First() bool { } func (i *dbIter) Last() bool { - if i.p == nil { + if i.Released() { + i.err = ErrIterReleased return false } + i.forward = false i.p.mu.RLock() defer i.p.mu.RUnlock() @@ -86,9 +96,11 @@ func (i *dbIter) Last() bool { } func (i *dbIter) Seek(key []byte) bool { - if i.p == nil { + if i.Released() { + i.err = ErrIterReleased return false } + i.forward = true i.p.mu.RLock() defer i.p.mu.RUnlock() @@ -100,9 +112,11 @@ func (i *dbIter) Seek(key []byte) bool { } func (i *dbIter) Next() bool { - if i.p == nil { + if i.Released() { + i.err = ErrIterReleased return false } + if i.node == 0 { if !i.forward { return i.First() @@ -117,9 +131,11 @@ func (i *dbIter) Next() bool { } func (i *dbIter) Prev() bool { - if i.p == nil { + if i.Released() { + i.err = ErrIterReleased return false } + if i.node == 0 { if i.forward { return i.Last() @@ -141,10 +157,10 @@ func (i *dbIter) Value() []byte { return i.value } -func (i *dbIter) Error() error { return nil } +func (i *dbIter) Error() error { return i.err } func (i *dbIter) Release() { - if i.p != nil { + if !i.Released() { i.p = nil i.node = 0 i.key = nil @@ -437,6 +453,8 @@ func (p *DB) Reset() { // New creates a new initalized in-memory key/value DB. The capacity // is the initial key/value buffer capacity. The capacity is advisory, // not enforced. +// +// The returned DB instance is goroutine-safe. func New(cmp comparer.BasicComparer, capacity int) *DB { p := &DB{ cmp: cmp, diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb_suite_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb_suite_test.go index 788539a87..18c304b7f 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb_suite_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb_suite_test.go @@ -3,15 +3,9 @@ package memdb import ( "testing" - . "github.com/onsi/ginkgo" - . "github.com/onsi/gomega" - "github.com/syndtr/goleveldb/leveldb/testutil" ) -func TestMemdb(t *testing.T) { - testutil.RunDefer() - - RegisterFailHandler(Fail) - RunSpecs(t, "Memdb Suite") +func TestMemDB(t *testing.T) { + testutil.RunSuite(t, "MemDB Suite") } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb_test.go index f96a9d1ea..5dd6dbc7b 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb_test.go @@ -129,7 +129,7 @@ var _ = testutil.Defer(func() { } return db - }) + }, nil, nil) }) }) }) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/opt/options.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/opt/options.go index b940ce427..61f0eadf9 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/opt/options.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/opt/options.go @@ -11,6 +11,7 @@ import ( "github.com/syndtr/goleveldb/leveldb/cache" "github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/filter" + "math" ) const ( @@ -19,25 +20,57 @@ const ( GiB = MiB * 1024 ) -const ( - DefaultBlockCacheSize = 8 * MiB - DefaultBlockRestartInterval = 16 - DefaultBlockSize = 4 * KiB - DefaultCompressionType = SnappyCompression - DefaultMaxOpenFiles = 1000 - DefaultWriteBuffer = 4 * MiB +var ( + DefaultBlockCacher = LRUCacher + DefaultBlockCacheCapacity = 8 * MiB + DefaultBlockRestartInterval = 16 + DefaultBlockSize = 4 * KiB + DefaultCompactionExpandLimitFactor = 25 + DefaultCompactionGPOverlapsFactor = 10 + DefaultCompactionL0Trigger = 4 + DefaultCompactionSourceLimitFactor = 1 + DefaultCompactionTableSize = 2 * MiB + DefaultCompactionTableSizeMultiplier = 1.0 + DefaultCompactionTotalSize = 10 * MiB + DefaultCompactionTotalSizeMultiplier = 10.0 + DefaultCompressionType = SnappyCompression + DefaultIteratorSamplingRate = 1 * MiB + DefaultMaxMemCompationLevel = 2 + DefaultNumLevel = 7 + DefaultOpenFilesCacher = LRUCacher + DefaultOpenFilesCacheCapacity = 500 + DefaultWriteBuffer = 4 * MiB + DefaultWriteL0PauseTrigger = 12 + DefaultWriteL0SlowdownTrigger = 8 ) -type noCache struct{} +// Cacher is a caching algorithm. +type Cacher interface { + New(capacity int) cache.Cacher +} + +type CacherFunc struct { + NewFunc func(capacity int) cache.Cacher +} + +func (f *CacherFunc) New(capacity int) cache.Cacher { + if f.NewFunc != nil { + return f.NewFunc(capacity) + } + return nil +} -func (noCache) SetCapacity(capacity int) {} -func (noCache) GetNamespace(id uint64) cache.Namespace { return nil } -func (noCache) Purge(fin cache.PurgeFin) {} -func (noCache) Zap(closed bool) {} +func noCacher(int) cache.Cacher { return nil } -var NoCache cache.Cache = noCache{} +var ( + // LRUCacher is the LRU-cache algorithm. + LRUCacher = &CacherFunc{cache.NewLRU} -// Compression is the per-block compression algorithm to use. + // NoCacher is the value to disable caching algorithm. + NoCacher = &CacherFunc{} +) + +// Compression is the 'sorted table' block compression algorithm to use. type Compression uint func (c Compression) String() string { @@ -59,34 +92,47 @@ const ( nCompression ) -// Strict is the DB strict level. +// Strict is the DB 'strict level'. type Strict uint const ( // If present then a corrupted or invalid chunk or block in manifest - // journal will cause an error istead of being dropped. + // journal will cause an error instead of being dropped. + // This will prevent database with corrupted manifest to be opened. StrictManifest Strict = 1 << iota - // If present then a corrupted or invalid chunk or block in journal - // will cause an error istead of being dropped. - StrictJournal - // If present then journal chunk checksum will be verified. StrictJournalChecksum - // If present then an invalid key/value pair will cause an error - // instead of being skipped. - StrictIterator + // If present then a corrupted or invalid chunk or block in journal + // will cause an error instead of being dropped. + // This will prevent database with corrupted journal to be opened. + StrictJournal // If present then 'sorted table' block checksum will be verified. + // This has effect on both 'read operation' and compaction. StrictBlockChecksum + // If present then a corrupted 'sorted table' will fails compaction. + // The database will enter read-only mode. + StrictCompaction + + // If present then a corrupted 'sorted table' will halts 'read operation'. + StrictReader + + // If present then leveldb.Recover will drop corrupted 'sorted table'. + StrictRecovery + + // This only applicable for ReadOptions, if present then this ReadOptions + // 'strict level' will override global ones. + StrictOverride + // StrictAll enables all strict flags. - StrictAll = StrictManifest | StrictJournal | StrictJournalChecksum | StrictIterator | StrictBlockChecksum + StrictAll = StrictManifest | StrictJournalChecksum | StrictJournal | StrictBlockChecksum | StrictCompaction | StrictReader | StrictRecovery // DefaultStrict is the default strict flags. Specify any strict flags // will override default strict flags as whole (i.e. not OR'ed). - DefaultStrict = StrictJournalChecksum | StrictBlockChecksum + DefaultStrict = StrictJournalChecksum | StrictBlockChecksum | StrictCompaction | StrictReader // NoStrict disables all strict flags. Override default strict flags. NoStrict = ^StrictAll @@ -101,11 +147,17 @@ type Options struct { // The default value is nil AltFilters []filter.Filter - // BlockCache provides per-block caching for LevelDB. Specify NoCache to - // disable block caching. + // BlockCacher provides cache algorithm for LevelDB 'sorted table' block caching. + // Specify NoCacher to disable caching algorithm. // - // By default LevelDB will create LRU-cache with capacity of 8MiB. - BlockCache cache.Cache + // The default value is LRUCacher. + BlockCacher Cacher + + // BlockCacheCapacity defines the capacity of the 'sorted table' block caching. + // Use -1 for zero, this has same effect as specifying NoCacher to BlockCacher. + // + // The default value is 8MiB. + BlockCacheCapacity int // BlockRestartInterval is the number of keys between restart points for // delta encoding of keys. @@ -119,6 +171,73 @@ type Options struct { // The default value is 4KiB. BlockSize int + // CompactionExpandLimitFactor limits compaction size after expanded. + // This will be multiplied by table size limit at compaction target level. + // + // The default value is 25. + CompactionExpandLimitFactor int + + // CompactionGPOverlapsFactor limits overlaps in grandparent (Level + 2) that a + // single 'sorted table' generates. + // This will be multiplied by table size limit at grandparent level. + // + // The default value is 10. + CompactionGPOverlapsFactor int + + // CompactionL0Trigger defines number of 'sorted table' at level-0 that will + // trigger compaction. + // + // The default value is 4. + CompactionL0Trigger int + + // CompactionSourceLimitFactor limits compaction source size. This doesn't apply to + // level-0. + // This will be multiplied by table size limit at compaction target level. + // + // The default value is 1. + CompactionSourceLimitFactor int + + // CompactionTableSize limits size of 'sorted table' that compaction generates. + // The limits for each level will be calculated as: + // CompactionTableSize * (CompactionTableSizeMultiplier ^ Level) + // The multiplier for each level can also fine-tuned using CompactionTableSizeMultiplierPerLevel. + // + // The default value is 2MiB. + CompactionTableSize int + + // CompactionTableSizeMultiplier defines multiplier for CompactionTableSize. + // + // The default value is 1. + CompactionTableSizeMultiplier float64 + + // CompactionTableSizeMultiplierPerLevel defines per-level multiplier for + // CompactionTableSize. + // Use zero to skip a level. + // + // The default value is nil. + CompactionTableSizeMultiplierPerLevel []float64 + + // CompactionTotalSize limits total size of 'sorted table' for each level. + // The limits for each level will be calculated as: + // CompactionTotalSize * (CompactionTotalSizeMultiplier ^ Level) + // The multiplier for each level can also fine-tuned using + // CompactionTotalSizeMultiplierPerLevel. + // + // The default value is 10MiB. + CompactionTotalSize int + + // CompactionTotalSizeMultiplier defines multiplier for CompactionTotalSize. + // + // The default value is 10. + CompactionTotalSizeMultiplier float64 + + // CompactionTotalSizeMultiplierPerLevel defines per-level multiplier for + // CompactionTotalSize. + // Use zero to skip a level. + // + // The default value is nil. + CompactionTotalSizeMultiplierPerLevel []float64 + // Comparer defines a total ordering over the space of []byte keys: a 'less // than' relationship. The same comparison algorithm must be used for reads // and writes over the lifetime of the DB. @@ -126,11 +245,22 @@ type Options struct { // The default value uses the same ordering as bytes.Compare. Comparer comparer.Comparer - // Compression defines the per-block compression to use. + // Compression defines the 'sorted table' block compression to use. // // The default value (DefaultCompression) uses snappy compression. Compression Compression + // DisableBlockCache allows disable use of cache.Cache functionality on + // 'sorted table' block. + // + // The default value is false. + DisableBlockCache bool + + // DisableCompactionBackoff allows disable compaction retry backoff. + // + // The default value is false. + DisableCompactionBackoff bool + // ErrorIfExist defines whether an error should returned if the DB already // exist. // @@ -159,12 +289,37 @@ type Options struct { // The default value is nil. Filter filter.Filter - // MaxOpenFiles defines maximum number of open files to kept around - // (cached). This is not an hard limit, actual open files may exceed - // the defined value. + // IteratorSamplingRate defines approximate gap (in bytes) between read + // sampling of an iterator. The samples will be used to determine when + // compaction should be triggered. + // + // The default is 1MiB. + IteratorSamplingRate int + + // MaxMemCompationLevel defines maximum level a newly compacted 'memdb' + // will be pushed into if doesn't creates overlap. This should less than + // NumLevel. Use -1 for level-0. // - // The default value is 1000. - MaxOpenFiles int + // The default is 2. + MaxMemCompationLevel int + + // NumLevel defines number of database level. The level shouldn't changed + // between opens, or the database will panic. + // + // The default is 7. + NumLevel int + + // OpenFilesCacher provides cache algorithm for open files caching. + // Specify NoCacher to disable caching algorithm. + // + // The default value is LRUCacher. + OpenFilesCacher Cacher + + // OpenFilesCacheCapacity defines the capacity of the open files caching. + // Use -1 for zero, this has same effect as specifying NoCacher to OpenFilesCacher. + // + // The default value is 500. + OpenFilesCacheCapacity int // Strict defines the DB strict level. Strict Strict @@ -177,6 +332,18 @@ type Options struct { // // The default value is 4MiB. WriteBuffer int + + // WriteL0StopTrigger defines number of 'sorted table' at level-0 that will + // pause write. + // + // The default value is 12. + WriteL0PauseTrigger int + + // WriteL0SlowdownTrigger defines number of 'sorted table' at level-0 that + // will trigger write slowdown. + // + // The default value is 8. + WriteL0SlowdownTrigger int } func (o *Options) GetAltFilters() []filter.Filter { @@ -186,11 +353,22 @@ func (o *Options) GetAltFilters() []filter.Filter { return o.AltFilters } -func (o *Options) GetBlockCache() cache.Cache { - if o == nil { +func (o *Options) GetBlockCacher() Cacher { + if o == nil || o.BlockCacher == nil { + return DefaultBlockCacher + } else if o.BlockCacher == NoCacher { return nil } - return o.BlockCache + return o.BlockCacher +} + +func (o *Options) GetBlockCacheCapacity() int { + if o == nil || o.BlockCacheCapacity == 0 { + return DefaultBlockCacheCapacity + } else if o.BlockCacheCapacity < 0 { + return 0 + } + return o.BlockCacheCapacity } func (o *Options) GetBlockRestartInterval() int { @@ -207,6 +385,79 @@ func (o *Options) GetBlockSize() int { return o.BlockSize } +func (o *Options) GetCompactionExpandLimit(level int) int { + factor := DefaultCompactionExpandLimitFactor + if o != nil && o.CompactionExpandLimitFactor > 0 { + factor = o.CompactionExpandLimitFactor + } + return o.GetCompactionTableSize(level+1) * factor +} + +func (o *Options) GetCompactionGPOverlaps(level int) int { + factor := DefaultCompactionGPOverlapsFactor + if o != nil && o.CompactionGPOverlapsFactor > 0 { + factor = o.CompactionGPOverlapsFactor + } + return o.GetCompactionTableSize(level+2) * factor +} + +func (o *Options) GetCompactionL0Trigger() int { + if o == nil || o.CompactionL0Trigger == 0 { + return DefaultCompactionL0Trigger + } + return o.CompactionL0Trigger +} + +func (o *Options) GetCompactionSourceLimit(level int) int { + factor := DefaultCompactionSourceLimitFactor + if o != nil && o.CompactionSourceLimitFactor > 0 { + factor = o.CompactionSourceLimitFactor + } + return o.GetCompactionTableSize(level+1) * factor +} + +func (o *Options) GetCompactionTableSize(level int) int { + var ( + base = DefaultCompactionTableSize + mult float64 + ) + if o != nil { + if o.CompactionTableSize > 0 { + base = o.CompactionTableSize + } + if len(o.CompactionTableSizeMultiplierPerLevel) > level && o.CompactionTableSizeMultiplierPerLevel[level] > 0 { + mult = o.CompactionTableSizeMultiplierPerLevel[level] + } else if o.CompactionTableSizeMultiplier > 0 { + mult = math.Pow(o.CompactionTableSizeMultiplier, float64(level)) + } + } + if mult == 0 { + mult = math.Pow(DefaultCompactionTableSizeMultiplier, float64(level)) + } + return int(float64(base) * mult) +} + +func (o *Options) GetCompactionTotalSize(level int) int64 { + var ( + base = DefaultCompactionTotalSize + mult float64 + ) + if o != nil { + if o.CompactionTotalSize > 0 { + base = o.CompactionTotalSize + } + if len(o.CompactionTotalSizeMultiplierPerLevel) > level && o.CompactionTotalSizeMultiplierPerLevel[level] > 0 { + mult = o.CompactionTotalSizeMultiplierPerLevel[level] + } else if o.CompactionTotalSizeMultiplier > 0 { + mult = math.Pow(o.CompactionTotalSizeMultiplier, float64(level)) + } + } + if mult == 0 { + mult = math.Pow(DefaultCompactionTotalSizeMultiplier, float64(level)) + } + return int64(float64(base) * mult) +} + func (o *Options) GetComparer() comparer.Comparer { if o == nil || o.Comparer == nil { return comparer.DefaultComparer @@ -221,6 +472,13 @@ func (o *Options) GetCompression() Compression { return o.Compression } +func (o *Options) GetDisableCompactionBackoff() bool { + if o == nil { + return false + } + return o.DisableCompactionBackoff +} + func (o *Options) GetErrorIfExist() bool { if o == nil { return false @@ -242,11 +500,52 @@ func (o *Options) GetFilter() filter.Filter { return o.Filter } -func (o *Options) GetMaxOpenFiles() int { - if o == nil || o.MaxOpenFiles <= 0 { - return DefaultMaxOpenFiles +func (o *Options) GetIteratorSamplingRate() int { + if o == nil || o.IteratorSamplingRate <= 0 { + return DefaultIteratorSamplingRate + } + return o.IteratorSamplingRate +} + +func (o *Options) GetMaxMemCompationLevel() int { + level := DefaultMaxMemCompationLevel + if o != nil { + if o.MaxMemCompationLevel > 0 { + level = o.MaxMemCompationLevel + } else if o.MaxMemCompationLevel < 0 { + level = 0 + } + } + if level >= o.GetNumLevel() { + return o.GetNumLevel() - 1 + } + return level +} + +func (o *Options) GetNumLevel() int { + if o == nil || o.NumLevel <= 0 { + return DefaultNumLevel + } + return o.NumLevel +} + +func (o *Options) GetOpenFilesCacher() Cacher { + if o == nil || o.OpenFilesCacher == nil { + return DefaultOpenFilesCacher } - return o.MaxOpenFiles + if o.OpenFilesCacher == NoCacher { + return nil + } + return o.OpenFilesCacher +} + +func (o *Options) GetOpenFilesCacheCapacity() int { + if o == nil || o.OpenFilesCacheCapacity == 0 { + return DefaultOpenFilesCacheCapacity + } else if o.OpenFilesCacheCapacity < 0 { + return 0 + } + return o.OpenFilesCacheCapacity } func (o *Options) GetStrict(strict Strict) bool { @@ -263,6 +562,20 @@ func (o *Options) GetWriteBuffer() int { return o.WriteBuffer } +func (o *Options) GetWriteL0PauseTrigger() int { + if o == nil || o.WriteL0PauseTrigger == 0 { + return DefaultWriteL0PauseTrigger + } + return o.WriteL0PauseTrigger +} + +func (o *Options) GetWriteL0SlowdownTrigger() int { + if o == nil || o.WriteL0SlowdownTrigger == 0 { + return DefaultWriteL0SlowdownTrigger + } + return o.WriteL0SlowdownTrigger +} + // ReadOptions holds the optional parameters for 'read operation'. The // 'read operation' includes Get, Find and NewIterator. type ReadOptions struct { @@ -273,8 +586,8 @@ type ReadOptions struct { // The default value is false. DontFillCache bool - // Strict overrides global DB strict level. Only StrictIterator and - // StrictBlockChecksum that does have effects here. + // Strict will be OR'ed with global DB 'strict level' unless StrictOverride + // is present. Currently only StrictReader that has effect here. Strict Strict } @@ -316,3 +629,11 @@ func (wo *WriteOptions) GetSync() bool { } return wo.Sync } + +func GetStrict(o *Options, ro *ReadOptions, strict Strict) bool { + if ro.GetStrict(StrictOverride) { + return ro.GetStrict(strict) + } else { + return o.GetStrict(strict) || ro.GetStrict(strict) + } +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/options.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/options.go index fc6a96965..a3d84ef60 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/options.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/options.go @@ -7,35 +7,86 @@ package leveldb import ( - "github.com/syndtr/goleveldb/leveldb/cache" "github.com/syndtr/goleveldb/leveldb/filter" "github.com/syndtr/goleveldb/leveldb/opt" ) -func (s *session) setOptions(o *opt.Options) { - s.o = &opt.Options{} +func dupOptions(o *opt.Options) *opt.Options { + newo := &opt.Options{} if o != nil { - *s.o = *o + *newo = *o + } + if newo.Strict == 0 { + newo.Strict = opt.DefaultStrict } + return newo +} + +func (s *session) setOptions(o *opt.Options) { + no := dupOptions(o) // Alternative filters. if filters := o.GetAltFilters(); len(filters) > 0 { - s.o.AltFilters = make([]filter.Filter, len(filters)) + no.AltFilters = make([]filter.Filter, len(filters)) for i, filter := range filters { - s.o.AltFilters[i] = &iFilter{filter} + no.AltFilters[i] = &iFilter{filter} } } - // Block cache. - switch o.GetBlockCache() { - case nil: - s.o.BlockCache = cache.NewLRUCache(opt.DefaultBlockCacheSize) - case opt.NoCache: - s.o.BlockCache = nil - } // Comparer. s.icmp = &iComparer{o.GetComparer()} - s.o.Comparer = s.icmp + no.Comparer = s.icmp // Filter. if filter := o.GetFilter(); filter != nil { - s.o.Filter = &iFilter{filter} + no.Filter = &iFilter{filter} } + + s.o = &cachedOptions{Options: no} + s.o.cache() +} + +type cachedOptions struct { + *opt.Options + + compactionExpandLimit []int + compactionGPOverlaps []int + compactionSourceLimit []int + compactionTableSize []int + compactionTotalSize []int64 +} + +func (co *cachedOptions) cache() { + numLevel := co.Options.GetNumLevel() + + co.compactionExpandLimit = make([]int, numLevel) + co.compactionGPOverlaps = make([]int, numLevel) + co.compactionSourceLimit = make([]int, numLevel) + co.compactionTableSize = make([]int, numLevel) + co.compactionTotalSize = make([]int64, numLevel) + + for level := 0; level < numLevel; level++ { + co.compactionExpandLimit[level] = co.Options.GetCompactionExpandLimit(level) + co.compactionGPOverlaps[level] = co.Options.GetCompactionGPOverlaps(level) + co.compactionSourceLimit[level] = co.Options.GetCompactionSourceLimit(level) + co.compactionTableSize[level] = co.Options.GetCompactionTableSize(level) + co.compactionTotalSize[level] = co.Options.GetCompactionTotalSize(level) + } +} + +func (co *cachedOptions) GetCompactionExpandLimit(level int) int { + return co.compactionExpandLimit[level] +} + +func (co *cachedOptions) GetCompactionGPOverlaps(level int) int { + return co.compactionGPOverlaps[level] +} + +func (co *cachedOptions) GetCompactionSourceLimit(level int) int { + return co.compactionSourceLimit[level] +} + +func (co *cachedOptions) GetCompactionTableSize(level int) int { + return co.compactionTableSize[level] +} + +func (co *cachedOptions) GetCompactionTotalSize(level int) int64 { + return co.compactionTotalSize[level] } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session.go index 6b2a61683..b3906f7fc 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session.go @@ -7,12 +7,13 @@ package leveldb import ( - "errors" + "fmt" "io" "os" "sync" "sync/atomic" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/journal" "github.com/syndtr/goleveldb/leveldb/opt" @@ -20,18 +21,31 @@ import ( "github.com/syndtr/goleveldb/leveldb/util" ) +type ErrManifestCorrupted struct { + Field string + Reason string +} + +func (e *ErrManifestCorrupted) Error() string { + return fmt.Sprintf("leveldb: manifest corrupted (field '%s'): %s", e.Field, e.Reason) +} + +func newErrManifestCorrupted(f storage.File, field, reason string) error { + return errors.NewErrCorrupted(f, &ErrManifestCorrupted{field, reason}) +} + // session represent a persistent database session. type session struct { // Need 64-bit alignment. - stFileNum uint64 // current unused file number + stNextFileNum uint64 // current unused file number stJournalNum uint64 // current journal file number; need external synchronization stPrevJournalNum uint64 // prev journal file number; no longer used; for compatibility with older version of leveldb - stSeq uint64 // last mem compacted seq; need external synchronization + stSeqNum uint64 // last mem compacted seq; need external synchronization stTempFileNum uint64 stor storage.Storage storLock util.Releaser - o *opt.Options + o *cachedOptions icmp *iComparer tops *tOps @@ -39,11 +53,12 @@ type session struct { manifestWriter storage.Writer manifestFile storage.File - stCPtrs [kNumLevels]iKey // compact pointers; need external synchronization - stVersion *version // current version - vmu sync.Mutex + stCompPtrs []iKey // compaction pointers; need external synchronization + stVersion *version // current version + vmu sync.Mutex } +// Creates new initialized session instance. func newSession(stor storage.Storage, o *opt.Options) (s *session, err error) { if stor == nil { return nil, os.ErrInvalid @@ -53,22 +68,20 @@ func newSession(stor storage.Storage, o *opt.Options) (s *session, err error) { return } s = &session{ - stor: stor, - storLock: storLock, + stor: stor, + storLock: storLock, + stCompPtrs: make([]iKey, o.GetNumLevel()), } s.setOptions(o) - s.tops = newTableOps(s, s.o.GetMaxOpenFiles()) - s.setVersion(&version{s: s}) - s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock D·DeletedEntry L·Level Q·SeqNum T·TimeElapsed") + s.tops = newTableOps(s) + s.setVersion(newVersion(s)) + s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock Ke·KeyError D·DroppedEntry L·Level Q·SeqNum T·TimeElapsed") return } // Close session. func (s *session) close() { s.tops.close() - if bc := s.o.GetBlockCache(); bc != nil { - bc.Purge(nil) - } if s.manifest != nil { s.manifest.Close() } @@ -81,6 +94,7 @@ func (s *session) close() { s.stVersion = nil } +// Release session lock. func (s *session) release() { s.storLock.Release() } @@ -98,26 +112,26 @@ func (s *session) recover() (err error) { // Don't return os.ErrNotExist if the underlying storage contains // other files that belong to LevelDB. So the DB won't get trashed. if files, _ := s.stor.GetFiles(storage.TypeAll); len(files) > 0 { - err = ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest file missing")} + err = &errors.ErrCorrupted{File: &storage.FileInfo{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}} } } }() - file, err := s.stor.GetManifest() + m, err := s.stor.GetManifest() if err != nil { return } - reader, err := file.Open() + reader, err := m.Open() if err != nil { return } defer reader.Close() strict := s.o.GetStrict(opt.StrictManifest) - jr := journal.NewReader(reader, dropper{s, file}, strict, true) + jr := journal.NewReader(reader, dropper{s, m}, strict, true) - staging := s.version_NB().newStaging() - rec := &sessionRecord{} + staging := s.stVersion.newStaging() + rec := &sessionRecord{numLevel: s.o.GetNumLevel()} for { var r io.Reader r, err = jr.Next() @@ -126,51 +140,57 @@ func (s *session) recover() (err error) { err = nil break } - return + return errors.SetFile(err, m) } err = rec.decode(r) if err == nil { // save compact pointers - for _, rp := range rec.compactionPointers { - s.stCPtrs[rp.level] = iKey(rp.key) + for _, r := range rec.compPtrs { + s.stCompPtrs[r.level] = iKey(r.ikey) } // commit record to version staging staging.commit(rec) - } else if strict { - return ErrCorrupted{Type: CorruptedManifest, Err: err} } else { - s.logf("manifest error: %v (skipped)", err) + err = errors.SetFile(err, m) + if strict || !errors.IsCorrupted(err) { + return + } else { + s.logf("manifest error: %v (skipped)", errors.SetFile(err, m)) + } } - rec.resetCompactionPointers() + rec.resetCompPtrs() rec.resetAddedTables() rec.resetDeletedTables() } switch { case !rec.has(recComparer): - return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing comparer name")} + return newErrManifestCorrupted(m, "comparer", "missing") case rec.comparer != s.icmp.uName(): - return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: comparer mismatch, " + "want '" + s.icmp.uName() + "', " + "got '" + rec.comparer + "'")} - case !rec.has(recNextNum): - return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing next file number")} + return newErrManifestCorrupted(m, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer)) + case !rec.has(recNextFileNum): + return newErrManifestCorrupted(m, "next-file-num", "missing") case !rec.has(recJournalNum): - return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing journal file number")} - case !rec.has(recSeq): - return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing seq number")} + return newErrManifestCorrupted(m, "journal-file-num", "missing") + case !rec.has(recSeqNum): + return newErrManifestCorrupted(m, "seq-num", "missing") } - s.manifestFile = file + s.manifestFile = m s.setVersion(staging.finish()) - s.setFileNum(rec.nextNum) + s.setNextFileNum(rec.nextFileNum) s.recordCommited(rec) return nil } // Commit session; need external synchronization. func (s *session) commit(r *sessionRecord) (err error) { + v := s.version() + defer v.release() + // spawn new version based on current version - nv := s.version_NB().spawn(r) + nv := v.spawn(r) if s.manifest == nil { // manifest journal writer not yet created, create one @@ -189,22 +209,22 @@ func (s *session) commit(r *sessionRecord) (err error) { // Pick a compaction based on current state; need external synchronization. func (s *session) pickCompaction() *compaction { - v := s.version_NB() + v := s.version() var level int var t0 tFiles if v.cScore >= 1 { level = v.cLevel - cp := s.stCPtrs[level] - tt := v.tables[level] - for _, t := range tt { - if cp == nil || s.icmp.Compare(t.max, cp) > 0 { + cptr := s.stCompPtrs[level] + tables := v.tables[level] + for _, t := range tables { + if cptr == nil || s.icmp.Compare(t.imax, cptr) > 0 { t0 = append(t0, t) break } } if len(t0) == 0 { - t0 = append(t0, tt[0]) + t0 = append(t0, tables[0]) } } else { if p := atomic.LoadPointer(&v.cSeek); p != nil { @@ -212,29 +232,21 @@ func (s *session) pickCompaction() *compaction { level = ts.level t0 = append(t0, ts.table) } else { + v.release() return nil } } - c := &compaction{s: s, version: v, level: level} - if level == 0 { - min, max := t0.getRange(s.icmp) - t0 = nil - v.tables[0].getOverlaps(min.ukey(), max.ukey(), &t0, false, s.icmp.ucmp) - } - - c.tables[0] = t0 - c.expand() - return c + return newCompaction(s, v, level, t0) } // Create compaction from given level and range; need external synchronization. -func (s *session) getCompactionRange(level int, min, max []byte) *compaction { - v := s.version_NB() +func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction { + v := s.version() - var t0 tFiles - v.tables[level].getOverlaps(min, max, &t0, level != 0, s.icmp.ucmp) + t0 := v.tables[level].getOverlaps(nil, s.icmp, umin, umax, level == 0) if len(t0) == 0 { + v.release() return nil } @@ -243,7 +255,7 @@ func (s *session) getCompactionRange(level int, min, max []byte) *compaction { // and we must not pick one file and drop another older file if the // two files overlap. if level > 0 { - limit := uint64(kMaxTableSize) + limit := uint64(v.s.o.GetCompactionSourceLimit(level)) total := uint64(0) for i, t := range t0 { total += t.size @@ -255,90 +267,124 @@ func (s *session) getCompactionRange(level int, min, max []byte) *compaction { } } - c := &compaction{s: s, version: v, level: level} - c.tables[0] = t0 + return newCompaction(s, v, level, t0) +} + +func newCompaction(s *session, v *version, level int, t0 tFiles) *compaction { + c := &compaction{ + s: s, + v: v, + level: level, + tables: [2]tFiles{t0, nil}, + maxGPOverlaps: uint64(s.o.GetCompactionGPOverlaps(level)), + tPtrs: make([]int, s.o.GetNumLevel()), + } c.expand() + c.save() return c } -// compaction represent a compaction state +// compaction represent a compaction state. type compaction struct { - s *session - version *version + s *session + v *version + + level int + tables [2]tFiles + maxGPOverlaps uint64 + + gp tFiles + gpi int + seenKey bool + gpOverlappedBytes uint64 + imin, imax iKey + tPtrs []int + released bool + + snapGPI int + snapSeenKey bool + snapGPOverlappedBytes uint64 + snapTPtrs []int +} - level int - tables [2]tFiles +func (c *compaction) save() { + c.snapGPI = c.gpi + c.snapSeenKey = c.seenKey + c.snapGPOverlappedBytes = c.gpOverlappedBytes + c.snapTPtrs = append(c.snapTPtrs[:0], c.tPtrs...) +} - gp tFiles - gpidx int - seenKey bool - overlappedBytes uint64 - min, max iKey +func (c *compaction) restore() { + c.gpi = c.snapGPI + c.seenKey = c.snapSeenKey + c.gpOverlappedBytes = c.snapGPOverlappedBytes + c.tPtrs = append(c.tPtrs[:0], c.snapTPtrs...) +} - tPtrs [kNumLevels]int +func (c *compaction) release() { + if !c.released { + c.released = true + c.v.release() + } } // Expand compacted tables; need external synchronization. func (c *compaction) expand() { - s := c.s - v := c.version - - level := c.level - vt0, vt1 := v.tables[level], v.tables[level+1] + limit := uint64(c.s.o.GetCompactionExpandLimit(c.level)) + vt0, vt1 := c.v.tables[c.level], c.v.tables[c.level+1] t0, t1 := c.tables[0], c.tables[1] - min, max := t0.getRange(s.icmp) - vt1.getOverlaps(min.ukey(), max.ukey(), &t1, true, s.icmp.ucmp) - - // Get entire range covered by compaction - amin, amax := append(t0, t1...).getRange(s.icmp) + imin, imax := t0.getRange(c.s.icmp) + // We expand t0 here just incase ukey hop across tables. + t0 = vt0.getOverlaps(t0, c.s.icmp, imin.ukey(), imax.ukey(), c.level == 0) + if len(t0) != len(c.tables[0]) { + imin, imax = t0.getRange(c.s.icmp) + } + t1 = vt1.getOverlaps(t1, c.s.icmp, imin.ukey(), imax.ukey(), false) + // Get entire range covered by compaction. + amin, amax := append(t0, t1...).getRange(c.s.icmp) // See if we can grow the number of inputs in "level" without // changing the number of "level+1" files we pick up. if len(t1) > 0 { - var exp0 tFiles - vt0.getOverlaps(amin.ukey(), amax.ukey(), &exp0, level != 0, s.icmp.ucmp) - if len(exp0) > len(t0) && t1.size()+exp0.size() < kExpCompactionMaxBytes { - var exp1 tFiles - xmin, xmax := exp0.getRange(s.icmp) - vt1.getOverlaps(xmin.ukey(), xmax.ukey(), &exp1, true, s.icmp.ucmp) + exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), c.level == 0) + if len(exp0) > len(t0) && t1.size()+exp0.size() < limit { + xmin, xmax := exp0.getRange(c.s.icmp) + exp1 := vt1.getOverlaps(nil, c.s.icmp, xmin.ukey(), xmax.ukey(), false) if len(exp1) == len(t1) { - s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)", - level, level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())), + c.s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)", + c.level, c.level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())), len(exp0), shortenb(int(exp0.size())), len(exp1), shortenb(int(exp1.size()))) - min, max = xmin, xmax + imin, imax = xmin, xmax t0, t1 = exp0, exp1 - amin, amax = append(t0, t1...).getRange(s.icmp) + amin, amax = append(t0, t1...).getRange(c.s.icmp) } } } // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) - if level+2 < kNumLevels { - v.tables[level+2].getOverlaps(amin.ukey(), amax.ukey(), &c.gp, true, s.icmp.ucmp) + if c.level+2 < c.s.o.GetNumLevel() { + c.gp = c.v.tables[c.level+2].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false) } c.tables[0], c.tables[1] = t0, t1 - c.min, c.max = min, max + c.imin, c.imax = imin, imax } // Check whether compaction is trivial. func (c *compaction) trivial() bool { - return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= kMaxGrandParentOverlapBytes + return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= c.maxGPOverlaps } -func (c *compaction) isBaseLevelForKey(key []byte) bool { - s := c.s - v := c.version - - for level, tt := range v.tables[c.level+2:] { - for c.tPtrs[level] < len(tt) { - t := tt[c.tPtrs[level]] - if s.icmp.uCompare(key, t.max.ukey()) <= 0 { - // We've advanced far enough - if s.icmp.uCompare(key, t.min.ukey()) >= 0 { - // Key falls in this file's range, so definitely not base level +func (c *compaction) baseLevelForKey(ukey []byte) bool { + for level, tables := range c.v.tables[c.level+2:] { + for c.tPtrs[level] < len(tables) { + t := tables[c.tPtrs[level]] + if c.s.icmp.uCompare(ukey, t.imax.ukey()) <= 0 { + // We've advanced far enough. + if c.s.icmp.uCompare(ukey, t.imin.ukey()) >= 0 { + // Key falls in this file's range, so definitely not base level. return false } break @@ -349,55 +395,61 @@ func (c *compaction) isBaseLevelForKey(key []byte) bool { return true } -func (c *compaction) shouldStopBefore(key iKey) bool { - for ; c.gpidx < len(c.gp); c.gpidx++ { - gp := c.gp[c.gpidx] - if c.s.icmp.Compare(key, gp.max) <= 0 { +func (c *compaction) shouldStopBefore(ikey iKey) bool { + for ; c.gpi < len(c.gp); c.gpi++ { + gp := c.gp[c.gpi] + if c.s.icmp.Compare(ikey, gp.imax) <= 0 { break } if c.seenKey { - c.overlappedBytes += gp.size + c.gpOverlappedBytes += gp.size } } c.seenKey = true - if c.overlappedBytes > kMaxGrandParentOverlapBytes { - // Too much overlap for current output; start new output - c.overlappedBytes = 0 + if c.gpOverlappedBytes > c.maxGPOverlaps { + // Too much overlap for current output; start new output. + c.gpOverlappedBytes = 0 return true } return false } +// Creates an iterator. func (c *compaction) newIterator() iterator.Iterator { - s := c.s - - level := c.level - icap := 2 + // Creates iterator slice. + icap := len(c.tables) if c.level == 0 { + // Special case for level-0 icap = len(c.tables[0]) + 1 } its := make([]iterator.Iterator, 0, icap) + // Options. ro := &opt.ReadOptions{ DontFillCache: true, + Strict: opt.StrictOverride, + } + strict := c.s.o.GetStrict(opt.StrictCompaction) + if strict { + ro.Strict |= opt.StrictReader } - strict := s.o.GetStrict(opt.StrictIterator) - for i, tt := range c.tables { - if len(tt) == 0 { + for i, tables := range c.tables { + if len(tables) == 0 { continue } - if level+i == 0 { - for _, t := range tt { - its = append(its, s.tops.newIterator(t, nil, ro)) + // Level-0 is not sorted and may overlaps each other. + if c.level+i == 0 { + for _, t := range tables { + its = append(its, c.s.tops.newIterator(t, nil, ro)) } } else { - it := iterator.NewIndexedIterator(tt.newIndexIterator(s.tops, s.icmp, nil, ro), strict, true) + it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict) its = append(its, it) } } - return iterator.NewMergedIterator(its, s.icmp, true) + return iterator.NewMergedIterator(its, c.s.icmp, strict) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record.go index c50fda737..1bdcc68f5 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record.go @@ -9,11 +9,11 @@ package leveldb import ( "bufio" "encoding/binary" - "errors" "io" -) + "strings" -var errCorruptManifest = errors.New("leveldb: corrupt manifest") + "github.com/syndtr/goleveldb/leveldb/errors" +) type byteReader interface { io.Reader @@ -22,32 +22,28 @@ type byteReader interface { // These numbers are written to disk and should not be changed. const ( - recComparer = 1 - recJournalNum = 2 - recNextNum = 3 - recSeq = 4 - recCompactionPointer = 5 - recDeletedTable = 6 - recNewTable = 7 + recComparer = 1 + recJournalNum = 2 + recNextFileNum = 3 + recSeqNum = 4 + recCompPtr = 5 + recDelTable = 6 + recAddTable = 7 // 8 was used for large value refs recPrevJournalNum = 9 ) type cpRecord struct { level int - key iKey + ikey iKey } -type ntRecord struct { +type atRecord struct { level int num uint64 size uint64 - min iKey - max iKey -} - -func (r ntRecord) makeFile(s *session) *tFile { - return newTFile(s.getTableFile(r.num), r.size, r.min, r.max) + imin iKey + imax iKey } type dtRecord struct { @@ -56,17 +52,20 @@ type dtRecord struct { } type sessionRecord struct { - hasRec int - comparer string - journalNum uint64 - prevJournalNum uint64 - nextNum uint64 - seq uint64 - compactionPointers []cpRecord - addedTables []ntRecord - deletedTables []dtRecord - scratch [binary.MaxVarintLen64]byte - err error + numLevel int + + hasRec int + comparer string + journalNum uint64 + prevJournalNum uint64 + nextFileNum uint64 + seqNum uint64 + compPtrs []cpRecord + addedTables []atRecord + deletedTables []dtRecord + + scratch [binary.MaxVarintLen64]byte + err error } func (p *sessionRecord) has(rec int) bool { @@ -88,47 +87,47 @@ func (p *sessionRecord) setPrevJournalNum(num uint64) { p.prevJournalNum = num } -func (p *sessionRecord) setNextNum(num uint64) { - p.hasRec |= 1 << recNextNum - p.nextNum = num +func (p *sessionRecord) setNextFileNum(num uint64) { + p.hasRec |= 1 << recNextFileNum + p.nextFileNum = num } -func (p *sessionRecord) setSeq(seq uint64) { - p.hasRec |= 1 << recSeq - p.seq = seq +func (p *sessionRecord) setSeqNum(num uint64) { + p.hasRec |= 1 << recSeqNum + p.seqNum = num } -func (p *sessionRecord) addCompactionPointer(level int, key iKey) { - p.hasRec |= 1 << recCompactionPointer - p.compactionPointers = append(p.compactionPointers, cpRecord{level, key}) +func (p *sessionRecord) addCompPtr(level int, ikey iKey) { + p.hasRec |= 1 << recCompPtr + p.compPtrs = append(p.compPtrs, cpRecord{level, ikey}) } -func (p *sessionRecord) resetCompactionPointers() { - p.hasRec &= ^(1 << recCompactionPointer) - p.compactionPointers = p.compactionPointers[:0] +func (p *sessionRecord) resetCompPtrs() { + p.hasRec &= ^(1 << recCompPtr) + p.compPtrs = p.compPtrs[:0] } -func (p *sessionRecord) addTable(level int, num, size uint64, min, max iKey) { - p.hasRec |= 1 << recNewTable - p.addedTables = append(p.addedTables, ntRecord{level, num, size, min, max}) +func (p *sessionRecord) addTable(level int, num, size uint64, imin, imax iKey) { + p.hasRec |= 1 << recAddTable + p.addedTables = append(p.addedTables, atRecord{level, num, size, imin, imax}) } func (p *sessionRecord) addTableFile(level int, t *tFile) { - p.addTable(level, t.file.Num(), t.size, t.min, t.max) + p.addTable(level, t.file.Num(), t.size, t.imin, t.imax) } func (p *sessionRecord) resetAddedTables() { - p.hasRec &= ^(1 << recNewTable) + p.hasRec &= ^(1 << recAddTable) p.addedTables = p.addedTables[:0] } -func (p *sessionRecord) deleteTable(level int, num uint64) { - p.hasRec |= 1 << recDeletedTable +func (p *sessionRecord) delTable(level int, num uint64) { + p.hasRec |= 1 << recDelTable p.deletedTables = append(p.deletedTables, dtRecord{level, num}) } func (p *sessionRecord) resetDeletedTables() { - p.hasRec &= ^(1 << recDeletedTable) + p.hasRec &= ^(1 << recDelTable) p.deletedTables = p.deletedTables[:0] } @@ -161,43 +160,45 @@ func (p *sessionRecord) encode(w io.Writer) error { p.putUvarint(w, recJournalNum) p.putUvarint(w, p.journalNum) } - if p.has(recNextNum) { - p.putUvarint(w, recNextNum) - p.putUvarint(w, p.nextNum) + if p.has(recNextFileNum) { + p.putUvarint(w, recNextFileNum) + p.putUvarint(w, p.nextFileNum) } - if p.has(recSeq) { - p.putUvarint(w, recSeq) - p.putUvarint(w, p.seq) + if p.has(recSeqNum) { + p.putUvarint(w, recSeqNum) + p.putUvarint(w, p.seqNum) } - for _, cp := range p.compactionPointers { - p.putUvarint(w, recCompactionPointer) - p.putUvarint(w, uint64(cp.level)) - p.putBytes(w, cp.key) + for _, r := range p.compPtrs { + p.putUvarint(w, recCompPtr) + p.putUvarint(w, uint64(r.level)) + p.putBytes(w, r.ikey) } - for _, t := range p.deletedTables { - p.putUvarint(w, recDeletedTable) - p.putUvarint(w, uint64(t.level)) - p.putUvarint(w, t.num) + for _, r := range p.deletedTables { + p.putUvarint(w, recDelTable) + p.putUvarint(w, uint64(r.level)) + p.putUvarint(w, r.num) } - for _, t := range p.addedTables { - p.putUvarint(w, recNewTable) - p.putUvarint(w, uint64(t.level)) - p.putUvarint(w, t.num) - p.putUvarint(w, t.size) - p.putBytes(w, t.min) - p.putBytes(w, t.max) + for _, r := range p.addedTables { + p.putUvarint(w, recAddTable) + p.putUvarint(w, uint64(r.level)) + p.putUvarint(w, r.num) + p.putUvarint(w, r.size) + p.putBytes(w, r.imin) + p.putBytes(w, r.imax) } return p.err } -func (p *sessionRecord) readUvarint(r io.ByteReader) uint64 { +func (p *sessionRecord) readUvarintMayEOF(field string, r io.ByteReader, mayEOF bool) uint64 { if p.err != nil { return 0 } x, err := binary.ReadUvarint(r) if err != nil { - if err == io.EOF { - p.err = errCorruptManifest + if err == io.ErrUnexpectedEOF || (mayEOF == false && err == io.EOF) { + p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"}) + } else if strings.HasPrefix(err.Error(), "binary:") { + p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, err.Error()}) } else { p.err = err } @@ -206,35 +207,39 @@ func (p *sessionRecord) readUvarint(r io.ByteReader) uint64 { return x } -func (p *sessionRecord) readBytes(r byteReader) []byte { +func (p *sessionRecord) readUvarint(field string, r io.ByteReader) uint64 { + return p.readUvarintMayEOF(field, r, false) +} + +func (p *sessionRecord) readBytes(field string, r byteReader) []byte { if p.err != nil { return nil } - n := p.readUvarint(r) + n := p.readUvarint(field, r) if p.err != nil { return nil } x := make([]byte, n) _, p.err = io.ReadFull(r, x) if p.err != nil { - if p.err == io.EOF { - p.err = errCorruptManifest + if p.err == io.ErrUnexpectedEOF { + p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"}) } return nil } return x } -func (p *sessionRecord) readLevel(r io.ByteReader) int { +func (p *sessionRecord) readLevel(field string, r io.ByteReader) int { if p.err != nil { return 0 } - x := p.readUvarint(r) + x := p.readUvarint(field, r) if p.err != nil { return 0 } - if x >= kNumLevels { - p.err = errCorruptManifest + if x >= uint64(p.numLevel) { + p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "invalid level number"}) return 0 } return int(x) @@ -247,59 +252,59 @@ func (p *sessionRecord) decode(r io.Reader) error { } p.err = nil for p.err == nil { - rec, err := binary.ReadUvarint(br) - if err != nil { - if err == io.EOF { - err = nil + rec := p.readUvarintMayEOF("field-header", br, true) + if p.err != nil { + if p.err == io.EOF { + return nil } - return err + return p.err } switch rec { case recComparer: - x := p.readBytes(br) + x := p.readBytes("comparer", br) if p.err == nil { p.setComparer(string(x)) } case recJournalNum: - x := p.readUvarint(br) + x := p.readUvarint("journal-num", br) if p.err == nil { p.setJournalNum(x) } case recPrevJournalNum: - x := p.readUvarint(br) + x := p.readUvarint("prev-journal-num", br) if p.err == nil { p.setPrevJournalNum(x) } - case recNextNum: - x := p.readUvarint(br) + case recNextFileNum: + x := p.readUvarint("next-file-num", br) if p.err == nil { - p.setNextNum(x) + p.setNextFileNum(x) } - case recSeq: - x := p.readUvarint(br) + case recSeqNum: + x := p.readUvarint("seq-num", br) if p.err == nil { - p.setSeq(x) + p.setSeqNum(x) } - case recCompactionPointer: - level := p.readLevel(br) - key := p.readBytes(br) + case recCompPtr: + level := p.readLevel("comp-ptr.level", br) + ikey := p.readBytes("comp-ptr.ikey", br) if p.err == nil { - p.addCompactionPointer(level, iKey(key)) + p.addCompPtr(level, iKey(ikey)) } - case recNewTable: - level := p.readLevel(br) - num := p.readUvarint(br) - size := p.readUvarint(br) - min := p.readBytes(br) - max := p.readBytes(br) + case recAddTable: + level := p.readLevel("add-table.level", br) + num := p.readUvarint("add-table.num", br) + size := p.readUvarint("add-table.size", br) + imin := p.readBytes("add-table.imin", br) + imax := p.readBytes("add-table.imax", br) if p.err == nil { - p.addTable(level, num, size, min, max) + p.addTable(level, num, size, imin, imax) } - case recDeletedTable: - level := p.readLevel(br) - num := p.readUvarint(br) + case recDelTable: + level := p.readLevel("del-table.level", br) + num := p.readUvarint("del-table.num", br) if p.err == nil { - p.deleteTable(level, num) + p.delTable(level, num) } } } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record_test.go index 029fabfe6..c0c035ae3 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record_test.go @@ -9,6 +9,8 @@ package leveldb import ( "bytes" "testing" + + "github.com/syndtr/goleveldb/leveldb/opt" ) func decodeEncode(v *sessionRecord) (res bool, err error) { @@ -17,7 +19,7 @@ func decodeEncode(v *sessionRecord) (res bool, err error) { if err != nil { return } - v2 := new(sessionRecord) + v2 := &sessionRecord{numLevel: opt.DefaultNumLevel} err = v.decode(b) if err != nil { return @@ -32,7 +34,7 @@ func decodeEncode(v *sessionRecord) (res bool, err error) { func TestSessionRecord_EncodeDecode(t *testing.T) { big := uint64(1) << 50 - v := new(sessionRecord) + v := &sessionRecord{numLevel: opt.DefaultNumLevel} i := uint64(0) test := func() { res, err := decodeEncode(v) @@ -47,16 +49,16 @@ func TestSessionRecord_EncodeDecode(t *testing.T) { for ; i < 4; i++ { test() v.addTable(3, big+300+i, big+400+i, - newIKey([]byte("foo"), big+500+1, tVal), - newIKey([]byte("zoo"), big+600+1, tDel)) - v.deleteTable(4, big+700+i) - v.addCompactionPointer(int(i), newIKey([]byte("x"), big+900+1, tVal)) + newIkey([]byte("foo"), big+500+1, ktVal), + newIkey([]byte("zoo"), big+600+1, ktDel)) + v.delTable(4, big+700+i) + v.addCompPtr(int(i), newIkey([]byte("x"), big+900+1, ktVal)) } v.setComparer("foo") v.setJournalNum(big + 100) v.setPrevJournalNum(big + 99) - v.setNextNum(big + 200) - v.setSeq(big + 1000) + v.setNextFileNum(big + 200) + v.setSeqNum(big + 1000) test() } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_util.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_util.go index bf412b030..007c02cde 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_util.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_util.go @@ -14,7 +14,7 @@ import ( "github.com/syndtr/goleveldb/leveldb/storage" ) -// logging +// Logging. type dropper struct { s *session @@ -22,22 +22,17 @@ type dropper struct { } func (d dropper) Drop(err error) { - if e, ok := err.(journal.DroppedError); ok { + if e, ok := err.(*journal.ErrCorrupted); ok { d.s.logf("journal@drop %s-%d S·%s %q", d.file.Type(), d.file.Num(), shortenb(e.Size), e.Reason) } else { d.s.logf("journal@drop %s-%d %q", d.file.Type(), d.file.Num(), err) } } -func (s *session) log(v ...interface{}) { - s.stor.Log(fmt.Sprint(v...)) -} - -func (s *session) logf(format string, v ...interface{}) { - s.stor.Log(fmt.Sprintf(format, v...)) -} +func (s *session) log(v ...interface{}) { s.stor.Log(fmt.Sprint(v...)) } +func (s *session) logf(format string, v ...interface{}) { s.stor.Log(fmt.Sprintf(format, v...)) } -// file utils +// File utils. func (s *session) getJournalFile(num uint64) storage.File { return s.stor.GetFile(num, storage.TypeJournal) @@ -56,9 +51,14 @@ func (s *session) newTemp() storage.File { return s.stor.GetFile(num, storage.TypeTemp) } -// session state +func (s *session) tableFileFromRecord(r atRecord) *tFile { + return newTableFile(s.getTableFile(r.num), r.size, r.imin, r.imax) +} + +// Session state. -// Get current version. +// Get current version. This will incr version ref, must call +// version.release (exactly once) after use. func (s *session) version() *version { s.vmu.Lock() defer s.vmu.Unlock() @@ -66,85 +66,80 @@ func (s *session) version() *version { return s.stVersion } -// Get current version; no barrier. -func (s *session) version_NB() *version { - return s.stVersion -} - // Set current version to v. func (s *session) setVersion(v *version) { s.vmu.Lock() - v.ref = 1 + v.ref = 1 // Holds by session. if old := s.stVersion; old != nil { - v.ref++ + v.ref++ // Holds by old version. old.next = v - old.release_NB() + old.releaseNB() } s.stVersion = v s.vmu.Unlock() } // Get current unused file number. -func (s *session) fileNum() uint64 { - return atomic.LoadUint64(&s.stFileNum) +func (s *session) nextFileNum() uint64 { + return atomic.LoadUint64(&s.stNextFileNum) } -// Get current unused file number to num. -func (s *session) setFileNum(num uint64) { - atomic.StoreUint64(&s.stFileNum, num) +// Set current unused file number to num. +func (s *session) setNextFileNum(num uint64) { + atomic.StoreUint64(&s.stNextFileNum, num) } // Mark file number as used. func (s *session) markFileNum(num uint64) { - num += 1 + nextFileNum := num + 1 for { - old, x := s.stFileNum, num + old, x := s.stNextFileNum, nextFileNum if old > x { x = old } - if atomic.CompareAndSwapUint64(&s.stFileNum, old, x) { + if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) { break } } } // Allocate a file number. -func (s *session) allocFileNum() (num uint64) { - return atomic.AddUint64(&s.stFileNum, 1) - 1 +func (s *session) allocFileNum() uint64 { + return atomic.AddUint64(&s.stNextFileNum, 1) - 1 } // Reuse given file number. func (s *session) reuseFileNum(num uint64) { for { - old, x := s.stFileNum, num + old, x := s.stNextFileNum, num if old != x+1 { x = old } - if atomic.CompareAndSwapUint64(&s.stFileNum, old, x) { + if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) { break } } } -// manifest related utils +// Manifest related utils. // Fill given session record obj with current states; need external // synchronization. func (s *session) fillRecord(r *sessionRecord, snapshot bool) { - r.setNextNum(s.fileNum()) + r.setNextFileNum(s.nextFileNum()) if snapshot { if !r.has(recJournalNum) { r.setJournalNum(s.stJournalNum) } - if !r.has(recSeq) { - r.setSeq(s.stSeq) + if !r.has(recSeqNum) { + r.setSeqNum(s.stSeqNum) } - for level, ik := range s.stCPtrs { + for level, ik := range s.stCompPtrs { if ik != nil { - r.addCompactionPointer(level, ik) + r.addCompPtr(level, ik) } } @@ -152,7 +147,7 @@ func (s *session) fillRecord(r *sessionRecord, snapshot bool) { } } -// Mark if record has been commited, this will update session state; +// Mark if record has been committed, this will update session state; // need external synchronization. func (s *session) recordCommited(r *sessionRecord) { if r.has(recJournalNum) { @@ -163,12 +158,12 @@ func (s *session) recordCommited(r *sessionRecord) { s.stPrevJournalNum = r.prevJournalNum } - if r.has(recSeq) { - s.stSeq = r.seq + if r.has(recSeqNum) { + s.stSeqNum = r.seqNum } - for _, p := range r.compactionPointers { - s.stCPtrs[p.level] = iKey(p.key) + for _, p := range r.compPtrs { + s.stCompPtrs[p.level] = iKey(p.ikey) } } @@ -183,10 +178,11 @@ func (s *session) newManifest(rec *sessionRecord, v *version) (err error) { jw := journal.NewWriter(writer) if v == nil { - v = s.version_NB() + v = s.version() + defer v.release() } if rec == nil { - rec = new(sessionRecord) + rec = &sessionRecord{numLevel: s.o.GetNumLevel()} } s.fillRecord(rec, true) v.fillRecord(rec) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go index 75439f6db..46cc9d070 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go @@ -221,7 +221,7 @@ func (fs *fileStorage) GetManifest() (f File, err error) { fs.log(fmt.Sprintf("skipping %s: invalid file name", fn)) continue } - if _, e1 := strconv.ParseUint(fn[7:], 10, 0); e1 != nil { + if _, e1 := strconv.ParseUint(fn[8:], 10, 0); e1 != nil { fs.log(fmt.Sprintf("skipping %s: invalid file num: %v", fn, e1)) continue } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go new file mode 100644 index 000000000..102031bfd --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go @@ -0,0 +1,68 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// +build solaris + +package storage + +import ( + "os" + "syscall" +) + +type unixFileLock struct { + f *os.File +} + +func (fl *unixFileLock) release() error { + if err := setFileLock(fl.f, false); err != nil { + return err + } + return fl.f.Close() +} + +func newFileLock(path string) (fl fileLock, err error) { + f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0644) + if err != nil { + return + } + err = setFileLock(f, true) + if err != nil { + f.Close() + return + } + fl = &unixFileLock{f: f} + return +} + +func setFileLock(f *os.File, lock bool) error { + flock := syscall.Flock_t{ + Type: syscall.F_UNLCK, + Start: 0, + Len: 0, + Whence: 1, + } + if lock { + flock.Type = syscall.F_WRLCK + } + return syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &flock) +} + +func rename(oldpath, newpath string) error { + return os.Rename(oldpath, newpath) +} + +func syncDir(name string) error { + f, err := os.Open(name) + if err != nil { + return err + } + defer f.Close() + if err := f.Sync(); err != nil { + return err + } + return nil +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go index 73499afc8..d0a604b7a 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go @@ -4,7 +4,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -// +build darwin freebsd linux netbsd openbsd +// +build darwin dragonfly freebsd linux netbsd openbsd package storage diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/storage.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/storage.go index de5694888..85dd70b06 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/storage.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/storage.go @@ -67,7 +67,7 @@ type Writer interface { Syncer } -// File is the file. +// File is the file. A file instance must be goroutine-safe. type File interface { // Open opens the file for read. Returns os.ErrNotExist error // if the file does not exist. @@ -94,7 +94,7 @@ type File interface { Remove() error } -// Storage is the storage. +// Storage is the storage. A storage instance must be goroutine-safe. type Storage interface { // Lock locks the storage. Any subsequent attempt to call Lock will fail // until the last lock released. @@ -125,3 +125,33 @@ type Storage interface { // Other methods should not be called after the storage has been closed. Close() error } + +// FileInfo wraps basic file info. +type FileInfo struct { + Type FileType + Num uint64 +} + +func (fi FileInfo) String() string { + switch fi.Type { + case TypeManifest: + return fmt.Sprintf("MANIFEST-%06d", fi.Num) + case TypeJournal: + return fmt.Sprintf("%06d.log", fi.Num) + case TypeTable: + return fmt.Sprintf("%06d.ldb", fi.Num) + case TypeTemp: + return fmt.Sprintf("%06d.tmp", fi.Num) + default: + return fmt.Sprintf("%#x-%d", fi.Type, fi.Num) + } +} + +// NewFileInfo creates new FileInfo from the given File. It will returns nil +// if File is nil. +func NewFileInfo(f File) *FileInfo { + if f == nil { + return nil + } + return &FileInfo{f.Type(), f.Num()} +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage_test.go index 27e76d707..dc1f1fb54 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage_test.go @@ -11,6 +11,7 @@ import ( "fmt" "io" "io/ioutil" + "math/rand" "os" "path/filepath" "sync" @@ -28,11 +29,25 @@ var ( ) var ( - tsFSEnv = os.Getenv("GOLEVELDB_USEFS") - tsKeepFS = tsFSEnv == "2" - tsFS = tsKeepFS || tsFSEnv == "" || tsFSEnv == "1" - tsMU = &sync.Mutex{} - tsNum = 0 + tsFSEnv = os.Getenv("GOLEVELDB_USEFS") + tsTempdir = os.Getenv("GOLEVELDB_TEMPDIR") + tsKeepFS = tsFSEnv == "2" + tsFS = tsKeepFS || tsFSEnv == "" || tsFSEnv == "1" + tsMU = &sync.Mutex{} + tsNum = 0 +) + +type tsOp uint + +const ( + tsOpOpen tsOp = iota + tsOpCreate + tsOpRead + tsOpReadAt + tsOpWrite + tsOpSync + + tsOpNum ) type tsLock struct { @@ -53,6 +68,9 @@ type tsReader struct { func (tr tsReader) Read(b []byte) (n int, err error) { ts := tr.tf.ts ts.countRead(tr.tf.Type()) + if tr.tf.shouldErrLocked(tsOpRead) { + return 0, errors.New("leveldb.testStorage: emulated read error") + } n, err = tr.Reader.Read(b) if err != nil && err != io.EOF { ts.t.Errorf("E: read error, num=%d type=%v n=%d: %v", tr.tf.Num(), tr.tf.Type(), n, err) @@ -63,6 +81,9 @@ func (tr tsReader) Read(b []byte) (n int, err error) { func (tr tsReader) ReadAt(b []byte, off int64) (n int, err error) { ts := tr.tf.ts ts.countRead(tr.tf.Type()) + if tr.tf.shouldErrLocked(tsOpReadAt) { + return 0, errors.New("leveldb.testStorage: emulated readAt error") + } n, err = tr.Reader.ReadAt(b, off) if err != nil && err != io.EOF { ts.t.Errorf("E: readAt error, num=%d type=%v off=%d n=%d: %v", tr.tf.Num(), tr.tf.Type(), off, n, err) @@ -82,15 +103,12 @@ type tsWriter struct { } func (tw tsWriter) Write(b []byte) (n int, err error) { - ts := tw.tf.ts - ts.mu.Lock() - defer ts.mu.Unlock() - if ts.emuWriteErr&tw.tf.Type() != 0 { + if tw.tf.shouldErrLocked(tsOpWrite) { return 0, errors.New("leveldb.testStorage: emulated write error") } n, err = tw.Writer.Write(b) if err != nil { - ts.t.Errorf("E: write error, num=%d type=%v n=%d: %v", tw.tf.Num(), tw.tf.Type(), n, err) + tw.tf.ts.t.Errorf("E: write error, num=%d type=%v n=%d: %v", tw.tf.Num(), tw.tf.Type(), n, err) } return } @@ -98,23 +116,23 @@ func (tw tsWriter) Write(b []byte) (n int, err error) { func (tw tsWriter) Sync() (err error) { ts := tw.tf.ts ts.mu.Lock() - defer ts.mu.Unlock() for ts.emuDelaySync&tw.tf.Type() != 0 { ts.cond.Wait() } - if ts.emuSyncErr&tw.tf.Type() != 0 { + ts.mu.Unlock() + if tw.tf.shouldErrLocked(tsOpSync) { return errors.New("leveldb.testStorage: emulated sync error") } err = tw.Writer.Sync() if err != nil { - ts.t.Errorf("E: sync error, num=%d type=%v: %v", tw.tf.Num(), tw.tf.Type(), err) + tw.tf.ts.t.Errorf("E: sync error, num=%d type=%v: %v", tw.tf.Num(), tw.tf.Type(), err) } return } func (tw tsWriter) Close() (err error) { err = tw.Writer.Close() - tw.tf.close("reader", err) + tw.tf.close("writer", err) return } @@ -127,6 +145,16 @@ func (tf tsFile) x() uint64 { return tf.Num()< 0 +// Returns true if given key is after largest key of this table. +func (t *tFile) after(icmp *iComparer, ukey []byte) bool { + return ukey != nil && icmp.uCompare(ukey, t.imax.ukey()) > 0 } -// test if key is before t -func (t *tFile) isBefore(key []byte, ucmp comparer.BasicComparer) bool { - return key != nil && ucmp.Compare(key, t.min.ukey()) < 0 +// Returns true if given key is before smallest key of this table. +func (t *tFile) before(icmp *iComparer, ukey []byte) bool { + return ukey != nil && icmp.uCompare(ukey, t.imin.ukey()) < 0 } -func (t *tFile) incrSeek() int32 { +// Returns true if given key range overlaps with this table key range. +func (t *tFile) overlaps(icmp *iComparer, umin, umax []byte) bool { + return !t.after(icmp, umin) && !t.before(icmp, umax) +} + +// Cosumes one seek and return current seeks left. +func (t *tFile) consumeSeek() int32 { return atomic.AddInt32(&t.seekLeft, -1) } -func newTFile(file storage.File, size uint64, min, max iKey) *tFile { +// Creates new tFile. +func newTableFile(file storage.File, size uint64, imin, imax iKey) *tFile { f := &tFile{ file: file, size: size, - min: min, - max: max, + imin: imin, + imax: imax, } // We arrange to automatically compact this file after @@ -70,33 +77,52 @@ func newTFile(file storage.File, size uint64, min, max iKey) *tFile { return f } -// table files +// tFiles hold multiple tFile. type tFiles []*tFile func (tf tFiles) Len() int { return len(tf) } func (tf tFiles) Swap(i, j int) { tf[i], tf[j] = tf[j], tf[i] } +func (tf tFiles) nums() string { + x := "[ " + for i, f := range tf { + if i != 0 { + x += ", " + } + x += fmt.Sprint(f.file.Num()) + } + x += " ]" + return x +} + +// Returns true if i smallest key is less than j. +// This used for sort by key in ascending order. func (tf tFiles) lessByKey(icmp *iComparer, i, j int) bool { a, b := tf[i], tf[j] - n := icmp.Compare(a.min, b.min) + n := icmp.Compare(a.imin, b.imin) if n == 0 { return a.file.Num() < b.file.Num() } return n < 0 } +// Returns true if i file number is greater than j. +// This used for sort by file number in descending order. func (tf tFiles) lessByNum(i, j int) bool { return tf[i].file.Num() > tf[j].file.Num() } +// Sorts tables by key in ascending order. func (tf tFiles) sortByKey(icmp *iComparer) { sort.Sort(&tFilesSortByKey{tFiles: tf, icmp: icmp}) } +// Sorts tables by file number in descending order. func (tf tFiles) sortByNum() { sort.Sort(&tFilesSortByNum{tFiles: tf}) } +// Returns sum of all tables size. func (tf tFiles) size() (sum uint64) { for _, t := range tf { sum += t.size @@ -104,94 +130,107 @@ func (tf tFiles) size() (sum uint64) { return sum } -func (tf tFiles) searchMin(key iKey, icmp *iComparer) int { +// Searches smallest index of tables whose its smallest +// key is after or equal with given key. +func (tf tFiles) searchMin(icmp *iComparer, ikey iKey) int { return sort.Search(len(tf), func(i int) bool { - return icmp.Compare(tf[i].min, key) >= 0 + return icmp.Compare(tf[i].imin, ikey) >= 0 }) } -func (tf tFiles) searchMax(key iKey, icmp *iComparer) int { +// Searches smallest index of tables whose its largest +// key is after or equal with given key. +func (tf tFiles) searchMax(icmp *iComparer, ikey iKey) int { return sort.Search(len(tf), func(i int) bool { - return icmp.Compare(tf[i].max, key) >= 0 + return icmp.Compare(tf[i].imax, ikey) >= 0 }) } -func (tf tFiles) isOverlaps(min, max []byte, disjSorted bool, icmp *iComparer) bool { - if !disjSorted { - // Need to check against all files +// Returns true if given key range overlaps with one or more +// tables key range. If unsorted is true then binary search will not be used. +func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) bool { + if unsorted { + // Check against all files. for _, t := range tf { - if !t.isAfter(min, icmp.ucmp) && !t.isBefore(max, icmp.ucmp) { + if t.overlaps(icmp, umin, umax) { return true } } return false } - var idx int - if len(min) > 0 { - // Find the earliest possible internal key for min - idx = tf.searchMax(newIKey(min, kMaxSeq, tSeek), icmp) + i := 0 + if len(umin) > 0 { + // Find the earliest possible internal key for min. + i = tf.searchMax(icmp, newIkey(umin, kMaxSeq, ktSeek)) } - - if idx >= len(tf) { - // beginning of range is after all files, so no overlap + if i >= len(tf) { + // Beginning of range is after all files, so no overlap. return false } - return !tf[idx].isBefore(max, icmp.ucmp) + return !tf[i].before(icmp, umax) } -func (tf tFiles) getOverlaps(min, max []byte, r *tFiles, disjSorted bool, ucmp comparer.BasicComparer) { +// Returns tables whose its key range overlaps with given key range. +// Range will be expanded if ukey found hop across tables. +// If overlapped is true then the search will be restarted if umax +// expanded. +// The dst content will be overwritten. +func (tf tFiles) getOverlaps(dst tFiles, icmp *iComparer, umin, umax []byte, overlapped bool) tFiles { + dst = dst[:0] for i := 0; i < len(tf); { t := tf[i] - i++ - if t.isAfter(min, ucmp) || t.isBefore(max, ucmp) { - continue - } - - *r = append(*r, t) - if !disjSorted { - // Level-0 files may overlap each other. So check if the newly - // added file has expanded the range. If so, restart search. - if min != nil && ucmp.Compare(t.min.ukey(), min) < 0 { - min = t.min.ukey() - *r = nil - i = 0 - } else if max != nil && ucmp.Compare(t.max.ukey(), max) > 0 { - max = t.max.ukey() - *r = nil + if t.overlaps(icmp, umin, umax) { + if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 { + umin = t.imin.ukey() + dst = dst[:0] i = 0 + continue + } else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 { + umax = t.imax.ukey() + // Restart search if it is overlapped. + if overlapped { + dst = dst[:0] + i = 0 + continue + } } + + dst = append(dst, t) } + i++ } - return + return dst } -func (tf tFiles) getRange(icmp *iComparer) (min, max iKey) { +// Returns tables key range. +func (tf tFiles) getRange(icmp *iComparer) (imin, imax iKey) { for i, t := range tf { if i == 0 { - min, max = t.min, t.max + imin, imax = t.imin, t.imax continue } - if icmp.Compare(t.min, min) < 0 { - min = t.min + if icmp.Compare(t.imin, imin) < 0 { + imin = t.imin } - if icmp.Compare(t.max, max) > 0 { - max = t.max + if icmp.Compare(t.imax, imax) > 0 { + imax = t.imax } } return } +// Creates iterator index from tables. func (tf tFiles) newIndexIterator(tops *tOps, icmp *iComparer, slice *util.Range, ro *opt.ReadOptions) iterator.IteratorIndexer { if slice != nil { var start, limit int if slice.Start != nil { - start = tf.searchMax(iKey(slice.Start), icmp) + start = tf.searchMax(icmp, iKey(slice.Start)) } if slice.Limit != nil { - limit = tf.searchMin(iKey(slice.Limit), icmp) + limit = tf.searchMin(icmp, iKey(slice.Limit)) } else { limit = tf.Len() } @@ -206,6 +245,7 @@ func (tf tFiles) newIndexIterator(tops *tOps, icmp *iComparer, slice *util.Range }) } +// Tables iterator index. type tFilesArrayIndexer struct { tFiles tops *tOps @@ -215,7 +255,7 @@ type tFilesArrayIndexer struct { } func (a *tFilesArrayIndexer) Search(key []byte) int { - return a.searchMax(iKey(key), a.icmp) + return a.searchMax(a.icmp, iKey(key)) } func (a *tFilesArrayIndexer) Get(i int) iterator.Iterator { @@ -225,6 +265,7 @@ func (a *tFilesArrayIndexer) Get(i int) iterator.Iterator { return a.tops.newIterator(a.tFiles[i], nil, a.ro) } +// Helper type for sortByKey. type tFilesSortByKey struct { tFiles icmp *iComparer @@ -234,6 +275,7 @@ func (x *tFilesSortByKey) Less(i, j int) bool { return x.lessByKey(x.icmp, i, j) } +// Helper type for sortByNum. type tFilesSortByNum struct { tFiles } @@ -242,19 +284,15 @@ func (x *tFilesSortByNum) Less(i, j int) bool { return x.lessByNum(i, j) } -// table operations +// Table operations. type tOps struct { - s *session - cache cache.Cache - cacheNS cache.Namespace -} - -func newTableOps(s *session, cacheCap int) *tOps { - c := cache.NewLRUCache(cacheCap) - ns := c.GetNamespace(0) - return &tOps{s, c, ns} + s *session + cache *cache.Cache + bcache *cache.Cache + bpool *util.BufferPool } +// Creates an empty table and returns table writer. func (t *tOps) create() (*tWriter, error) { file := t.s.getTableFile(t.s.allocFileNum()) fw, err := file.Create() @@ -265,14 +303,15 @@ func (t *tOps) create() (*tWriter, error) { t: t, file: file, w: fw, - tw: table.NewWriter(fw, t.s.o), + tw: table.NewWriter(fw, t.s.o.Options), }, nil } +// Builds table from src iterator. func (t *tOps) createFrom(src iterator.Iterator) (f *tFile, n int, err error) { w, err := t.create() if err != nil { - return f, n, err + return } defer func() { @@ -282,7 +321,7 @@ func (t *tOps) createFrom(src iterator.Iterator) (f *tFile, n int, err error) { }() for src.Next() { - err = w.add(src.Key(), src.Value()) + err = w.append(src.Key(), src.Value()) if err != nil { return } @@ -297,84 +336,132 @@ func (t *tOps) createFrom(src iterator.Iterator) (f *tFile, n int, err error) { return } -func (t *tOps) lookup(f *tFile) (c cache.Object, err error) { +// Opens table. It returns a cache handle, which should +// be released after use. +func (t *tOps) open(f *tFile) (ch *cache.Handle, err error) { num := f.file.Num() - c, ok := t.cacheNS.Get(num, func() (ok bool, value interface{}, charge int, fin cache.SetFin) { + ch = t.cache.Get(0, num, func() (size int, value cache.Value) { var r storage.Reader r, err = f.file.Open() if err != nil { - return + return 0, nil } - o := t.s.o - - var cacheNS cache.Namespace - if bc := o.GetBlockCache(); bc != nil { - cacheNS = bc.GetNamespace(num) + var bcache *cache.CacheGetter + if t.bcache != nil { + bcache = &cache.CacheGetter{Cache: t.bcache, NS: num} } - ok = true - value = table.NewReader(r, int64(f.size), cacheNS, o) - charge = 1 - fin = func() { + var tr *table.Reader + tr, err = table.NewReader(r, int64(f.size), storage.NewFileInfo(f.file), bcache, t.bpool, t.s.o.Options) + if err != nil { r.Close() + return 0, nil } - return + return 1, tr + }) - if !ok && err == nil { + if ch == nil && err == nil { err = ErrClosed } return } -func (t *tOps) get(f *tFile, key []byte, ro *opt.ReadOptions) (rkey, rvalue []byte, err error) { - c, err := t.lookup(f) +// Finds key/value pair whose key is greater than or equal to the +// given key. +func (t *tOps) find(f *tFile, key []byte, ro *opt.ReadOptions) (rkey, rvalue []byte, err error) { + ch, err := t.open(f) if err != nil { return nil, nil, err } - defer c.Release() - return c.Value().(*table.Reader).Find(key, ro) + defer ch.Release() + return ch.Value().(*table.Reader).Find(key, true, ro) +} + +// Finds key that is greater than or equal to the given key. +func (t *tOps) findKey(f *tFile, key []byte, ro *opt.ReadOptions) (rkey []byte, err error) { + ch, err := t.open(f) + if err != nil { + return nil, err + } + defer ch.Release() + return ch.Value().(*table.Reader).FindKey(key, true, ro) } +// Returns approximate offset of the given key. func (t *tOps) offsetOf(f *tFile, key []byte) (offset uint64, err error) { - c, err := t.lookup(f) + ch, err := t.open(f) if err != nil { return } - _offset, err := c.Value().(*table.Reader).OffsetOf(key) - offset = uint64(_offset) - c.Release() - return + defer ch.Release() + offset_, err := ch.Value().(*table.Reader).OffsetOf(key) + return uint64(offset_), err } +// Creates an iterator from the given table. func (t *tOps) newIterator(f *tFile, slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { - c, err := t.lookup(f) + ch, err := t.open(f) if err != nil { return iterator.NewEmptyIterator(err) } - iter := c.Value().(*table.Reader).NewIterator(slice, ro) - iter.SetReleaser(c) + iter := ch.Value().(*table.Reader).NewIterator(slice, ro) + iter.SetReleaser(ch) return iter } +// Removes table from persistent storage. It waits until +// no one use the the table. func (t *tOps) remove(f *tFile) { num := f.file.Num() - t.cacheNS.Delete(num, func(exist bool) { + t.cache.Delete(0, num, func() { if err := f.file.Remove(); err != nil { t.s.logf("table@remove removing @%d %q", num, err) } else { t.s.logf("table@remove removed @%d", num) } - if bc := t.s.o.GetBlockCache(); bc != nil { - bc.GetNamespace(num).Zap(false) + if t.bcache != nil { + t.bcache.EvictNS(num) } }) } +// Closes the table ops instance. It will close all tables, +// regadless still used or not. func (t *tOps) close() { - t.cache.Zap(true) + t.bpool.Close() + t.cache.Close() + if t.bcache != nil { + t.bcache.Close() + } +} + +// Creates new initialized table ops instance. +func newTableOps(s *session) *tOps { + var ( + cacher cache.Cacher + bcache *cache.Cache + ) + if s.o.GetOpenFilesCacheCapacity() > 0 { + cacher = cache.NewLRU(s.o.GetOpenFilesCacheCapacity()) + } + if !s.o.DisableBlockCache { + var bcacher cache.Cacher + if s.o.GetBlockCacheCapacity() > 0 { + bcacher = cache.NewLRU(s.o.GetBlockCacheCapacity()) + } + bcache = cache.NewCache(bcacher) + } + return &tOps{ + s: s, + cache: cache.NewCache(cacher), + bcache: bcache, + bpool: util.NewBufferPool(s.o.GetBlockSize() + 5), + } } +// tWriter wraps the table writer. It keep track of file descriptor +// and added key range. type tWriter struct { t *tOps @@ -385,7 +472,8 @@ type tWriter struct { first, last []byte } -func (w *tWriter) add(key, value []byte) error { +// Append key/value pair to the table. +func (w *tWriter) append(key, value []byte) error { if w.first == nil { w.first = append([]byte{}, key...) } @@ -393,30 +481,39 @@ func (w *tWriter) add(key, value []byte) error { return w.tw.Append(key, value) } +// Returns true if the table is empty. func (w *tWriter) empty() bool { return w.first == nil } +// Closes the storage.Writer. +func (w *tWriter) close() { + if w.w != nil { + w.w.Close() + w.w = nil + } +} + +// Finalizes the table and returns table file. func (w *tWriter) finish() (f *tFile, err error) { + defer w.close() err = w.tw.Close() if err != nil { return } err = w.w.Sync() if err != nil { - w.w.Close() return } - w.w.Close() - f = newTFile(w.file, uint64(w.tw.BytesLen()), iKey(w.first), iKey(w.last)) + f = newTableFile(w.file, uint64(w.tw.BytesLen()), iKey(w.first), iKey(w.last)) return } +// Drops the table. func (w *tWriter) drop() { - w.w.Close() + w.close() w.file.Remove() w.t.s.reuseFileNum(w.file.Num()) - w.w = nil w.file = nil w.tw = nil w.first = nil diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/block_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/block_test.go index ca598f4f5..00e6f9eea 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/block_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/block_test.go @@ -19,13 +19,18 @@ import ( "github.com/syndtr/goleveldb/leveldb/util" ) -func (b *block) TestNewIterator(slice *util.Range) iterator.Iterator { - return b.newIterator(slice, false, nil) +type blockTesting struct { + tr *Reader + b *block +} + +func (t *blockTesting) TestNewIterator(slice *util.Range) iterator.Iterator { + return t.tr.newBlockIter(t.b, nil, slice, false) } var _ = testutil.Defer(func() { Describe("Block", func() { - Build := func(kv *testutil.KeyValue, restartInterval int) *block { + Build := func(kv *testutil.KeyValue, restartInterval int) *blockTesting { // Building the block. bw := &blockWriter{ restartInterval: restartInterval, @@ -39,11 +44,13 @@ var _ = testutil.Defer(func() { // Opening the block. data := bw.buf.Bytes() restartsLen := int(binary.LittleEndian.Uint32(data[len(data)-4:])) - return &block{ - cmp: comparer.DefaultComparer, - data: data, - restartsLen: restartsLen, - restartsOffset: len(data) - (restartsLen+1)*4, + return &blockTesting{ + tr: &Reader{cmp: comparer.DefaultComparer}, + b: &block{ + data: data, + restartsLen: restartsLen, + restartsOffset: len(data) - (restartsLen+1)*4, + }, } } @@ -59,7 +66,7 @@ var _ = testutil.Defer(func() { // Make block. br := Build(kv, restartInterval) // Do testing. - testutil.KeyValueTesting(nil, br, kv.Clone()) + testutil.KeyValueTesting(nil, kv.Clone(), br, nil, nil) } Describe(Text(), Test) @@ -102,11 +109,11 @@ var _ = testutil.Defer(func() { for restartInterval := 1; restartInterval <= 5; restartInterval++ { Describe(fmt.Sprintf("with restart interval of %d", restartInterval), func() { // Make block. - br := Build(kv, restartInterval) + bt := Build(kv, restartInterval) Test := func(r *util.Range) func(done Done) { return func(done Done) { - iter := br.newIterator(r, false, nil) + iter := bt.TestNewIterator(r) Expect(iter.Error()).ShouldNot(HaveOccurred()) t := testutil.IteratorTesting{ @@ -115,6 +122,7 @@ var _ = testutil.Defer(func() { } testutil.DoIteratorTesting(&t) + iter.Release() done <- true } } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/reader.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/reader.go index 8acb9f720..6f38e84b3 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/reader.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/reader.go @@ -8,27 +8,41 @@ package table import ( "encoding/binary" - "errors" "fmt" "io" "sort" "strings" + "sync" - "code.google.com/p/snappy-go/snappy" + "github.com/syndtr/gosnappy/snappy" "github.com/syndtr/goleveldb/leveldb/cache" "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/filter" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" "github.com/syndtr/goleveldb/leveldb/util" ) var ( - ErrNotFound = util.ErrNotFound - ErrIterReleased = errors.New("leveldb/table: iterator released") + ErrNotFound = errors.ErrNotFound + ErrReaderReleased = errors.New("leveldb/table: reader released") + ErrIterReleased = errors.New("leveldb/table: iterator released") ) +type ErrCorrupted struct { + Pos int64 + Size int64 + Kind string + Reason string +} + +func (e *ErrCorrupted) Error() string { + return fmt.Sprintf("leveldb/table: corruption on %s (pos=%d): %s", e.Kind, e.Pos, e.Reason) +} + func max(x, y int) int { if x > y { return x @@ -37,40 +51,33 @@ func max(x, y int) int { } type block struct { - cmp comparer.BasicComparer + bpool *util.BufferPool + bh blockHandle data []byte restartsLen int restartsOffset int - // Whether checksum is verified and valid. - checksum bool } -func (b *block) seek(rstart, rlimit int, key []byte) (index, offset int, err error) { - n := b.restartsOffset - data := b.data - cmp := b.cmp - +func (b *block) seek(cmp comparer.Comparer, rstart, rlimit int, key []byte) (index, offset int, err error) { index = sort.Search(b.restartsLen-rstart-(b.restartsLen-rlimit), func(i int) bool { - offset := int(binary.LittleEndian.Uint32(data[n+4*(rstart+i):])) - offset += 1 // shared always zero, since this is a restart point - v1, n1 := binary.Uvarint(data[offset:]) // key length - _, n2 := binary.Uvarint(data[offset+n1:]) // value length + offset := int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*(rstart+i):])) + offset += 1 // shared always zero, since this is a restart point + v1, n1 := binary.Uvarint(b.data[offset:]) // key length + _, n2 := binary.Uvarint(b.data[offset+n1:]) // value length m := offset + n1 + n2 - return cmp.Compare(data[m:m+int(v1)], key) > 0 + return cmp.Compare(b.data[m:m+int(v1)], key) > 0 }) + rstart - 1 if index < rstart { // The smallest key is greater-than key sought. index = rstart } - offset = int(binary.LittleEndian.Uint32(data[n+4*index:])) + offset = int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*index:])) return } func (b *block) restartIndex(rstart, rlimit, offset int) int { - n := b.restartsOffset - data := b.data return sort.Search(b.restartsLen-rstart-(b.restartsLen-rlimit), func(i int) bool { - return int(binary.LittleEndian.Uint32(data[n+4*(rstart+i):])) > offset + return int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*(rstart+i):])) > offset }) + rstart - 1 } @@ -81,7 +88,7 @@ func (b *block) restartOffset(index int) int { func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) { if offset >= b.restartsOffset { if offset != b.restartsOffset { - err = errors.New("leveldb/table: Reader: BlockEntry: invalid block (block entries offset not aligned)") + err = &ErrCorrupted{Reason: "entries offset not aligned"} } return } @@ -91,7 +98,7 @@ func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) m := n0 + n1 + n2 n = m + int(v1) + int(v2) if n0 <= 0 || n1 <= 0 || n2 <= 0 || offset+n > b.restartsOffset { - err = errors.New("leveldb/table: Reader: invalid block (block entries corrupted)") + err = &ErrCorrupted{Reason: "entries corrupted"} return } key = b.data[offset+m : offset+m+int(v1)] @@ -100,43 +107,10 @@ func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) return } -func (b *block) newIterator(slice *util.Range, inclLimit bool, cache util.Releaser) *blockIter { - bi := &blockIter{ - block: b, - cache: cache, - // Valid key should never be nil. - key: make([]byte, 0), - dir: dirSOI, - riStart: 0, - riLimit: b.restartsLen, - offsetStart: 0, - offsetRealStart: 0, - offsetLimit: b.restartsOffset, - } - if slice != nil { - if slice.Start != nil { - if bi.Seek(slice.Start) { - bi.riStart = b.restartIndex(bi.restartIndex, b.restartsLen, bi.prevOffset) - bi.offsetStart = b.restartOffset(bi.riStart) - bi.offsetRealStart = bi.prevOffset - } else { - bi.riStart = b.restartsLen - bi.offsetStart = b.restartsOffset - bi.offsetRealStart = b.restartsOffset - } - } - if slice.Limit != nil { - if bi.Seek(slice.Limit) && (!inclLimit || bi.Next()) { - bi.offsetLimit = bi.prevOffset - bi.riLimit = bi.restartIndex + 1 - } - } - bi.reset() - if bi.offsetStart > bi.offsetLimit { - bi.sErr(errors.New("leveldb/table: Reader: invalid slice range")) - } - } - return bi +func (b *block) Release() { + b.bpool.Put(b.data) + b.bpool = nil + b.data = nil } type dir int @@ -150,10 +124,12 @@ const ( ) type blockIter struct { - block *block - cache, releaser util.Releaser - key, value []byte - offset int + tr *Reader + block *block + blockReleaser util.Releaser + releaser util.Releaser + key, value []byte + offset int // Previous offset, only filled by Next. prevOffset int prevNode []int @@ -250,7 +226,7 @@ func (i *blockIter) Seek(key []byte) bool { return false } - ri, offset, err := i.block.seek(i.riStart, i.riLimit, key) + ri, offset, err := i.block.seek(i.tr.cmp, i.riStart, i.riLimit, key) if err != nil { i.sErr(err) return false @@ -261,7 +237,7 @@ func (i *blockIter) Seek(key []byte) bool { i.dir = dirForward } for i.Next() { - if i.block.cmp.Compare(i.key, key) >= 0 { + if i.tr.cmp.Compare(i.key, key) >= 0 { return true } } @@ -286,7 +262,7 @@ func (i *blockIter) Next() bool { for i.offset < i.offsetRealStart { key, value, nShared, n, err := i.block.entry(i.offset) if err != nil { - i.sErr(err) + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) return false } if n == 0 { @@ -300,13 +276,13 @@ func (i *blockIter) Next() bool { if i.offset >= i.offsetLimit { i.dir = dirEOI if i.offset != i.offsetLimit { - i.sErr(errors.New("leveldb/table: Reader: Next: invalid block (block entries offset not aligned)")) + i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned")) } return false } key, value, nShared, n, err := i.block.entry(i.offset) if err != nil { - i.sErr(err) + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) return false } if n == 0 { @@ -391,7 +367,7 @@ func (i *blockIter) Prev() bool { for { key, value, nShared, n, err := i.block.entry(offset) if err != nil { - i.sErr(err) + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) return false } if offset >= i.offsetRealStart { @@ -410,7 +386,7 @@ func (i *blockIter) Prev() bool { // Stop if target offset reached. if offset >= i.offset { if offset != i.offset { - i.sErr(errors.New("leveldb/table: Reader: Prev: invalid block (block entries offset not aligned)")) + i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned")) return false } @@ -437,25 +413,33 @@ func (i *blockIter) Value() []byte { } func (i *blockIter) Release() { - i.prevNode = nil - i.prevKeys = nil - i.key = nil - i.value = nil - i.dir = dirReleased - if i.cache != nil { - i.cache.Release() - i.cache = nil - } - if i.releaser != nil { - i.releaser.Release() - i.releaser = nil + if i.dir != dirReleased { + i.tr = nil + i.block = nil + i.prevNode = nil + i.prevKeys = nil + i.key = nil + i.value = nil + i.dir = dirReleased + if i.blockReleaser != nil { + i.blockReleaser.Release() + i.blockReleaser = nil + } + if i.releaser != nil { + i.releaser.Release() + i.releaser = nil + } } } func (i *blockIter) SetReleaser(releaser util.Releaser) { - if i.dir > dirReleased { - i.releaser = releaser + if i.dir == dirReleased { + panic(util.ErrReleased) } + if i.releaser != nil && releaser != nil { + panic(util.ErrHasReleaser) + } + i.releaser = releaser } func (i *blockIter) Valid() bool { @@ -467,21 +451,21 @@ func (i *blockIter) Error() error { } type filterBlock struct { - filter filter.Filter + bpool *util.BufferPool data []byte oOffset int baseLg uint filtersNum int } -func (b *filterBlock) contains(offset uint64, key []byte) bool { +func (b *filterBlock) contains(filter filter.Filter, offset uint64, key []byte) bool { i := int(offset >> b.baseLg) if i < b.filtersNum { o := b.data[b.oOffset+i*4:] n := int(binary.LittleEndian.Uint32(o)) m := int(binary.LittleEndian.Uint32(o[4:])) if n < m && m <= b.oOffset { - return b.filter.Contains(b.data[n:m], key) + return filter.Contains(b.data[n:m], key) } else if n == m { return false } @@ -489,12 +473,17 @@ func (b *filterBlock) contains(offset uint64, key []byte) bool { return true } +func (b *filterBlock) Release() { + b.bpool.Put(b.data) + b.bpool = nil + b.data = nil +} + type indexIter struct { - blockIter - tableReader *Reader - slice *util.Range + *blockIter + tr *Reader + slice *util.Range // Options - checksum bool fillCache bool } @@ -505,95 +494,173 @@ func (i *indexIter) Get() iterator.Iterator { } dataBH, n := decodeBlockHandle(value) if n == 0 { - return iterator.NewEmptyIterator(errors.New("leveldb/table: Reader: invalid table (bad data block handle)")) + return iterator.NewEmptyIterator(i.tr.newErrCorruptedBH(i.tr.indexBH, "bad data block handle")) } + var slice *util.Range if i.slice != nil && (i.blockIter.isFirst() || i.blockIter.isLast()) { slice = i.slice } - return i.tableReader.getDataIter(dataBH, slice, i.checksum, i.fillCache) + return i.tr.getDataIterErr(dataBH, slice, i.tr.verifyChecksum, i.fillCache) } // Reader is a table reader. type Reader struct { + mu sync.RWMutex + fi *storage.FileInfo reader io.ReaderAt - cache cache.Namespace + cache *cache.CacheGetter err error + bpool *util.BufferPool // Options - cmp comparer.Comparer - filter filter.Filter - checksum bool - strictIter bool + o *opt.Options + cmp comparer.Comparer + filter filter.Filter + verifyChecksum bool - dataEnd int64 - indexBlock *block - filterBlock *filterBlock + dataEnd int64 + metaBH, indexBH, filterBH blockHandle + indexBlock *block + filterBlock *filterBlock } -func verifyChecksum(data []byte) bool { - n := len(data) - 4 - checksum0 := binary.LittleEndian.Uint32(data[n:]) - checksum1 := util.NewCRC(data[:n]).Value() - return checksum0 == checksum1 +func (r *Reader) blockKind(bh blockHandle) string { + switch bh.offset { + case r.metaBH.offset: + return "meta-block" + case r.indexBH.offset: + return "index-block" + case r.filterBH.offset: + if r.filterBH.length > 0 { + return "filter-block" + } + } + return "data-block" } -func (r *Reader) readRawBlock(bh blockHandle, checksum bool) ([]byte, error) { - data := make([]byte, bh.length+blockTrailerLen) +func (r *Reader) newErrCorrupted(pos, size int64, kind, reason string) error { + return &errors.ErrCorrupted{File: r.fi, Err: &ErrCorrupted{Pos: pos, Size: size, Kind: kind, Reason: reason}} +} + +func (r *Reader) newErrCorruptedBH(bh blockHandle, reason string) error { + return r.newErrCorrupted(int64(bh.offset), int64(bh.length), r.blockKind(bh), reason) +} + +func (r *Reader) fixErrCorruptedBH(bh blockHandle, err error) error { + if cerr, ok := err.(*ErrCorrupted); ok { + cerr.Pos = int64(bh.offset) + cerr.Size = int64(bh.length) + cerr.Kind = r.blockKind(bh) + return &errors.ErrCorrupted{File: r.fi, Err: cerr} + } + return err +} + +func (r *Reader) readRawBlock(bh blockHandle, verifyChecksum bool) ([]byte, error) { + data := r.bpool.Get(int(bh.length + blockTrailerLen)) if _, err := r.reader.ReadAt(data, int64(bh.offset)); err != nil && err != io.EOF { return nil, err } - if checksum || r.checksum { - if !verifyChecksum(data) { - return nil, errors.New("leveldb/table: Reader: invalid block (checksum mismatch)") + + if verifyChecksum { + n := bh.length + 1 + checksum0 := binary.LittleEndian.Uint32(data[n:]) + checksum1 := util.NewCRC(data[:n]).Value() + if checksum0 != checksum1 { + r.bpool.Put(data) + return nil, r.newErrCorruptedBH(bh, fmt.Sprintf("checksum mismatch, want=%#x got=%#x", checksum0, checksum1)) } } + switch data[bh.length] { case blockTypeNoCompression: data = data[:bh.length] case blockTypeSnappyCompression: - var err error - data, err = snappy.Decode(nil, data[:bh.length]) + decLen, err := snappy.DecodedLen(data[:bh.length]) if err != nil { - return nil, err + return nil, r.newErrCorruptedBH(bh, err.Error()) + } + decData := r.bpool.Get(decLen) + decData, err = snappy.Decode(decData, data[:bh.length]) + r.bpool.Put(data) + if err != nil { + r.bpool.Put(decData) + return nil, r.newErrCorruptedBH(bh, err.Error()) } + data = decData default: - return nil, fmt.Errorf("leveldb/table: Reader: unknown block compression type: %d", data[bh.length]) + r.bpool.Put(data) + return nil, r.newErrCorruptedBH(bh, fmt.Sprintf("unknown compression type %#x", data[bh.length])) } return data, nil } -func (r *Reader) readBlock(bh blockHandle, checksum bool) (*block, error) { - data, err := r.readRawBlock(bh, checksum) +func (r *Reader) readBlock(bh blockHandle, verifyChecksum bool) (*block, error) { + data, err := r.readRawBlock(bh, verifyChecksum) if err != nil { return nil, err } restartsLen := int(binary.LittleEndian.Uint32(data[len(data)-4:])) b := &block{ - cmp: r.cmp, + bpool: r.bpool, + bh: bh, data: data, restartsLen: restartsLen, restartsOffset: len(data) - (restartsLen+1)*4, - checksum: checksum || r.checksum, } return b, nil } -func (r *Reader) readFilterBlock(bh blockHandle, filter filter.Filter) (*filterBlock, error) { +func (r *Reader) readBlockCached(bh blockHandle, verifyChecksum, fillCache bool) (*block, util.Releaser, error) { + if r.cache != nil { + var ( + err error + ch *cache.Handle + ) + if fillCache { + ch = r.cache.Get(bh.offset, func() (size int, value cache.Value) { + var b *block + b, err = r.readBlock(bh, verifyChecksum) + if err != nil { + return 0, nil + } + return cap(b.data), b + }) + } else { + ch = r.cache.Get(bh.offset, nil) + } + if ch != nil { + b, ok := ch.Value().(*block) + if !ok { + ch.Release() + return nil, nil, errors.New("leveldb/table: inconsistent block type") + } + return b, ch, err + } else if err != nil { + return nil, nil, err + } + } + + b, err := r.readBlock(bh, verifyChecksum) + return b, b, err +} + +func (r *Reader) readFilterBlock(bh blockHandle) (*filterBlock, error) { data, err := r.readRawBlock(bh, true) if err != nil { return nil, err } n := len(data) if n < 5 { - return nil, errors.New("leveldb/table: Reader: invalid filter block (too short)") + return nil, r.newErrCorruptedBH(bh, "too short") } m := n - 5 oOffset := int(binary.LittleEndian.Uint32(data[m:])) if oOffset > m { - return nil, errors.New("leveldb/table: Reader: invalid filter block (invalid offset)") + return nil, r.newErrCorruptedBH(bh, "invalid data-offsets offset") } b := &filterBlock{ - filter: filter, + bpool: r.bpool, data: data, oOffset: oOffset, baseLg: uint(data[n-1]), @@ -602,44 +669,111 @@ func (r *Reader) readFilterBlock(bh blockHandle, filter filter.Filter) (*filterB return b, nil } -func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, checksum, fillCache bool) iterator.Iterator { +func (r *Reader) readFilterBlockCached(bh blockHandle, fillCache bool) (*filterBlock, util.Releaser, error) { if r.cache != nil { - // Get/set block cache. - var err error - cache, ok := r.cache.Get(dataBH.offset, func() (ok bool, value interface{}, charge int, fin cache.SetFin) { - if !fillCache { - return + var ( + err error + ch *cache.Handle + ) + if fillCache { + ch = r.cache.Get(bh.offset, func() (size int, value cache.Value) { + var b *filterBlock + b, err = r.readFilterBlock(bh) + if err != nil { + return 0, nil + } + return cap(b.data), b + }) + } else { + ch = r.cache.Get(bh.offset, nil) + } + if ch != nil { + b, ok := ch.Value().(*filterBlock) + if !ok { + ch.Release() + return nil, nil, errors.New("leveldb/table: inconsistent block type") } - var dataBlock *block - dataBlock, err = r.readBlock(dataBH, checksum) - if err == nil { - ok = true - value = dataBlock - charge = int(dataBH.length) + return b, ch, err + } else if err != nil { + return nil, nil, err + } + } + + b, err := r.readFilterBlock(bh) + return b, b, err +} + +func (r *Reader) getIndexBlock(fillCache bool) (b *block, rel util.Releaser, err error) { + if r.indexBlock == nil { + return r.readBlockCached(r.indexBH, true, fillCache) + } + return r.indexBlock, util.NoopReleaser{}, nil +} + +func (r *Reader) getFilterBlock(fillCache bool) (*filterBlock, util.Releaser, error) { + if r.filterBlock == nil { + return r.readFilterBlockCached(r.filterBH, fillCache) + } + return r.filterBlock, util.NoopReleaser{}, nil +} + +func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Range, inclLimit bool) *blockIter { + bi := &blockIter{ + tr: r, + block: b, + blockReleaser: bReleaser, + // Valid key should never be nil. + key: make([]byte, 0), + dir: dirSOI, + riStart: 0, + riLimit: b.restartsLen, + offsetStart: 0, + offsetRealStart: 0, + offsetLimit: b.restartsOffset, + } + if slice != nil { + if slice.Start != nil { + if bi.Seek(slice.Start) { + bi.riStart = b.restartIndex(bi.restartIndex, b.restartsLen, bi.prevOffset) + bi.offsetStart = b.restartOffset(bi.riStart) + bi.offsetRealStart = bi.prevOffset + } else { + bi.riStart = b.restartsLen + bi.offsetStart = b.restartsOffset + bi.offsetRealStart = b.restartsOffset } - return - }) - if err != nil { - return iterator.NewEmptyIterator(err) } - if ok { - dataBlock := cache.Value().(*block) - if !dataBlock.checksum && (r.checksum || checksum) { - if !verifyChecksum(dataBlock.data) { - return iterator.NewEmptyIterator(errors.New("leveldb/table: Reader: invalid block (checksum mismatch)")) - } - dataBlock.checksum = true + if slice.Limit != nil { + if bi.Seek(slice.Limit) && (!inclLimit || bi.Next()) { + bi.offsetLimit = bi.prevOffset + bi.riLimit = bi.restartIndex + 1 } - iter := dataBlock.newIterator(slice, false, cache) - return iter + } + bi.reset() + if bi.offsetStart > bi.offsetLimit { + bi.sErr(errors.New("leveldb/table: invalid slice range")) } } - dataBlock, err := r.readBlock(dataBH, checksum) + return bi +} + +func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator { + b, rel, err := r.readBlockCached(dataBH, verifyChecksum, fillCache) if err != nil { return iterator.NewEmptyIterator(err) } - iter := dataBlock.newIterator(slice, false, nil) - return iter + return r.newBlockIter(b, rel, slice, false) +} + +func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + return iterator.NewEmptyIterator(r.err) + } + + return r.getDataIter(dataBH, slice, verifyChecksum, fillCache) } // NewIterator creates an iterator from the table. @@ -653,35 +787,44 @@ func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, checksum, fi // when not used. // // Also read Iterator documentation of the leveldb/iterator package. - func (r *Reader) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + r.mu.RLock() + defer r.mu.RUnlock() + if r.err != nil { return iterator.NewEmptyIterator(r.err) } + fillCache := !ro.GetDontFillCache() + indexBlock, rel, err := r.getIndexBlock(fillCache) + if err != nil { + return iterator.NewEmptyIterator(err) + } index := &indexIter{ - blockIter: *r.indexBlock.newIterator(slice, true, nil), - tableReader: r, - slice: slice, - checksum: ro.GetStrict(opt.StrictBlockChecksum), - fillCache: !ro.GetDontFillCache(), + blockIter: r.newBlockIter(indexBlock, rel, slice, true), + tr: r, + slice: slice, + fillCache: !ro.GetDontFillCache(), } - return iterator.NewIndexedIterator(index, r.strictIter || ro.GetStrict(opt.StrictIterator), false) + return iterator.NewIndexedIterator(index, opt.GetStrict(r.o, ro, opt.StrictReader)) } -// Find finds key/value pair whose key is greater than or equal to the -// given key. It returns ErrNotFound if the table doesn't contain -// such pair. -// -// The caller should not modify the contents of the returned slice, but -// it is safe to modify the contents of the argument after Find returns. -func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err error) { +func (r *Reader) find(key []byte, filtered bool, ro *opt.ReadOptions, noValue bool) (rkey, value []byte, err error) { + r.mu.RLock() + defer r.mu.RUnlock() + if r.err != nil { err = r.err return } - index := r.indexBlock.newIterator(nil, true, nil) + indexBlock, rel, err := r.getIndexBlock(true) + if err != nil { + return + } + defer rel.Release() + + index := r.newBlockIter(indexBlock, nil, nil, true) defer index.Release() if !index.Seek(key) { err = index.Error() @@ -692,14 +835,23 @@ func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err } dataBH, n := decodeBlockHandle(index.Value()) if n == 0 { - err = errors.New("leveldb/table: Reader: invalid table (bad data block handle)") + r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") return } - if r.filterBlock != nil && !r.filterBlock.contains(dataBH.offset, key) { - err = ErrNotFound - return + if filtered && r.filter != nil { + filterBlock, frel, ferr := r.getFilterBlock(true) + if ferr == nil { + if !filterBlock.contains(r.filter, dataBH.offset, key) { + frel.Release() + return nil, nil, ErrNotFound + } + frel.Release() + } else if !errors.IsCorrupted(ferr) { + err = ferr + return + } } - data := r.getDataIter(dataBH, nil, ro.GetStrict(opt.StrictBlockChecksum), !ro.GetDontFillCache()) + data := r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache()) defer data.Release() if !data.Seek(key) { err = data.Error() @@ -708,23 +860,64 @@ func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err } return } + // Don't use block buffer, no need to copy the buffer. rkey = data.Key() - value = data.Value() + if !noValue { + if r.bpool == nil { + value = data.Value() + } else { + // Use block buffer, and since the buffer will be recycled, the buffer + // need to be copied. + value = append([]byte{}, data.Value()...) + } + } + return +} + +// Find finds key/value pair whose key is greater than or equal to the +// given key. It returns ErrNotFound if the table doesn't contain +// such pair. +// If filtered is true then the nearest 'block' will be checked against +// 'filter data' (if present) and will immediately return ErrNotFound if +// 'filter data' indicates that such pair doesn't exist. +// +// The caller may modify the contents of the returned slice as it is its +// own copy. +// It is safe to modify the contents of the argument after Find returns. +func (r *Reader) Find(key []byte, filtered bool, ro *opt.ReadOptions) (rkey, value []byte, err error) { + return r.find(key, filtered, ro, false) +} + +// Find finds key that is greater than or equal to the given key. +// It returns ErrNotFound if the table doesn't contain such key. +// If filtered is true then the nearest 'block' will be checked against +// 'filter data' (if present) and will immediately return ErrNotFound if +// 'filter data' indicates that such key doesn't exist. +// +// The caller may modify the contents of the returned slice as it is its +// own copy. +// It is safe to modify the contents of the argument after Find returns. +func (r *Reader) FindKey(key []byte, filtered bool, ro *opt.ReadOptions) (rkey []byte, err error) { + rkey, _, err = r.find(key, filtered, ro, true) return } // Get gets the value for the given key. It returns errors.ErrNotFound // if the table does not contain the key. // -// The caller should not modify the contents of the returned slice, but -// it is safe to modify the contents of the argument after Get returns. +// The caller may modify the contents of the returned slice as it is its +// own copy. +// It is safe to modify the contents of the argument after Find returns. func (r *Reader) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { + r.mu.RLock() + defer r.mu.RUnlock() + if r.err != nil { err = r.err return } - rkey, value, err := r.Find(key, ro) + rkey, value, err := r.find(key, false, ro, false) if err == nil && r.cmp.Compare(rkey, key) != 0 { value = nil err = ErrNotFound @@ -736,17 +929,26 @@ func (r *Reader) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) // // It is safe to modify the contents of the argument after Get returns. func (r *Reader) OffsetOf(key []byte) (offset int64, err error) { + r.mu.RLock() + defer r.mu.RUnlock() + if r.err != nil { err = r.err return } - index := r.indexBlock.newIterator(nil, true, nil) + indexBlock, rel, err := r.readBlockCached(r.indexBH, true, true) + if err != nil { + return + } + defer rel.Release() + + index := r.newBlockIter(indexBlock, nil, nil, true) defer index.Release() if index.Seek(key) { dataBH, n := decodeBlockHandle(index.Value()) if n == 0 { - err = errors.New("leveldb/table: Reader: invalid table (bad data block handle)") + r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") return } offset = int64(dataBH.offset) @@ -759,90 +961,147 @@ func (r *Reader) OffsetOf(key []byte) (offset int64, err error) { return } -// NewReader creates a new initialized table reader for the file. -// The cache is optional and can be nil. -func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, o *opt.Options) *Reader { - r := &Reader{ - reader: f, - cache: cache, - cmp: o.GetComparer(), - checksum: o.GetStrict(opt.StrictBlockChecksum), - strictIter: o.GetStrict(opt.StrictIterator), +// Release implements util.Releaser. +// It also close the file if it is an io.Closer. +func (r *Reader) Release() { + r.mu.Lock() + defer r.mu.Unlock() + + if closer, ok := r.reader.(io.Closer); ok { + closer.Close() + } + if r.indexBlock != nil { + r.indexBlock.Release() + r.indexBlock = nil } + if r.filterBlock != nil { + r.filterBlock.Release() + r.filterBlock = nil + } + r.reader = nil + r.cache = nil + r.bpool = nil + r.err = ErrReaderReleased +} + +// NewReader creates a new initialized table reader for the file. +// The fi, cache and bpool is optional and can be nil. +// +// The returned table reader instance is goroutine-safe. +func NewReader(f io.ReaderAt, size int64, fi *storage.FileInfo, cache *cache.CacheGetter, bpool *util.BufferPool, o *opt.Options) (*Reader, error) { if f == nil { - r.err = errors.New("leveldb/table: Reader: nil file") - return r + return nil, errors.New("leveldb/table: nil file") } + + r := &Reader{ + fi: fi, + reader: f, + cache: cache, + bpool: bpool, + o: o, + cmp: o.GetComparer(), + verifyChecksum: o.GetStrict(opt.StrictBlockChecksum), + } + if size < footerLen { - r.err = errors.New("leveldb/table: Reader: invalid table (file size is too small)") - return r + r.err = r.newErrCorrupted(0, size, "table", "too small") + return r, nil } + + footerPos := size - footerLen var footer [footerLen]byte - if _, err := r.reader.ReadAt(footer[:], size-footerLen); err != nil && err != io.EOF { - r.err = fmt.Errorf("leveldb/table: Reader: invalid table (could not read footer): %v", err) + if _, err := r.reader.ReadAt(footer[:], footerPos); err != nil && err != io.EOF { + return nil, err } if string(footer[footerLen-len(magic):footerLen]) != magic { - r.err = errors.New("leveldb/table: Reader: invalid table (bad magic number)") - return r + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad magic number") + return r, nil } + + var n int // Decode the metaindex block handle. - metaBH, n := decodeBlockHandle(footer[:]) + r.metaBH, n = decodeBlockHandle(footer[:]) if n == 0 { - r.err = errors.New("leveldb/table: Reader: invalid table (bad metaindex block handle)") - return r + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad metaindex block handle") + return r, nil } + // Decode the index block handle. - indexBH, n := decodeBlockHandle(footer[n:]) + r.indexBH, n = decodeBlockHandle(footer[n:]) if n == 0 { - r.err = errors.New("leveldb/table: Reader: invalid table (bad index block handle)") - return r - } - // Read index block. - r.indexBlock, r.err = r.readBlock(indexBH, true) - if r.err != nil { - return r + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad index block handle") + return r, nil } + // Read metaindex block. - metaBlock, err := r.readBlock(metaBH, true) + metaBlock, err := r.readBlock(r.metaBH, true) if err != nil { - r.err = err - return r + if errors.IsCorrupted(err) { + r.err = err + return r, nil + } else { + return nil, err + } } + // Set data end. - r.dataEnd = int64(metaBH.offset) - metaIter := metaBlock.newIterator(nil, false, nil) + r.dataEnd = int64(r.metaBH.offset) + + // Read metaindex. + metaIter := r.newBlockIter(metaBlock, nil, nil, true) for metaIter.Next() { key := string(metaIter.Key()) if !strings.HasPrefix(key, "filter.") { continue } fn := key[7:] - var filter filter.Filter if f0 := o.GetFilter(); f0 != nil && f0.Name() == fn { - filter = f0 + r.filter = f0 } else { for _, f0 := range o.GetAltFilters() { if f0.Name() == fn { - filter = f0 + r.filter = f0 break } } } - if filter != nil { + if r.filter != nil { filterBH, n := decodeBlockHandle(metaIter.Value()) if n == 0 { continue } + r.filterBH = filterBH // Update data end. r.dataEnd = int64(filterBH.offset) - filterBlock, err := r.readFilterBlock(filterBH, filter) - if err != nil { - continue - } - r.filterBlock = filterBlock break } } metaIter.Release() - return r + metaBlock.Release() + + // Cache index and filter block locally, since we don't have global cache. + if cache == nil { + r.indexBlock, err = r.readBlock(r.indexBH, true) + if err != nil { + if errors.IsCorrupted(err) { + r.err = err + return r, nil + } else { + return nil, err + } + } + if r.filter != nil { + r.filterBlock, err = r.readFilterBlock(r.filterBH) + if err != nil { + if !errors.IsCorrupted(err) { + return nil, err + } + + // Don't use filter then. + r.filter = nil + } + } + } + + return r, nil } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table.go index c0ac70d9e..beacdc1f0 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table.go @@ -133,9 +133,9 @@ Filter block trailer: +- 4-bytes -+ / \ - +---------------+---------------+---------------+-------------------------+------------------+ - | offset 1 | .... | offset n | filter offset (4-bytes) | base Lg (1-byte) | - +-------------- +---------------+---------------+-------------------------+------------------+ + +---------------+---------------+---------------+-------------------------------+------------------+ + | data 1 offset | .... | data n offset | data-offsets offset (4-bytes) | base Lg (1-byte) | + +-------------- +---------------+---------------+-------------------------------+------------------+ NOTE: All fixed-length integer are little-endian. diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_suite_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_suite_test.go index bc9eb83cc..6465da6e3 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_suite_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_suite_test.go @@ -3,15 +3,9 @@ package table import ( "testing" - . "github.com/onsi/ginkgo" - . "github.com/onsi/gomega" - "github.com/syndtr/goleveldb/leveldb/testutil" ) func TestTable(t *testing.T) { - testutil.RunDefer() - - RegisterFailHandler(Fail) - RunSpecs(t, "Table Suite") + testutil.RunSuite(t, "Table Suite") } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_test.go index d7d3b2a4b..4b59b31f5 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_test.go @@ -23,7 +23,7 @@ type tableWrapper struct { } func (t tableWrapper) TestFind(key []byte) (rkey, rvalue []byte, err error) { - return t.Reader.Find(key, nil) + return t.Reader.Find(key, false, nil) } func (t tableWrapper) TestGet(key []byte) (value []byte, err error) { @@ -59,7 +59,8 @@ var _ = testutil.Defer(func() { It("Should be able to approximate offset of a key correctly", func() { Expect(err).ShouldNot(HaveOccurred()) - tr := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, o) + tr, err := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, nil, o) + Expect(err).ShouldNot(HaveOccurred()) CheckOffset := func(key string, expect, threshold int) { offset, err := tr.OffsetOf([]byte(key)) Expect(err).ShouldNot(HaveOccurred()) @@ -95,7 +96,7 @@ var _ = testutil.Defer(func() { tw.Close() // Opening the table. - tr := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, o) + tr, _ := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, nil, o) return tableWrapper{tr} } Test := func(kv *testutil.KeyValue, body func(r *Reader)) func() { @@ -104,14 +105,16 @@ var _ = testutil.Defer(func() { if body != nil { body(db.(tableWrapper).Reader) } - testutil.KeyValueTesting(nil, db, *kv) + testutil.KeyValueTesting(nil, *kv, db, nil, nil) } } - testutil.AllKeyValueTesting(nil, Build) + testutil.AllKeyValueTesting(nil, Build, nil, nil) Describe("with one key per block", Test(testutil.KeyValue_Generate(nil, 9, 1, 10, 512, 512), func(r *Reader) { It("should have correct blocks number", func() { - Expect(r.indexBlock.restartsLen).Should(Equal(9)) + indexBlock, err := r.readBlock(r.indexBH, true) + Expect(err).To(BeNil()) + Expect(indexBlock.restartsLen).Should(Equal(9)) }) })) }) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/writer.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/writer.go index 4e19e93a9..274c95fad 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/writer.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/writer.go @@ -12,7 +12,7 @@ import ( "fmt" "io" - "code.google.com/p/snappy-go/snappy" + "github.com/syndtr/gosnappy/snappy" "github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/filter" diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/db.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/db.go index 4b87b5ef6..ec3f177a1 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/db.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/db.go @@ -12,6 +12,7 @@ import ( . "github.com/onsi/gomega" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/util" ) @@ -34,6 +35,10 @@ type Get interface { TestGet(key []byte) (value []byte, err error) } +type Has interface { + TestHas(key []byte) (ret bool, err error) +} + type NewIterator interface { TestNewIterator(slice *util.Range) iterator.Iterator } @@ -110,7 +115,7 @@ func (t *DBTesting) TestAllPresent() { func (t *DBTesting) TestDeletedKey(key []byte) { _, err := t.DB.TestGet(key) - Expect(err).Should(Equal(util.ErrNotFound), "Get on deleted key %q, %s", key, t.text()) + Expect(err).Should(Equal(errors.ErrNotFound), "Get on deleted key %q, %s", key, t.text()) } func (t *DBTesting) TestAllDeleted() { @@ -212,5 +217,6 @@ func DoDBTesting(t *DBTesting) { } DoIteratorTesting(&it) + iter.Release() } } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/ginkgo.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/ginkgo.go new file mode 100644 index 000000000..82f3d0e81 --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/ginkgo.go @@ -0,0 +1,21 @@ +package testutil + +import ( + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" +) + +func RunSuite(t GinkgoTestingT, name string) { + RunDefer() + + SynchronizedBeforeSuite(func() []byte { + RunDefer("setup") + return nil + }, func(data []byte) {}) + SynchronizedAfterSuite(func() { + RunDefer("teardown") + }, func() {}) + + RegisterFailHandler(Fail) + RunSpecs(t, name) +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/kvtest.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/kvtest.go index 4fc75b6f2..a0b58f0e7 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/kvtest.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/kvtest.go @@ -13,16 +13,28 @@ import ( . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/util" ) -func KeyValueTesting(rnd *rand.Rand, p DB, kv KeyValue) { +func KeyValueTesting(rnd *rand.Rand, kv KeyValue, p DB, setup func(KeyValue) DB, teardown func(DB)) { if rnd == nil { rnd = NewRand() } - if db, ok := p.(Find); ok { - It("Should find all keys with Find", func() { + if p == nil { + BeforeEach(func() { + p = setup(kv) + }) + if teardown != nil { + AfterEach(func() { + teardown(p) + }) + } + } + + It("Should find all keys with Find", func() { + if db, ok := p.(Find); ok { ShuffledIndex(nil, kv.Len(), 1, func(i int) { key_, key, value := kv.IndexInexact(i) @@ -38,9 +50,11 @@ func KeyValueTesting(rnd *rand.Rand, p DB, kv KeyValue) { Expect(rkey).Should(Equal(key)) Expect(rvalue).Should(Equal(value), "Value for key %q (%q)", key_, key) }) - }) + } + }) - It("Should return error if the key is not present", func() { + It("Should return error if the key is not present", func() { + if db, ok := p.(Find); ok { var key []byte if kv.Len() > 0 { key_, _ := kv.Index(kv.Len() - 1) @@ -48,12 +62,12 @@ func KeyValueTesting(rnd *rand.Rand, p DB, kv KeyValue) { } rkey, _, err := db.TestFind(key) Expect(err).Should(HaveOccurred(), "Find for key %q yield key %q", key, rkey) - Expect(err).Should(Equal(util.ErrNotFound)) - }) - } + Expect(err).Should(Equal(errors.ErrNotFound)) + } + }) - if db, ok := p.(Get); ok { - It("Should only find exact key with Get", func() { + It("Should only find exact key with Get", func() { + if db, ok := p.(Get); ok { ShuffledIndex(nil, kv.Len(), 1, func(i int) { key_, key, value := kv.IndexInexact(i) @@ -66,14 +80,34 @@ func KeyValueTesting(rnd *rand.Rand, p DB, kv KeyValue) { if len(key_) > 0 { _, err = db.TestGet(key_) Expect(err).Should(HaveOccurred(), "Error for key %q", key_) - Expect(err).Should(Equal(util.ErrNotFound)) + Expect(err).Should(Equal(errors.ErrNotFound)) } }) - }) - } + } + }) + + It("Should only find present key with Has", func() { + if db, ok := p.(Has); ok { + ShuffledIndex(nil, kv.Len(), 1, func(i int) { + key_, key, _ := kv.IndexInexact(i) + + // Using exact key. + ret, err := db.TestHas(key) + Expect(err).ShouldNot(HaveOccurred(), "Error for key %q", key) + Expect(ret).Should(BeTrue(), "False for key %q", key) - if db, ok := p.(NewIterator); ok { - TestIter := func(r *util.Range, _kv KeyValue) { + // Using inexact key. + if len(key_) > 0 { + ret, err = db.TestHas(key_) + Expect(err).ShouldNot(HaveOccurred(), "Error for key %q", key_) + Expect(ret).ShouldNot(BeTrue(), "True for key %q", key) + } + }) + } + }) + + TestIter := func(r *util.Range, _kv KeyValue) { + if db, ok := p.(NewIterator); ok { iter := db.TestNewIterator(r) Expect(iter.Error()).ShouldNot(HaveOccurred()) @@ -83,46 +117,62 @@ func KeyValueTesting(rnd *rand.Rand, p DB, kv KeyValue) { } DoIteratorTesting(&t) + iter.Release() } + } - It("Should iterates and seeks correctly", func(done Done) { - TestIter(nil, kv.Clone()) - done <- true - }, 3.0) - - RandomIndex(rnd, kv.Len(), kv.Len(), func(i int) { - type slice struct { - r *util.Range - start, limit int - } + It("Should iterates and seeks correctly", func(done Done) { + TestIter(nil, kv.Clone()) + done <- true + }, 3.0) - key_, _, _ := kv.IndexInexact(i) - for _, x := range []slice{ - {&util.Range{Start: key_, Limit: nil}, i, kv.Len()}, - {&util.Range{Start: nil, Limit: key_}, 0, i}, - } { - It(fmt.Sprintf("Should iterates and seeks correctly of a slice %d .. %d", x.start, x.limit), func(done Done) { - TestIter(x.r, kv.Slice(x.start, x.limit)) - done <- true - }, 3.0) - } - }) + RandomIndex(rnd, kv.Len(), Min(kv.Len(), 50), func(i int) { + type slice struct { + r *util.Range + start, limit int + } - RandomRange(rnd, kv.Len(), kv.Len(), func(start, limit int) { - It(fmt.Sprintf("Should iterates and seeks correctly of a slice %d .. %d", start, limit), func(done Done) { - r := kv.Range(start, limit) - TestIter(&r, kv.Slice(start, limit)) + key_, _, _ := kv.IndexInexact(i) + for _, x := range []slice{ + {&util.Range{Start: key_, Limit: nil}, i, kv.Len()}, + {&util.Range{Start: nil, Limit: key_}, 0, i}, + } { + It(fmt.Sprintf("Should iterates and seeks correctly of a slice %d .. %d", x.start, x.limit), func(done Done) { + TestIter(x.r, kv.Slice(x.start, x.limit)) done <- true }, 3.0) - }) - } + } + }) + + RandomRange(rnd, kv.Len(), Min(kv.Len(), 50), func(start, limit int) { + It(fmt.Sprintf("Should iterates and seeks correctly of a slice %d .. %d", start, limit), func(done Done) { + r := kv.Range(start, limit) + TestIter(&r, kv.Slice(start, limit)) + done <- true + }, 3.0) + }) } -func AllKeyValueTesting(rnd *rand.Rand, body func(kv KeyValue) DB) { +func AllKeyValueTesting(rnd *rand.Rand, body, setup func(KeyValue) DB, teardown func(DB)) { Test := func(kv *KeyValue) func() { return func() { - db := body(*kv) - KeyValueTesting(rnd, db, *kv) + var p DB + if setup != nil { + Defer("setup", func() { + p = setup(*kv) + }) + } + if teardown != nil { + Defer("teardown", func() { + teardown(p) + }) + } + if body != nil { + p = body(*kv) + } + KeyValueTesting(rnd, *kv, p, func(KeyValue) DB { + return p + }, nil) } } @@ -133,4 +183,5 @@ func AllKeyValueTesting(rnd *rand.Rand, body func(kv KeyValue) DB) { Describe("with big value", Test(KeyValue_BigValue())) Describe("with special key", Test(KeyValue_SpecialKey())) Describe("with multiple key/value", Test(KeyValue_MultipleKeyValue())) + Describe("with generated key/value", Test(KeyValue_Generate(nil, 120, 1, 50, 10, 120))) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/storage.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/storage.go index 0f8d77a73..59c496d54 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/storage.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/storage.go @@ -397,6 +397,7 @@ func (s *Storage) logI(format string, args ...interface{}) { func (s *Storage) Log(str string) { s.log(1, "Log: "+str) + s.Storage.Log(str) } func (s *Storage) Lock() (r util.Releaser, err error) { diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/util.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/util.go index 38fe25d52..97c5294b1 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/util.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/util.go @@ -155,3 +155,17 @@ func RandomRange(rnd *rand.Rand, n, round int, fn func(start, limit int)) { } return } + +func Max(x, y int) int { + if x > y { + return x + } + return y +} + +func Min(x, y int) int { + if x < y { + return x + } + return y +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil_test.go index c1402fda3..25bf2b29f 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil_test.go @@ -34,6 +34,10 @@ func (t *testingDB) TestGet(key []byte) (value []byte, err error) { return t.Get(key, t.ro) } +func (t *testingDB) TestHas(key []byte) (ret bool, err error) { + return t.Has(key, t.ro) +} + func (t *testingDB) TestNewIterator(slice *util.Range) iterator.Iterator { return t.NewIterator(slice, t.ro) } @@ -48,6 +52,7 @@ func (t *testingDB) TestClose() { func newTestingDB(o *opt.Options, ro *opt.ReadOptions, wo *opt.WriteOptions) *testingDB { stor := testutil.NewStorage() db, err := Open(stor, o) + // FIXME: This may be called from outside It, which may cause panic. Expect(err).NotTo(HaveOccurred()) return &testingDB{ DB: db, diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util.go index a43d2e460..1a5bf71a3 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util.go @@ -14,10 +14,10 @@ import ( ) func shorten(str string) string { - if len(str) <= 4 { + if len(str) <= 8 { return str } - return str[:1] + ".." + str[len(str)-1:] + return str[:3] + ".." + str[len(str)-3:] } var bunits = [...]string{"", "Ki", "Mi", "Gi"} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go new file mode 100644 index 000000000..2b8453d75 --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go @@ -0,0 +1,238 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package util + +import ( + "fmt" + "sync" + "sync/atomic" + "time" +) + +type buffer struct { + b []byte + miss int +} + +// BufferPool is a 'buffer pool'. +type BufferPool struct { + pool [6]chan []byte + size [5]uint32 + sizeMiss [5]uint32 + sizeHalf [5]uint32 + baseline [4]int + baseline0 int + + mu sync.RWMutex + closed bool + closeC chan struct{} + + get uint32 + put uint32 + half uint32 + less uint32 + equal uint32 + greater uint32 + miss uint32 +} + +func (p *BufferPool) poolNum(n int) int { + if n <= p.baseline0 && n > p.baseline0/2 { + return 0 + } + for i, x := range p.baseline { + if n <= x { + return i + 1 + } + } + return len(p.baseline) + 1 +} + +// Get returns buffer with length of n. +func (p *BufferPool) Get(n int) []byte { + if p == nil { + return make([]byte, n) + } + + p.mu.RLock() + defer p.mu.RUnlock() + + if p.closed { + return make([]byte, n) + } + + atomic.AddUint32(&p.get, 1) + + poolNum := p.poolNum(n) + pool := p.pool[poolNum] + if poolNum == 0 { + // Fast path. + select { + case b := <-pool: + switch { + case cap(b) > n: + if cap(b)-n >= n { + atomic.AddUint32(&p.half, 1) + select { + case pool <- b: + default: + } + return make([]byte, n) + } else { + atomic.AddUint32(&p.less, 1) + return b[:n] + } + case cap(b) == n: + atomic.AddUint32(&p.equal, 1) + return b[:n] + default: + atomic.AddUint32(&p.greater, 1) + } + default: + atomic.AddUint32(&p.miss, 1) + } + + return make([]byte, n, p.baseline0) + } else { + sizePtr := &p.size[poolNum-1] + + select { + case b := <-pool: + switch { + case cap(b) > n: + if cap(b)-n >= n { + atomic.AddUint32(&p.half, 1) + sizeHalfPtr := &p.sizeHalf[poolNum-1] + if atomic.AddUint32(sizeHalfPtr, 1) == 20 { + atomic.StoreUint32(sizePtr, uint32(cap(b)/2)) + atomic.StoreUint32(sizeHalfPtr, 0) + } else { + select { + case pool <- b: + default: + } + } + return make([]byte, n) + } else { + atomic.AddUint32(&p.less, 1) + return b[:n] + } + case cap(b) == n: + atomic.AddUint32(&p.equal, 1) + return b[:n] + default: + atomic.AddUint32(&p.greater, 1) + if uint32(cap(b)) >= atomic.LoadUint32(sizePtr) { + select { + case pool <- b: + default: + } + } + } + default: + atomic.AddUint32(&p.miss, 1) + } + + if size := atomic.LoadUint32(sizePtr); uint32(n) > size { + if size == 0 { + atomic.CompareAndSwapUint32(sizePtr, 0, uint32(n)) + } else { + sizeMissPtr := &p.sizeMiss[poolNum-1] + if atomic.AddUint32(sizeMissPtr, 1) == 20 { + atomic.StoreUint32(sizePtr, uint32(n)) + atomic.StoreUint32(sizeMissPtr, 0) + } + } + return make([]byte, n) + } else { + return make([]byte, n, size) + } + } +} + +// Put adds given buffer to the pool. +func (p *BufferPool) Put(b []byte) { + if p == nil { + return + } + + p.mu.RLock() + defer p.mu.RUnlock() + + if p.closed { + return + } + + atomic.AddUint32(&p.put, 1) + + pool := p.pool[p.poolNum(cap(b))] + select { + case pool <- b: + default: + } + +} + +func (p *BufferPool) Close() { + if p == nil { + return + } + + p.mu.Lock() + if !p.closed { + p.closed = true + p.closeC <- struct{}{} + } + p.mu.Unlock() +} + +func (p *BufferPool) String() string { + if p == nil { + return "" + } + + return fmt.Sprintf("BufferPool{B·%d Z·%v Zm·%v Zh·%v G·%d P·%d H·%d <·%d =·%d >·%d M·%d}", + p.baseline0, p.size, p.sizeMiss, p.sizeHalf, p.get, p.put, p.half, p.less, p.equal, p.greater, p.miss) +} + +func (p *BufferPool) drain() { + ticker := time.NewTicker(2 * time.Second) + for { + select { + case <-ticker.C: + for _, ch := range p.pool { + select { + case <-ch: + default: + } + } + case <-p.closeC: + close(p.closeC) + for _, ch := range p.pool { + close(ch) + } + return + } + } +} + +// NewBufferPool creates a new initialized 'buffer pool'. +func NewBufferPool(baseline int) *BufferPool { + if baseline <= 0 { + panic("baseline can't be <= 0") + } + p := &BufferPool{ + baseline0: baseline, + baseline: [...]int{baseline / 4, baseline / 2, baseline * 2, baseline * 4}, + closeC: make(chan struct{}, 1), + } + for i, cap := range []int{2, 2, 4, 4, 2, 1} { + p.pool[i] = make(chan []byte, cap) + } + go p.drain() + return p +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/pool.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/pool.go new file mode 100644 index 000000000..1f7fdd41f --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/pool.go @@ -0,0 +1,21 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// +build go1.3 + +package util + +import ( + "sync" +) + +type Pool struct { + sync.Pool +} + +func NewPool(cap int) *Pool { + return &Pool{} +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/pool_legacy.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/pool_legacy.go new file mode 100644 index 000000000..27b8d03be --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/pool_legacy.go @@ -0,0 +1,33 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// +build !go1.3 + +package util + +type Pool struct { + pool chan interface{} +} + +func (p *Pool) Get() interface{} { + select { + case x := <-p.pool: + return x + default: + return nil + } +} + +func (p *Pool) Put(x interface{}) { + select { + case p.pool <- x: + default: + } +} + +func NewPool(cap int) *Pool { + return &Pool{pool: make(chan interface{}, cap)} +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/range.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/range.go index da0583123..85159583d 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/range.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/range.go @@ -14,3 +14,19 @@ type Range struct { // Limit of the key range, not include in the range. Limit []byte } + +// BytesPrefix returns key range that satisfy the given prefix. +// This only applicable for the standard 'bytes comparer'. +func BytesPrefix(prefix []byte) *Range { + var limit []byte + for i := len(prefix) - 1; i >= 0; i-- { + c := prefix[i] + if c < 0xff { + limit = make([]byte, i+1) + copy(limit, prefix) + limit[i] = c + 1 + break + } + } + return &Range{prefix, limit} +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/util.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/util.go index 229c7d41f..f35976865 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/util.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/util.go @@ -12,7 +12,8 @@ import ( ) var ( - ErrNotFound = errors.New("leveldb: not found") + ErrReleased = errors.New("leveldb: resource already relesed") + ErrHasReleaser = errors.New("leveldb: releaser already defined") ) // Releaser is the interface that wraps the basic Release method. @@ -27,23 +28,46 @@ type ReleaseSetter interface { // SetReleaser associates the given releaser to the resources. The // releaser will be called once coresponding resources released. // Calling SetReleaser with nil will clear the releaser. + // + // This will panic if a releaser already present or coresponding + // resource is already released. Releaser should be cleared first + // before assigned a new one. SetReleaser(releaser Releaser) } // BasicReleaser provides basic implementation of Releaser and ReleaseSetter. type BasicReleaser struct { releaser Releaser + released bool +} + +// Released returns whether Release method already called. +func (r *BasicReleaser) Released() bool { + return r.released } // Release implements Releaser.Release. func (r *BasicReleaser) Release() { - if r.releaser != nil { - r.releaser.Release() - r.releaser = nil + if !r.released { + if r.releaser != nil { + r.releaser.Release() + r.releaser = nil + } + r.released = true } } // SetReleaser implements ReleaseSetter.SetReleaser. func (r *BasicReleaser) SetReleaser(releaser Releaser) { + if r.released { + panic(ErrReleased) + } + if r.releaser != nil && releaser != nil { + panic(ErrHasReleaser) + } r.releaser = releaser } + +type NoopReleaser struct{} + +func (NoopReleaser) Release() {} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/version.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/version.go index 4c54d6480..88a52f53e 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/version.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/version.go @@ -7,7 +7,6 @@ package leveldb import ( - "errors" "sync/atomic" "unsafe" @@ -16,19 +15,6 @@ import ( "github.com/syndtr/goleveldb/leveldb/util" ) -var levelMaxSize [kNumLevels]float64 - -func init() { - // Precompute max size of each level - for level := range levelMaxSize { - res := float64(10 * 1048576) - for n := level; n > 1; n-- { - res *= 10 - } - levelMaxSize[level] = res - } -} - type tSet struct { level int table *tFile @@ -37,21 +23,26 @@ type tSet struct { type version struct { s *session - tables [kNumLevels]tFiles + tables []tFiles // Level that should be compacted next and its compaction score. - // Score < 1 means compaction is not strictly needed. These fields - // are initialized by ComputeCompaction() + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by computeCompaction() cLevel int cScore float64 cSeek unsafe.Pointer - ref int + ref int + // Succeeding version. next *version } -func (v *version) release_NB() { +func newVersion(s *session) *version { + return &version{s: s, tables: make([]tFiles, s.o.GetNumLevel())} +} + +func (v *version) releaseNB() { v.ref-- if v.ref > 0 { return @@ -60,8 +51,6 @@ func (v *version) release_NB() { panic("negative version ref") } - s := v.s - tables := make(map[uint64]bool) for _, tt := range v.next.tables { for _, t := range tt { @@ -74,145 +63,184 @@ func (v *version) release_NB() { for _, t := range tt { num := t.file.Num() if _, ok := tables[num]; !ok { - s.tops.remove(t) + v.s.tops.remove(t) } } } - v.next.release_NB() + v.next.releaseNB() v.next = nil } func (v *version) release() { v.s.vmu.Lock() - v.release_NB() + v.releaseNB() v.s.vmu.Unlock() } -func (v *version) get(key iKey, ro *opt.ReadOptions) (value []byte, cstate bool, err error) { - s := v.s - - ukey := key.ukey() +func (v *version) walkOverlapping(ikey iKey, f func(level int, t *tFile) bool, lf func(level int) bool) { + ukey := ikey.ukey() - var tset *tSet - tseek := true - - // We can search level-by-level since entries never hop across - // levels. Therefore we are guaranteed that if we find data - // in an smaller level, later levels are irrelevant. - for level, ts := range v.tables { - if len(ts) == 0 { + // Walk tables level-by-level. + for level, tables := range v.tables { + if len(tables) == 0 { continue } if level == 0 { // Level-0 files may overlap each other. Find all files that - // overlap user_key and process them in order from newest to - var tmp tFiles - for _, t := range ts { - if s.icmp.uCompare(ukey, t.min.ukey()) >= 0 && - s.icmp.uCompare(ukey, t.max.ukey()) <= 0 { - tmp = append(tmp, t) + // overlap ukey. + for _, t := range tables { + if t.overlaps(v.s.icmp, ukey, ukey) { + if !f(level, t) { + return + } } } - - if len(tmp) == 0 { - continue - } - - tmp.sortByNum() - ts = tmp } else { - i := ts.searchMax(key, s.icmp) - if i >= len(ts) || s.icmp.uCompare(ukey, ts[i].min.ukey()) < 0 { - continue + if i := tables.searchMax(v.s.icmp, ikey); i < len(tables) { + t := tables[i] + if v.s.icmp.uCompare(ukey, t.imin.ukey()) >= 0 { + if !f(level, t) { + return + } + } } + } - ts = ts[i : i+1] + if lf != nil && !lf(level) { + return } + } +} - var l0found bool - var l0seq uint64 - var l0type vType - var l0value []byte - for _, t := range ts { - if tseek { - if tset == nil { - tset = &tSet{level, t} - } else if tset.table.incrSeek() <= 0 { - cstate = atomic.CompareAndSwapPointer(&v.cSeek, nil, unsafe.Pointer(tset)) - tseek = false - } - } +func (v *version) get(ikey iKey, ro *opt.ReadOptions, noValue bool) (value []byte, tcomp bool, err error) { + ukey := ikey.ukey() - var _rkey, rval []byte - _rkey, rval, err = s.tops.get(t, key, ro) - if err == ErrNotFound { - continue - } else if err != nil { - return + var ( + tset *tSet + tseek bool + + // Level-0. + zfound bool + zseq uint64 + zkt kType + zval []byte + ) + + err = ErrNotFound + + // Since entries never hope across level, finding key/value + // in smaller level make later levels irrelevant. + v.walkOverlapping(ikey, func(level int, t *tFile) bool { + if !tseek { + if tset == nil { + tset = &tSet{level, t} + } else { + tseek = true } + } - rkey := iKey(_rkey) - if seq, t, ok := rkey.parseNum(); ok { - if s.icmp.uCompare(ukey, rkey.ukey()) == 0 { - if level == 0 { - if seq >= l0seq { - l0found = true - l0seq = seq - l0type = t - l0value = rval - } - } else { - switch t { - case tVal: - value = rval - case tDel: - err = ErrNotFound - default: - panic("invalid type") - } - return + var ( + fikey, fval []byte + ferr error + ) + if noValue { + fikey, ferr = v.s.tops.findKey(t, ikey, ro) + } else { + fikey, fval, ferr = v.s.tops.find(t, ikey, ro) + } + switch ferr { + case nil: + case ErrNotFound: + return true + default: + err = ferr + return false + } + + if fukey, fseq, fkt, fkerr := parseIkey(fikey); fkerr == nil { + if v.s.icmp.uCompare(ukey, fukey) == 0 { + if level == 0 { + if fseq >= zseq { + zfound = true + zseq = fseq + zkt = fkt + zval = fval } + } else { + switch fkt { + case ktVal: + value = fval + err = nil + case ktDel: + default: + panic("leveldb: invalid iKey type") + } + return false } - } else { - err = errors.New("leveldb: internal key corrupted") - return } + } else { + err = fkerr + return false } - if level == 0 && l0found { - switch l0type { - case tVal: - value = l0value - case tDel: - err = ErrNotFound + + return true + }, func(level int) bool { + if zfound { + switch zkt { + case ktVal: + value = zval + err = nil + case ktDel: default: - panic("invalid type") + panic("leveldb: invalid iKey type") } - return + return false } + + return true + }) + + if tseek && tset.table.consumeSeek() <= 0 { + tcomp = atomic.CompareAndSwapPointer(&v.cSeek, nil, unsafe.Pointer(tset)) } - err = ErrNotFound return } -func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []iterator.Iterator) { - s := v.s +func (v *version) sampleSeek(ikey iKey) (tcomp bool) { + var tset *tSet + v.walkOverlapping(ikey, func(level int, t *tFile) bool { + if tset == nil { + tset = &tSet{level, t} + return true + } else { + if tset.table.consumeSeek() <= 0 { + tcomp = atomic.CompareAndSwapPointer(&v.cSeek, nil, unsafe.Pointer(tset)) + } + return false + } + }, nil) + + return +} + +func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []iterator.Iterator) { // Merge all level zero files together since they may overlap for _, t := range v.tables[0] { - it := s.tops.newIterator(t, slice, ro) + it := v.s.tops.newIterator(t, slice, ro) its = append(its, it) } - strict := s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator) - for _, tt := range v.tables[1:] { - if len(tt) == 0 { + strict := opt.GetStrict(v.s.o.Options, ro, opt.StrictReader) + for _, tables := range v.tables[1:] { + if len(tables) == 0 { continue } - it := iterator.NewIndexedIterator(tt.newIndexIterator(s.tops, s.icmp, slice, ro), strict, true) + it := iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict) its = append(its, it) } @@ -220,7 +248,7 @@ func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []it } func (v *version) newStaging() *versionStaging { - return &versionStaging{base: v} + return &versionStaging{base: v, tables: make([]tablesScratch, v.s.o.GetNumLevel())} } // Spawn a new version based on this version. @@ -242,25 +270,25 @@ func (v *version) tLen(level int) int { return len(v.tables[level]) } -func (v *version) offsetOf(key iKey) (n uint64, err error) { - for level, tt := range v.tables { - for _, t := range tt { - if v.s.icmp.Compare(t.max, key) <= 0 { - // Entire file is before "key", so just add the file size +func (v *version) offsetOf(ikey iKey) (n uint64, err error) { + for level, tables := range v.tables { + for _, t := range tables { + if v.s.icmp.Compare(t.imax, ikey) <= 0 { + // Entire file is before "ikey", so just add the file size n += t.size - } else if v.s.icmp.Compare(t.min, key) > 0 { - // Entire file is after "key", so ignore + } else if v.s.icmp.Compare(t.imin, ikey) > 0 { + // Entire file is after "ikey", so ignore if level > 0 { // Files other than level 0 are sorted by meta->min, so // no further files in this level will contain data for - // "key". + // "ikey". break } } else { - // "key" falls in the range for this table. Add the - // approximate offset of "key" within the table. + // "ikey" falls in the range for this table. Add the + // approximate offset of "ikey" within the table. var nn uint64 - nn, err = v.s.tops.offsetOf(t, key) + nn, err = v.s.tops.offsetOf(t, ikey) if err != nil { return 0, err } @@ -272,15 +300,16 @@ func (v *version) offsetOf(key iKey) (n uint64, err error) { return } -func (v *version) pickLevel(min, max []byte) (level int) { - if !v.tables[0].isOverlaps(min, max, false, v.s.icmp) { - var r tFiles - for ; level < kMaxMemCompactLevel; level++ { - if v.tables[level+1].isOverlaps(min, max, true, v.s.icmp) { +func (v *version) pickLevel(umin, umax []byte) (level int) { + if !v.tables[0].overlaps(v.s.icmp, umin, umax, true) { + var overlaps tFiles + maxLevel := v.s.o.GetMaxMemCompationLevel() + for ; level < maxLevel; level++ { + if v.tables[level+1].overlaps(v.s.icmp, umin, umax, false) { break } - v.tables[level+2].getOverlaps(min, max, &r, true, v.s.icmp.ucmp) - if r.size() > kMaxGrandParentOverlapBytes { + overlaps = v.tables[level+2].getOverlaps(overlaps, v.s.icmp, umin, umax, false) + if overlaps.size() > uint64(v.s.o.GetCompactionGPOverlaps(level)) { break } } @@ -294,7 +323,7 @@ func (v *version) computeCompaction() { var bestLevel int = -1 var bestScore float64 = -1 - for level, ff := range v.tables { + for level, tables := range v.tables { var score float64 if level == 0 { // We treat level-0 specially by bounding the number of files @@ -308,9 +337,9 @@ func (v *version) computeCompaction() { // file size is small (perhaps because of a small write-buffer // setting, or very high compression ratios, or lots of // overwrites/deletions). - score = float64(len(ff)) / kL0_CompactionTrigger + score = float64(len(tables)) / float64(v.s.o.GetCompactionL0Trigger()) } else { - score = float64(ff.size()) / levelMaxSize[level] + score = float64(tables.size()) / float64(v.s.o.GetCompactionTotalSize(level)) } if score > bestScore { @@ -327,66 +356,62 @@ func (v *version) needCompaction() bool { return v.cScore >= 1 || atomic.LoadPointer(&v.cSeek) != nil } +type tablesScratch struct { + added map[uint64]atRecord + deleted map[uint64]struct{} +} + type versionStaging struct { base *version - tables [kNumLevels]struct { - added map[uint64]ntRecord - deleted map[uint64]struct{} - } + tables []tablesScratch } func (p *versionStaging) commit(r *sessionRecord) { - btt := p.base.tables - - // deleted tables - for _, tr := range r.deletedTables { - tm := &(p.tables[tr.level]) + // Deleted tables. + for _, r := range r.deletedTables { + tm := &(p.tables[r.level]) - bt := btt[tr.level] - if len(bt) > 0 { + if len(p.base.tables[r.level]) > 0 { if tm.deleted == nil { tm.deleted = make(map[uint64]struct{}) } - tm.deleted[tr.num] = struct{}{} + tm.deleted[r.num] = struct{}{} } if tm.added != nil { - delete(tm.added, tr.num) + delete(tm.added, r.num) } } - // new tables - for _, tr := range r.addedTables { - tm := &(p.tables[tr.level]) + // New tables. + for _, r := range r.addedTables { + tm := &(p.tables[r.level]) if tm.added == nil { - tm.added = make(map[uint64]ntRecord) + tm.added = make(map[uint64]atRecord) } - tm.added[tr.num] = tr + tm.added[r.num] = r if tm.deleted != nil { - delete(tm.deleted, tr.num) + delete(tm.deleted, r.num) } } } func (p *versionStaging) finish() *version { - s := p.base.s - btt := p.base.tables - - // build new version - nv := &version{s: s} + // Build new version. + nv := newVersion(p.base.s) for level, tm := range p.tables { - bt := btt[level] + btables := p.base.tables[level] - n := len(bt) + len(tm.added) - len(tm.deleted) + n := len(btables) + len(tm.added) - len(tm.deleted) if n < 0 { n = 0 } nt := make(tFiles, 0, n) - // base tables - for _, t := range bt { + // Base tables. + for _, t := range btables { if _, ok := tm.deleted[t.file.Num()]; ok { continue } @@ -396,17 +421,21 @@ func (p *versionStaging) finish() *version { nt = append(nt, t) } - // new tables - for _, tr := range tm.added { - nt = append(nt, tr.makeFile(s)) + // New tables. + for _, r := range tm.added { + nt = append(nt, p.base.s.tableFileFromRecord(r)) } - // sort tables - nt.sortByKey(s.icmp) + // Sort tables. + if level == 0 { + nt.sortByNum() + } else { + nt.sortByKey(p.base.s.icmp) + } nv.tables[level] = nt } - // compute compaction score for new version + // Compute compaction score for new version. nv.computeCompaction() return nv @@ -421,7 +450,7 @@ func (vr *versionReleaser) Release() { v := vr.v v.s.vmu.Lock() if !vr.once { - v.release_NB() + v.releaseNB() vr.once = true } v.s.vmu.Unlock() diff --git a/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/decode.go b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/decode.go new file mode 100644 index 000000000..552a17bfb --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/decode.go @@ -0,0 +1,292 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package snappy + +import ( + "encoding/binary" + "errors" + "io" +) + +var ( + // ErrCorrupt reports that the input is invalid. + ErrCorrupt = errors.New("snappy: corrupt input") + // ErrUnsupported reports that the input isn't supported. + ErrUnsupported = errors.New("snappy: unsupported input") +) + +// DecodedLen returns the length of the decoded block. +func DecodedLen(src []byte) (int, error) { + v, _, err := decodedLen(src) + return v, err +} + +// decodedLen returns the length of the decoded block and the number of bytes +// that the length header occupied. +func decodedLen(src []byte) (blockLen, headerLen int, err error) { + v, n := binary.Uvarint(src) + if n == 0 { + return 0, 0, ErrCorrupt + } + if uint64(int(v)) != v { + return 0, 0, errors.New("snappy: decoded block is too large") + } + return int(v), n, nil +} + +// Decode returns the decoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// It is valid to pass a nil dst. +func Decode(dst, src []byte) ([]byte, error) { + dLen, s, err := decodedLen(src) + if err != nil { + return nil, err + } + if len(dst) < dLen { + dst = make([]byte, dLen) + } + + var d, offset, length int + for s < len(src) { + switch src[s] & 0x03 { + case tagLiteral: + x := uint(src[s] >> 2) + switch { + case x < 60: + s += 1 + case x == 60: + s += 2 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-1]) + case x == 61: + s += 3 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-2]) | uint(src[s-1])<<8 + case x == 62: + s += 4 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-3]) | uint(src[s-2])<<8 | uint(src[s-1])<<16 + case x == 63: + s += 5 + if s > len(src) { + return nil, ErrCorrupt + } + x = uint(src[s-4]) | uint(src[s-3])<<8 | uint(src[s-2])<<16 | uint(src[s-1])<<24 + } + length = int(x + 1) + if length <= 0 { + return nil, errors.New("snappy: unsupported literal length") + } + if length > len(dst)-d || length > len(src)-s { + return nil, ErrCorrupt + } + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + if s > len(src) { + return nil, ErrCorrupt + } + length = 4 + int(src[s-2])>>2&0x7 + offset = int(src[s-2])&0xe0<<3 | int(src[s-1]) + + case tagCopy2: + s += 3 + if s > len(src) { + return nil, ErrCorrupt + } + length = 1 + int(src[s-3])>>2 + offset = int(src[s-2]) | int(src[s-1])<<8 + + case tagCopy4: + return nil, errors.New("snappy: unsupported COPY_4 tag") + } + + end := d + length + if offset > d || end > len(dst) { + return nil, ErrCorrupt + } + for ; d < end; d++ { + dst[d] = dst[d-offset] + } + } + if d != dLen { + return nil, ErrCorrupt + } + return dst[:d], nil +} + +// NewReader returns a new Reader that decompresses from r, using the framing +// format described at +// https://code.google.com/p/snappy/source/browse/trunk/framing_format.txt +func NewReader(r io.Reader) *Reader { + return &Reader{ + r: r, + decoded: make([]byte, maxUncompressedChunkLen), + buf: make([]byte, MaxEncodedLen(maxUncompressedChunkLen)+checksumSize), + } +} + +// Reader is an io.Reader than can read Snappy-compressed bytes. +type Reader struct { + r io.Reader + err error + decoded []byte + buf []byte + // decoded[i:j] contains decoded bytes that have not yet been passed on. + i, j int + readHeader bool +} + +// Reset discards any buffered data, resets all state, and switches the Snappy +// reader to read from r. This permits reusing a Reader rather than allocating +// a new one. +func (r *Reader) Reset(reader io.Reader) { + r.r = reader + r.err = nil + r.i = 0 + r.j = 0 + r.readHeader = false +} + +func (r *Reader) readFull(p []byte) (ok bool) { + if _, r.err = io.ReadFull(r.r, p); r.err != nil { + if r.err == io.ErrUnexpectedEOF { + r.err = ErrCorrupt + } + return false + } + return true +} + +// Read satisfies the io.Reader interface. +func (r *Reader) Read(p []byte) (int, error) { + if r.err != nil { + return 0, r.err + } + for { + if r.i < r.j { + n := copy(p, r.decoded[r.i:r.j]) + r.i += n + return n, nil + } + if !r.readFull(r.buf[:4]) { + return 0, r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return 0, r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + if chunkLen > len(r.buf) { + r.err = ErrUnsupported + return 0, r.err + } + + // The chunk types are specified at + // https://code.google.com/p/snappy/source/browse/trunk/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if n > len(r.decoded) { + r.err = ErrCorrupt + return 0, r.err + } + if _, err := Decode(r.decoded, buf); err != nil { + r.err = err + return 0, r.err + } + if crc(r.decoded[:n]) != checksum { + r.err = ErrCorrupt + return 0, r.err + } + r.i, r.j = 0, n + continue + + case chunkTypeUncompressedData: + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + buf := r.buf[:checksumSize] + if !r.readFull(buf) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read directly into r.decoded instead of via r.buf. + n := chunkLen - checksumSize + if !r.readFull(r.decoded[:n]) { + return 0, r.err + } + if crc(r.decoded[:n]) != checksum { + r.err = ErrCorrupt + return 0, r.err + } + r.i, r.j = 0, n + continue + + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return 0, r.err + } + if !r.readFull(r.buf[:len(magicBody)]) { + return 0, r.err + } + for i := 0; i < len(magicBody); i++ { + if r.buf[i] != magicBody[i] { + r.err = ErrCorrupt + return 0, r.err + } + } + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + r.err = ErrUnsupported + return 0, r.err + + } else { + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if !r.readFull(r.buf[:chunkLen]) { + return 0, r.err + } + } + } +} diff --git a/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/encode.go b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/encode.go new file mode 100644 index 000000000..dda372422 --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/encode.go @@ -0,0 +1,258 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package snappy + +import ( + "encoding/binary" + "io" +) + +// We limit how far copy back-references can go, the same as the C++ code. +const maxOffset = 1 << 15 + +// emitLiteral writes a literal chunk and returns the number of bytes written. +func emitLiteral(dst, lit []byte) int { + i, n := 0, uint(len(lit)-1) + switch { + case n < 60: + dst[0] = uint8(n)<<2 | tagLiteral + i = 1 + case n < 1<<8: + dst[0] = 60<<2 | tagLiteral + dst[1] = uint8(n) + i = 2 + case n < 1<<16: + dst[0] = 61<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + i = 3 + case n < 1<<24: + dst[0] = 62<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + dst[3] = uint8(n >> 16) + i = 4 + case int64(n) < 1<<32: + dst[0] = 63<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + dst[3] = uint8(n >> 16) + dst[4] = uint8(n >> 24) + i = 5 + default: + panic("snappy: source buffer is too long") + } + if copy(dst[i:], lit) != len(lit) { + panic("snappy: destination buffer is too short") + } + return i + len(lit) +} + +// emitCopy writes a copy chunk and returns the number of bytes written. +func emitCopy(dst []byte, offset, length int) int { + i := 0 + for length > 0 { + x := length - 4 + if 0 <= x && x < 1<<3 && offset < 1<<11 { + dst[i+0] = uint8(offset>>8)&0x07<<5 | uint8(x)<<2 | tagCopy1 + dst[i+1] = uint8(offset) + i += 2 + break + } + + x = length + if x > 1<<6 { + x = 1 << 6 + } + dst[i+0] = uint8(x-1)<<2 | tagCopy2 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + i += 3 + length -= x + } + return i +} + +// Encode returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// It is valid to pass a nil dst. +func Encode(dst, src []byte) ([]byte, error) { + if n := MaxEncodedLen(len(src)); len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + // Return early if src is short. + if len(src) <= 4 { + if len(src) != 0 { + d += emitLiteral(dst[d:], src) + } + return dst[:d], nil + } + + // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. + const maxTableSize = 1 << 14 + shift, tableSize := uint(32-8), 1<<8 + for tableSize < maxTableSize && tableSize < len(src) { + shift-- + tableSize *= 2 + } + var table [maxTableSize]int + + // Iterate over the source bytes. + var ( + s int // The iterator position. + t int // The last position with the same hash as s. + lit int // The start position of any pending literal bytes. + ) + for s+3 < len(src) { + // Update the hash table. + b0, b1, b2, b3 := src[s], src[s+1], src[s+2], src[s+3] + h := uint32(b0) | uint32(b1)<<8 | uint32(b2)<<16 | uint32(b3)<<24 + p := &table[(h*0x1e35a7bd)>>shift] + // We need to to store values in [-1, inf) in table. To save + // some initialization time, (re)use the table's zero value + // and shift the values against this zero: add 1 on writes, + // subtract 1 on reads. + t, *p = *p-1, s+1 + // If t is invalid or src[s:s+4] differs from src[t:t+4], accumulate a literal byte. + if t < 0 || s-t >= maxOffset || b0 != src[t] || b1 != src[t+1] || b2 != src[t+2] || b3 != src[t+3] { + s++ + continue + } + // Otherwise, we have a match. First, emit any pending literal bytes. + if lit != s { + d += emitLiteral(dst[d:], src[lit:s]) + } + // Extend the match to be as long as possible. + s0 := s + s, t = s+4, t+4 + for s < len(src) && src[s] == src[t] { + s++ + t++ + } + // Emit the copied bytes. + d += emitCopy(dst[d:], s-t, s-s0) + lit = s + } + + // Emit any final pending literal bytes and return. + if lit != len(src) { + d += emitLiteral(dst[d:], src[lit:]) + } + return dst[:d], nil +} + +// MaxEncodedLen returns the maximum length of a snappy block, given its +// uncompressed length. +func MaxEncodedLen(srcLen int) int { + // Compressed data can be defined as: + // compressed := item* literal* + // item := literal* copy + // + // The trailing literal sequence has a space blowup of at most 62/60 + // since a literal of length 60 needs one tag byte + one extra byte + // for length information. + // + // Item blowup is trickier to measure. Suppose the "copy" op copies + // 4 bytes of data. Because of a special check in the encoding code, + // we produce a 4-byte copy only if the offset is < 65536. Therefore + // the copy op takes 3 bytes to encode, and this type of item leads + // to at most the 62/60 blowup for representing literals. + // + // Suppose the "copy" op copies 5 bytes of data. If the offset is big + // enough, it will take 5 bytes to encode the copy op. Therefore the + // worst case here is a one-byte literal followed by a five-byte copy. + // That is, 6 bytes of input turn into 7 bytes of "compressed" data. + // + // This last factor dominates the blowup, so the final estimate is: + return 32 + srcLen + srcLen/6 +} + +// NewWriter returns a new Writer that compresses to w, using the framing +// format described at +// https://code.google.com/p/snappy/source/browse/trunk/framing_format.txt +func NewWriter(w io.Writer) *Writer { + return &Writer{ + w: w, + enc: make([]byte, MaxEncodedLen(maxUncompressedChunkLen)), + } +} + +// Writer is an io.Writer than can write Snappy-compressed bytes. +type Writer struct { + w io.Writer + err error + enc []byte + buf [checksumSize + chunkHeaderSize]byte + wroteHeader bool +} + +// Reset discards the writer's state and switches the Snappy writer to write to +// w. This permits reusing a Writer rather than allocating a new one. +func (w *Writer) Reset(writer io.Writer) { + w.w = writer + w.err = nil + w.wroteHeader = false +} + +// Write satisfies the io.Writer interface. +func (w *Writer) Write(p []byte) (n int, errRet error) { + if w.err != nil { + return 0, w.err + } + if !w.wroteHeader { + copy(w.enc, magicChunk) + if _, err := w.w.Write(w.enc[:len(magicChunk)]); err != nil { + w.err = err + return n, err + } + w.wroteHeader = true + } + for len(p) > 0 { + var uncompressed []byte + if len(p) > maxUncompressedChunkLen { + uncompressed, p = p[:maxUncompressedChunkLen], p[maxUncompressedChunkLen:] + } else { + uncompressed, p = p, nil + } + checksum := crc(uncompressed) + + // Compress the buffer, discarding the result if the improvement + // isn't at least 12.5%. + chunkType := uint8(chunkTypeCompressedData) + chunkBody, err := Encode(w.enc, uncompressed) + if err != nil { + w.err = err + return n, err + } + if len(chunkBody) >= len(uncompressed)-len(uncompressed)/8 { + chunkType, chunkBody = chunkTypeUncompressedData, uncompressed + } + + chunkLen := 4 + len(chunkBody) + w.buf[0] = chunkType + w.buf[1] = uint8(chunkLen >> 0) + w.buf[2] = uint8(chunkLen >> 8) + w.buf[3] = uint8(chunkLen >> 16) + w.buf[4] = uint8(checksum >> 0) + w.buf[5] = uint8(checksum >> 8) + w.buf[6] = uint8(checksum >> 16) + w.buf[7] = uint8(checksum >> 24) + if _, err = w.w.Write(w.buf[:]); err != nil { + w.err = err + return n, err + } + if _, err = w.w.Write(chunkBody); err != nil { + w.err = err + return n, err + } + n += len(uncompressed) + } + return n, nil +} diff --git a/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy.go b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy.go new file mode 100644 index 000000000..043bf3d81 --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy.go @@ -0,0 +1,68 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package snappy implements the snappy block-based compression format. +// It aims for very high speeds and reasonable compression. +// +// The C++ snappy implementation is at http://code.google.com/p/snappy/ +package snappy + +import ( + "hash/crc32" +) + +/* +Each encoded block begins with the varint-encoded length of the decoded data, +followed by a sequence of chunks. Chunks begin and end on byte boundaries. The +first byte of each chunk is broken into its 2 least and 6 most significant bits +called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. +Zero means a literal tag. All other values mean a copy tag. + +For literal tags: + - If m < 60, the next 1 + m bytes are literal bytes. + - Otherwise, let n be the little-endian unsigned integer denoted by the next + m - 59 bytes. The next 1 + n bytes after that are literal bytes. + +For copy tags, length bytes are copied from offset bytes ago, in the style of +Lempel-Ziv compression algorithms. In particular: + - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12). + The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10 + of the offset. The next byte is bits 0-7 of the offset. + - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65). + The length is 1 + m. The offset is the little-endian unsigned integer + denoted by the next 2 bytes. + - For l == 3, this tag is a legacy format that is no longer supported. +*/ +const ( + tagLiteral = 0x00 + tagCopy1 = 0x01 + tagCopy2 = 0x02 + tagCopy4 = 0x03 +) + +const ( + checksumSize = 4 + chunkHeaderSize = 4 + magicChunk = "\xff\x06\x00\x00" + magicBody + magicBody = "sNaPpY" + // https://code.google.com/p/snappy/source/browse/trunk/framing_format.txt says + // that "the uncompressed data in a chunk must be no longer than 65536 bytes". + maxUncompressedChunkLen = 65536 +) + +const ( + chunkTypeCompressedData = 0x00 + chunkTypeUncompressedData = 0x01 + chunkTypePadding = 0xfe + chunkTypeStreamIdentifier = 0xff +) + +var crcTable = crc32.MakeTable(crc32.Castagnoli) + +// crc implements the checksum specified in section 3 of +// https://code.google.com/p/snappy/source/browse/trunk/framing_format.txt +func crc(b []byte) uint32 { + c := crc32.Update(0, crcTable, b) + return uint32(c>>15|c<<17) + 0xa282ead8 +} diff --git a/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy_test.go b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy_test.go new file mode 100644 index 000000000..0623385b7 --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy_test.go @@ -0,0 +1,364 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package snappy + +import ( + "bytes" + "flag" + "fmt" + "io" + "io/ioutil" + "math/rand" + "net/http" + "os" + "path/filepath" + "strings" + "testing" +) + +var ( + download = flag.Bool("download", false, "If true, download any missing files before running benchmarks") + testdata = flag.String("testdata", "testdata", "Directory containing the test data") +) + +func roundtrip(b, ebuf, dbuf []byte) error { + e, err := Encode(ebuf, b) + if err != nil { + return fmt.Errorf("encoding error: %v", err) + } + d, err := Decode(dbuf, e) + if err != nil { + return fmt.Errorf("decoding error: %v", err) + } + if !bytes.Equal(b, d) { + return fmt.Errorf("roundtrip mismatch:\n\twant %v\n\tgot %v", b, d) + } + return nil +} + +func TestEmpty(t *testing.T) { + if err := roundtrip(nil, nil, nil); err != nil { + t.Fatal(err) + } +} + +func TestSmallCopy(t *testing.T) { + for _, ebuf := range [][]byte{nil, make([]byte, 20), make([]byte, 64)} { + for _, dbuf := range [][]byte{nil, make([]byte, 20), make([]byte, 64)} { + for i := 0; i < 32; i++ { + s := "aaaa" + strings.Repeat("b", i) + "aaaabbbb" + if err := roundtrip([]byte(s), ebuf, dbuf); err != nil { + t.Errorf("len(ebuf)=%d, len(dbuf)=%d, i=%d: %v", len(ebuf), len(dbuf), i, err) + } + } + } + } +} + +func TestSmallRand(t *testing.T) { + rng := rand.New(rand.NewSource(27354294)) + for n := 1; n < 20000; n += 23 { + b := make([]byte, n) + for i := range b { + b[i] = uint8(rng.Uint32()) + } + if err := roundtrip(b, nil, nil); err != nil { + t.Fatal(err) + } + } +} + +func TestSmallRegular(t *testing.T) { + for n := 1; n < 20000; n += 23 { + b := make([]byte, n) + for i := range b { + b[i] = uint8(i%10 + 'a') + } + if err := roundtrip(b, nil, nil); err != nil { + t.Fatal(err) + } + } +} + +func cmp(a, b []byte) error { + if len(a) != len(b) { + return fmt.Errorf("got %d bytes, want %d", len(a), len(b)) + } + for i := range a { + if a[i] != b[i] { + return fmt.Errorf("byte #%d: got 0x%02x, want 0x%02x", i, a[i], b[i]) + } + } + return nil +} + +func TestFramingFormat(t *testing.T) { + // src is comprised of alternating 1e5-sized sequences of random + // (incompressible) bytes and repeated (compressible) bytes. 1e5 was chosen + // because it is larger than maxUncompressedChunkLen (64k). + src := make([]byte, 1e6) + rng := rand.New(rand.NewSource(1)) + for i := 0; i < 10; i++ { + if i%2 == 0 { + for j := 0; j < 1e5; j++ { + src[1e5*i+j] = uint8(rng.Intn(256)) + } + } else { + for j := 0; j < 1e5; j++ { + src[1e5*i+j] = uint8(i) + } + } + } + + buf := new(bytes.Buffer) + if _, err := NewWriter(buf).Write(src); err != nil { + t.Fatalf("Write: encoding: %v", err) + } + dst, err := ioutil.ReadAll(NewReader(buf)) + if err != nil { + t.Fatalf("ReadAll: decoding: %v", err) + } + if err := cmp(dst, src); err != nil { + t.Fatal(err) + } +} + +func TestReaderReset(t *testing.T) { + gold := bytes.Repeat([]byte("All that is gold does not glitter,\n"), 10000) + buf := new(bytes.Buffer) + if _, err := NewWriter(buf).Write(gold); err != nil { + t.Fatalf("Write: %v", err) + } + encoded, invalid, partial := buf.String(), "invalid", "partial" + r := NewReader(nil) + for i, s := range []string{encoded, invalid, partial, encoded, partial, invalid, encoded, encoded} { + if s == partial { + r.Reset(strings.NewReader(encoded)) + if _, err := r.Read(make([]byte, 101)); err != nil { + t.Errorf("#%d: %v", i, err) + continue + } + continue + } + r.Reset(strings.NewReader(s)) + got, err := ioutil.ReadAll(r) + switch s { + case encoded: + if err != nil { + t.Errorf("#%d: %v", i, err) + continue + } + if err := cmp(got, gold); err != nil { + t.Errorf("#%d: %v", i, err) + continue + } + case invalid: + if err == nil { + t.Errorf("#%d: got nil error, want non-nil", i) + continue + } + } + } +} + +func TestWriterReset(t *testing.T) { + gold := bytes.Repeat([]byte("Not all those who wander are lost;\n"), 10000) + var gots, wants [][]byte + const n = 20 + w, failed := NewWriter(nil), false + for i := 0; i <= n; i++ { + buf := new(bytes.Buffer) + w.Reset(buf) + want := gold[:len(gold)*i/n] + if _, err := w.Write(want); err != nil { + t.Errorf("#%d: Write: %v", i, err) + failed = true + continue + } + got, err := ioutil.ReadAll(NewReader(buf)) + if err != nil { + t.Errorf("#%d: ReadAll: %v", i, err) + failed = true + continue + } + gots = append(gots, got) + wants = append(wants, want) + } + if failed { + return + } + for i := range gots { + if err := cmp(gots[i], wants[i]); err != nil { + t.Errorf("#%d: %v", i, err) + } + } +} + +func benchDecode(b *testing.B, src []byte) { + encoded, err := Encode(nil, src) + if err != nil { + b.Fatal(err) + } + // Bandwidth is in amount of uncompressed data. + b.SetBytes(int64(len(src))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + Decode(src, encoded) + } +} + +func benchEncode(b *testing.B, src []byte) { + // Bandwidth is in amount of uncompressed data. + b.SetBytes(int64(len(src))) + dst := make([]byte, MaxEncodedLen(len(src))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + Encode(dst, src) + } +} + +func readFile(b testing.TB, filename string) []byte { + src, err := ioutil.ReadFile(filename) + if err != nil { + b.Fatalf("failed reading %s: %s", filename, err) + } + if len(src) == 0 { + b.Fatalf("%s has zero length", filename) + } + return src +} + +// expand returns a slice of length n containing repeated copies of src. +func expand(src []byte, n int) []byte { + dst := make([]byte, n) + for x := dst; len(x) > 0; { + i := copy(x, src) + x = x[i:] + } + return dst +} + +func benchWords(b *testing.B, n int, decode bool) { + // Note: the file is OS-language dependent so the resulting values are not + // directly comparable for non-US-English OS installations. + data := expand(readFile(b, "/usr/share/dict/words"), n) + if decode { + benchDecode(b, data) + } else { + benchEncode(b, data) + } +} + +func BenchmarkWordsDecode1e3(b *testing.B) { benchWords(b, 1e3, true) } +func BenchmarkWordsDecode1e4(b *testing.B) { benchWords(b, 1e4, true) } +func BenchmarkWordsDecode1e5(b *testing.B) { benchWords(b, 1e5, true) } +func BenchmarkWordsDecode1e6(b *testing.B) { benchWords(b, 1e6, true) } +func BenchmarkWordsEncode1e3(b *testing.B) { benchWords(b, 1e3, false) } +func BenchmarkWordsEncode1e4(b *testing.B) { benchWords(b, 1e4, false) } +func BenchmarkWordsEncode1e5(b *testing.B) { benchWords(b, 1e5, false) } +func BenchmarkWordsEncode1e6(b *testing.B) { benchWords(b, 1e6, false) } + +// testFiles' values are copied directly from +// https://raw.githubusercontent.com/google/snappy/master/snappy_unittest.cc +// The label field is unused in snappy-go. +var testFiles = []struct { + label string + filename string +}{ + {"html", "html"}, + {"urls", "urls.10K"}, + {"jpg", "fireworks.jpeg"}, + {"jpg_200", "fireworks.jpeg"}, + {"pdf", "paper-100k.pdf"}, + {"html4", "html_x_4"}, + {"txt1", "alice29.txt"}, + {"txt2", "asyoulik.txt"}, + {"txt3", "lcet10.txt"}, + {"txt4", "plrabn12.txt"}, + {"pb", "geo.protodata"}, + {"gaviota", "kppkn.gtb"}, +} + +// The test data files are present at this canonical URL. +const baseURL = "https://raw.githubusercontent.com/google/snappy/master/testdata/" + +func downloadTestdata(basename string) (errRet error) { + filename := filepath.Join(*testdata, basename) + if stat, err := os.Stat(filename); err == nil && stat.Size() != 0 { + return nil + } + + if !*download { + return fmt.Errorf("test data not found; skipping benchmark without the -download flag") + } + // Download the official snappy C++ implementation reference test data + // files for benchmarking. + if err := os.Mkdir(*testdata, 0777); err != nil && !os.IsExist(err) { + return fmt.Errorf("failed to create testdata: %s", err) + } + + f, err := os.Create(filename) + if err != nil { + return fmt.Errorf("failed to create %s: %s", filename, err) + } + defer f.Close() + defer func() { + if errRet != nil { + os.Remove(filename) + } + }() + url := baseURL + basename + resp, err := http.Get(url) + if err != nil { + return fmt.Errorf("failed to download %s: %s", url, err) + } + defer resp.Body.Close() + if s := resp.StatusCode; s != http.StatusOK { + return fmt.Errorf("downloading %s: HTTP status code %d (%s)", url, s, http.StatusText(s)) + } + _, err = io.Copy(f, resp.Body) + if err != nil { + return fmt.Errorf("failed to download %s to %s: %s", url, filename, err) + } + return nil +} + +func benchFile(b *testing.B, n int, decode bool) { + if err := downloadTestdata(testFiles[n].filename); err != nil { + b.Fatalf("failed to download testdata: %s", err) + } + data := readFile(b, filepath.Join(*testdata, testFiles[n].filename)) + if decode { + benchDecode(b, data) + } else { + benchEncode(b, data) + } +} + +// Naming convention is kept similar to what snappy's C++ implementation uses. +func Benchmark_UFlat0(b *testing.B) { benchFile(b, 0, true) } +func Benchmark_UFlat1(b *testing.B) { benchFile(b, 1, true) } +func Benchmark_UFlat2(b *testing.B) { benchFile(b, 2, true) } +func Benchmark_UFlat3(b *testing.B) { benchFile(b, 3, true) } +func Benchmark_UFlat4(b *testing.B) { benchFile(b, 4, true) } +func Benchmark_UFlat5(b *testing.B) { benchFile(b, 5, true) } +func Benchmark_UFlat6(b *testing.B) { benchFile(b, 6, true) } +func Benchmark_UFlat7(b *testing.B) { benchFile(b, 7, true) } +func Benchmark_UFlat8(b *testing.B) { benchFile(b, 8, true) } +func Benchmark_UFlat9(b *testing.B) { benchFile(b, 9, true) } +func Benchmark_UFlat10(b *testing.B) { benchFile(b, 10, true) } +func Benchmark_UFlat11(b *testing.B) { benchFile(b, 11, true) } +func Benchmark_ZFlat0(b *testing.B) { benchFile(b, 0, false) } +func Benchmark_ZFlat1(b *testing.B) { benchFile(b, 1, false) } +func Benchmark_ZFlat2(b *testing.B) { benchFile(b, 2, false) } +func Benchmark_ZFlat3(b *testing.B) { benchFile(b, 3, false) } +func Benchmark_ZFlat4(b *testing.B) { benchFile(b, 4, false) } +func Benchmark_ZFlat5(b *testing.B) { benchFile(b, 5, false) } +func Benchmark_ZFlat6(b *testing.B) { benchFile(b, 6, false) } +func Benchmark_ZFlat7(b *testing.B) { benchFile(b, 7, false) } +func Benchmark_ZFlat8(b *testing.B) { benchFile(b, 8, false) } +func Benchmark_ZFlat9(b *testing.B) { benchFile(b, 9, false) } +func Benchmark_ZFlat10(b *testing.B) { benchFile(b, 10, false) } +func Benchmark_ZFlat11(b *testing.B) { benchFile(b, 11, false) } -- cgit v1.2.3