aboutsummaryrefslogtreecommitdiffstats
path: root/trie/sync_bloom.go
diff options
context:
space:
mode:
authorPéter Szilágyi <peterke@gmail.com>2019-05-13 20:28:01 +0800
committerGitHub <noreply@github.com>2019-05-13 20:28:01 +0800
commit9effd642901e13765dcc1396392ba55a18f66ccf (patch)
tree57355cff7ea6c5efbb5702c6e62ec7b698d4ff15 /trie/sync_bloom.go
parent40cdcf8c47ff094775aca08fd5d94051f9cf1dbb (diff)
downloadgo-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar
go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar.gz
go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar.bz2
go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar.lz
go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar.xz
go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar.zst
go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.zip
core, eth, trie: bloom filter for trie node dedup during fast sync (#19489)
* core, eth, trie: bloom filter for trie node dedup during fast sync * eth/downloader, trie: address review comments * core, ethdb, trie: restart fast-sync bloom construction now and again * eth/downloader: initialize fast sync bloom on startup * eth: reenable eth/62 until we properly remove it
Diffstat (limited to 'trie/sync_bloom.go')
-rw-r--r--trie/sync_bloom.go207
1 files changed, 207 insertions, 0 deletions
diff --git a/trie/sync_bloom.go b/trie/sync_bloom.go
new file mode 100644
index 000000000..899a63add
--- /dev/null
+++ b/trie/sync_bloom.go
@@ -0,0 +1,207 @@
+// Copyright 2019 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+package trie
+
+import (
+ "encoding/binary"
+ "fmt"
+ "math"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "github.com/ethereum/go-ethereum/common"
+ "github.com/ethereum/go-ethereum/ethdb"
+ "github.com/ethereum/go-ethereum/log"
+ "github.com/ethereum/go-ethereum/metrics"
+ "github.com/steakknife/bloomfilter"
+)
+
+var (
+ bloomAddMeter = metrics.NewRegisteredMeter("trie/bloom/add", nil)
+ bloomLoadMeter = metrics.NewRegisteredMeter("trie/bloom/load", nil)
+ bloomTestMeter = metrics.NewRegisteredMeter("trie/bloom/test", nil)
+ bloomMissMeter = metrics.NewRegisteredMeter("trie/bloom/miss", nil)
+ bloomFaultMeter = metrics.NewRegisteredMeter("trie/bloom/fault", nil)
+ bloomErrorGauge = metrics.NewRegisteredGauge("trie/bloom/error", nil)
+)
+
+// syncBloomHasher is a wrapper around a byte blob to satisfy the interface API
+// requirements of the bloom library used. It's used to convert a trie hash into
+// a 64 bit mini hash.
+type syncBloomHasher []byte
+
+func (f syncBloomHasher) Write(p []byte) (n int, err error) { panic("not implemented") }
+func (f syncBloomHasher) Sum(b []byte) []byte { panic("not implemented") }
+func (f syncBloomHasher) Reset() { panic("not implemented") }
+func (f syncBloomHasher) BlockSize() int { panic("not implemented") }
+func (f syncBloomHasher) Size() int { return 8 }
+func (f syncBloomHasher) Sum64() uint64 { return binary.BigEndian.Uint64(f) }
+
+// SyncBloom is a bloom filter used during fast sync to quickly decide if a trie
+// node already exists on disk or not. It self populates from the provided disk
+// database on creation in a background thread and will only start returning live
+// results once that's finished.
+type SyncBloom struct {
+ bloom *bloomfilter.Filter
+ inited uint32
+ closer sync.Once
+ closed uint32
+ pend sync.WaitGroup
+}
+
+// NewSyncBloom creates a new bloom filter of the given size (in megabytes) and
+// initializes it from the database. The bloom is hard coded to use 3 filters.
+func NewSyncBloom(memory uint64, database ethdb.Iteratee) *SyncBloom {
+ // Create the bloom filter to track known trie nodes
+ bloom, err := bloomfilter.New(memory*1024*1024*8, 3)
+ if err != nil {
+ panic(fmt.Sprintf("failed to create bloom: %v", err)) // Can't happen, here for sanity
+ }
+ log.Info("Allocated fast sync bloom", "size", common.StorageSize(memory*1024*1024))
+
+ // Assemble the fast sync bloom and init it from previous sessions
+ b := &SyncBloom{
+ bloom: bloom,
+ }
+ b.pend.Add(2)
+ go func() {
+ defer b.pend.Done()
+ b.init(database)
+ }()
+ go func() {
+ defer b.pend.Done()
+ b.meter()
+ }()
+ return b
+}
+
+// init iterates over the database, pushing every trie hash into the bloom filter.
+func (b *SyncBloom) init(database ethdb.Iteratee) {
+ // Iterate over the database, but restart every now and again to avoid holding
+ // a persistent snapshot since fast sync can push a ton of data concurrently,
+ // bloating the disk.
+ //
+ // Note, this is fine, because everything inserted into leveldb by fast sync is
+ // also pushed into the bloom directly, so we're not missing anything when the
+ // iterator is swapped out for a new one.
+ it := database.NewIterator()
+
+ var (
+ start = time.Now()
+ swap = time.Now()
+ )
+ for it.Next() && atomic.LoadUint32(&b.closed) == 0 {
+ // If the database entry is a trie node, add it to the bloom
+ if key := it.Key(); len(key) == common.HashLength {
+ b.bloom.Add(syncBloomHasher(key))
+ bloomLoadMeter.Mark(1)
+ }
+ // If enough time elapsed since the last iterator swap, restart
+ if time.Since(swap) > 8*time.Second {
+ key := common.CopyBytes(it.Key())
+
+ it.Release()
+ it = database.NewIteratorWithStart(key)
+
+ log.Info("Initializing fast sync bloom", "items", b.bloom.N(), "errorrate", b.errorRate(), "elapsed", time.Since(start))
+ swap = time.Now()
+ }
+ }
+ it.Release()
+
+ // Mark the bloom filter inited and return
+ log.Info("Initialized fast sync bloom", "items", b.bloom.N(), "errorrate", b.errorRate(), "elapsed", time.Since(start))
+ atomic.StoreUint32(&b.inited, 1)
+}
+
+// meter periodically recalculates the false positive error rate of the bloom
+// filter and reports it in a metric.
+func (b *SyncBloom) meter() {
+ for {
+ // Report the current error ration. No floats, lame, scale it up.
+ bloomErrorGauge.Update(int64(b.errorRate() * 100000))
+
+ // Wait one second, but check termination more frequently
+ for i := 0; i < 10; i++ {
+ if atomic.LoadUint32(&b.closed) == 1 {
+ return
+ }
+ time.Sleep(100 * time.Millisecond)
+ }
+ }
+}
+
+// Close terminates any background initializer still running and releases all the
+// memory allocated for the bloom.
+func (b *SyncBloom) Close() error {
+ b.closer.Do(func() {
+ // Ensure the initializer is stopped
+ atomic.StoreUint32(&b.closed, 1)
+ b.pend.Wait()
+
+ // Wipe the bloom, but mark it "uninited" just in case someone attempts an access
+ log.Info("Deallocated fast sync bloom", "items", b.bloom.N(), "errorrate", b.errorRate())
+
+ atomic.StoreUint32(&b.inited, 0)
+ b.bloom = nil
+ })
+ return nil
+}
+
+// Add inserts a new trie node hash into the bloom filter.
+func (b *SyncBloom) Add(hash []byte) {
+ if atomic.LoadUint32(&b.closed) == 1 {
+ return
+ }
+ b.bloom.Add(syncBloomHasher(hash))
+ bloomAddMeter.Mark(1)
+}
+
+// Contains tests if the bloom filter contains the given hash:
+// - false: the bloom definitely does not contain hash
+// - true: the bloom maybe contains hash
+//
+// While the bloom is being initialized, any query will return true.
+func (b *SyncBloom) Contains(hash []byte) bool {
+ bloomTestMeter.Mark(1)
+ if atomic.LoadUint32(&b.inited) == 0 {
+ // We didn't load all the trie nodes from the previous run of Geth yet. As
+ // such, we can't say for sure if a hash is not present for anything. Until
+ // the init is done, we're faking "possible presence" for everything.
+ return true
+ }
+ // Bloom initialized, check the real one and report any successful misses
+ maybe := b.bloom.Contains(syncBloomHasher(hash))
+ if !maybe {
+ bloomMissMeter.Mark(1)
+ }
+ return maybe
+}
+
+// errorRate calculates the probability of a random containment test returning a
+// false positive.
+//
+// We're calculating it ourselves because the bloom library we used missed a
+// parentheses in the formula and calculates it wrong. And it's discontinued...
+func (b *SyncBloom) errorRate() float64 {
+ k := float64(b.bloom.K())
+ n := float64(b.bloom.N())
+ m := float64(b.bloom.M())
+
+ return math.Pow(1.0-math.Exp((-k)*(n+0.5)/(m-1)), k)
+}