diff options
author | Péter Szilágyi <peterke@gmail.com> | 2019-05-13 20:28:01 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-05-13 20:28:01 +0800 |
commit | 9effd642901e13765dcc1396392ba55a18f66ccf (patch) | |
tree | 57355cff7ea6c5efbb5702c6e62ec7b698d4ff15 /trie/sync_bloom.go | |
parent | 40cdcf8c47ff094775aca08fd5d94051f9cf1dbb (diff) | |
download | go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar.gz go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar.bz2 go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar.lz go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar.xz go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.tar.zst go-tangerine-9effd642901e13765dcc1396392ba55a18f66ccf.zip |
core, eth, trie: bloom filter for trie node dedup during fast sync (#19489)
* core, eth, trie: bloom filter for trie node dedup during fast sync
* eth/downloader, trie: address review comments
* core, ethdb, trie: restart fast-sync bloom construction now and again
* eth/downloader: initialize fast sync bloom on startup
* eth: reenable eth/62 until we properly remove it
Diffstat (limited to 'trie/sync_bloom.go')
-rw-r--r-- | trie/sync_bloom.go | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/trie/sync_bloom.go b/trie/sync_bloom.go new file mode 100644 index 000000000..899a63add --- /dev/null +++ b/trie/sync_bloom.go @@ -0,0 +1,207 @@ +// Copyright 2019 The go-ethereum Authors +// This file is part of the go-ethereum library. +// +// The go-ethereum library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The go-ethereum library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. + +package trie + +import ( + "encoding/binary" + "fmt" + "math" + "sync" + "sync/atomic" + "time" + + "github.com/ethereum/go-ethereum/common" + "github.com/ethereum/go-ethereum/ethdb" + "github.com/ethereum/go-ethereum/log" + "github.com/ethereum/go-ethereum/metrics" + "github.com/steakknife/bloomfilter" +) + +var ( + bloomAddMeter = metrics.NewRegisteredMeter("trie/bloom/add", nil) + bloomLoadMeter = metrics.NewRegisteredMeter("trie/bloom/load", nil) + bloomTestMeter = metrics.NewRegisteredMeter("trie/bloom/test", nil) + bloomMissMeter = metrics.NewRegisteredMeter("trie/bloom/miss", nil) + bloomFaultMeter = metrics.NewRegisteredMeter("trie/bloom/fault", nil) + bloomErrorGauge = metrics.NewRegisteredGauge("trie/bloom/error", nil) +) + +// syncBloomHasher is a wrapper around a byte blob to satisfy the interface API +// requirements of the bloom library used. It's used to convert a trie hash into +// a 64 bit mini hash. +type syncBloomHasher []byte + +func (f syncBloomHasher) Write(p []byte) (n int, err error) { panic("not implemented") } +func (f syncBloomHasher) Sum(b []byte) []byte { panic("not implemented") } +func (f syncBloomHasher) Reset() { panic("not implemented") } +func (f syncBloomHasher) BlockSize() int { panic("not implemented") } +func (f syncBloomHasher) Size() int { return 8 } +func (f syncBloomHasher) Sum64() uint64 { return binary.BigEndian.Uint64(f) } + +// SyncBloom is a bloom filter used during fast sync to quickly decide if a trie +// node already exists on disk or not. It self populates from the provided disk +// database on creation in a background thread and will only start returning live +// results once that's finished. +type SyncBloom struct { + bloom *bloomfilter.Filter + inited uint32 + closer sync.Once + closed uint32 + pend sync.WaitGroup +} + +// NewSyncBloom creates a new bloom filter of the given size (in megabytes) and +// initializes it from the database. The bloom is hard coded to use 3 filters. +func NewSyncBloom(memory uint64, database ethdb.Iteratee) *SyncBloom { + // Create the bloom filter to track known trie nodes + bloom, err := bloomfilter.New(memory*1024*1024*8, 3) + if err != nil { + panic(fmt.Sprintf("failed to create bloom: %v", err)) // Can't happen, here for sanity + } + log.Info("Allocated fast sync bloom", "size", common.StorageSize(memory*1024*1024)) + + // Assemble the fast sync bloom and init it from previous sessions + b := &SyncBloom{ + bloom: bloom, + } + b.pend.Add(2) + go func() { + defer b.pend.Done() + b.init(database) + }() + go func() { + defer b.pend.Done() + b.meter() + }() + return b +} + +// init iterates over the database, pushing every trie hash into the bloom filter. +func (b *SyncBloom) init(database ethdb.Iteratee) { + // Iterate over the database, but restart every now and again to avoid holding + // a persistent snapshot since fast sync can push a ton of data concurrently, + // bloating the disk. + // + // Note, this is fine, because everything inserted into leveldb by fast sync is + // also pushed into the bloom directly, so we're not missing anything when the + // iterator is swapped out for a new one. + it := database.NewIterator() + + var ( + start = time.Now() + swap = time.Now() + ) + for it.Next() && atomic.LoadUint32(&b.closed) == 0 { + // If the database entry is a trie node, add it to the bloom + if key := it.Key(); len(key) == common.HashLength { + b.bloom.Add(syncBloomHasher(key)) + bloomLoadMeter.Mark(1) + } + // If enough time elapsed since the last iterator swap, restart + if time.Since(swap) > 8*time.Second { + key := common.CopyBytes(it.Key()) + + it.Release() + it = database.NewIteratorWithStart(key) + + log.Info("Initializing fast sync bloom", "items", b.bloom.N(), "errorrate", b.errorRate(), "elapsed", time.Since(start)) + swap = time.Now() + } + } + it.Release() + + // Mark the bloom filter inited and return + log.Info("Initialized fast sync bloom", "items", b.bloom.N(), "errorrate", b.errorRate(), "elapsed", time.Since(start)) + atomic.StoreUint32(&b.inited, 1) +} + +// meter periodically recalculates the false positive error rate of the bloom +// filter and reports it in a metric. +func (b *SyncBloom) meter() { + for { + // Report the current error ration. No floats, lame, scale it up. + bloomErrorGauge.Update(int64(b.errorRate() * 100000)) + + // Wait one second, but check termination more frequently + for i := 0; i < 10; i++ { + if atomic.LoadUint32(&b.closed) == 1 { + return + } + time.Sleep(100 * time.Millisecond) + } + } +} + +// Close terminates any background initializer still running and releases all the +// memory allocated for the bloom. +func (b *SyncBloom) Close() error { + b.closer.Do(func() { + // Ensure the initializer is stopped + atomic.StoreUint32(&b.closed, 1) + b.pend.Wait() + + // Wipe the bloom, but mark it "uninited" just in case someone attempts an access + log.Info("Deallocated fast sync bloom", "items", b.bloom.N(), "errorrate", b.errorRate()) + + atomic.StoreUint32(&b.inited, 0) + b.bloom = nil + }) + return nil +} + +// Add inserts a new trie node hash into the bloom filter. +func (b *SyncBloom) Add(hash []byte) { + if atomic.LoadUint32(&b.closed) == 1 { + return + } + b.bloom.Add(syncBloomHasher(hash)) + bloomAddMeter.Mark(1) +} + +// Contains tests if the bloom filter contains the given hash: +// - false: the bloom definitely does not contain hash +// - true: the bloom maybe contains hash +// +// While the bloom is being initialized, any query will return true. +func (b *SyncBloom) Contains(hash []byte) bool { + bloomTestMeter.Mark(1) + if atomic.LoadUint32(&b.inited) == 0 { + // We didn't load all the trie nodes from the previous run of Geth yet. As + // such, we can't say for sure if a hash is not present for anything. Until + // the init is done, we're faking "possible presence" for everything. + return true + } + // Bloom initialized, check the real one and report any successful misses + maybe := b.bloom.Contains(syncBloomHasher(hash)) + if !maybe { + bloomMissMeter.Mark(1) + } + return maybe +} + +// errorRate calculates the probability of a random containment test returning a +// false positive. +// +// We're calculating it ourselves because the bloom library we used missed a +// parentheses in the formula and calculates it wrong. And it's discontinued... +func (b *SyncBloom) errorRate() float64 { + k := float64(b.bloom.K()) + n := float64(b.bloom.N()) + m := float64(b.bloom.M()) + + return math.Pow(1.0-math.Exp((-k)*(n+0.5)/(m-1)), k) +} |