From 55599ee95d4151a2502465e0afc7c47bd1acba77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= <peterke@gmail.com>
Date: Mon, 5 Feb 2018 18:40:32 +0200
Subject: core, trie: intermediate mempool between trie and database (#15857)

This commit reduces database I/O by not writing every state trie to disk.
---
 trie/database.go | 356 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 356 insertions(+)
 create mode 100644 trie/database.go
(limited to 'trie/database.go')

diff --git a/trie/database.go b/trie/database.go
new file mode 100644
index 000000000..d79120813
--- /dev/null
+++ b/trie/database.go
@@ -0,0 +1,356 @@
+// Copyright 2017 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+package trie
+
+import (
+    "sync"
+    "time"
+
+    "github.com/ethereum/go-ethereum/common"
+    "github.com/ethereum/go-ethereum/ethdb"
+    "github.com/ethereum/go-ethereum/log"
+)
+
+// secureKeyPrefix is the database key prefix used to store trie node preimages.
+var secureKeyPrefix = []byte("secure-key-")
+
+// secureKeyLength is the length of the above prefix + a 32 byte hash.
+const secureKeyLength = 11 + 32
+
+// DatabaseReader wraps the Get and Has methods of a backing store for the trie.
+type DatabaseReader interface {
+    // Get retrieves the value associated with key from the database.
+    Get(key []byte) (value []byte, err error)
+
+    // Has retrieves whether a key is present in the database.
+    Has(key []byte) (bool, error)
+}
+
+// Database is an intermediate write layer between the trie data structures and
+// the disk database. The aim is to accumulate trie writes in-memory and only
+// periodically flush a couple of tries to disk, garbage collecting the remainder.
+type Database struct {
+    diskdb ethdb.Database // Persistent storage for matured trie nodes
+
+    nodes     map[common.Hash]*cachedNode // Data and reference relationships of a node
+    preimages map[common.Hash][]byte      // Preimages of nodes from the secure trie
+    seckeybuf [secureKeyLength]byte       // Ephemeral buffer for calculating preimage keys
+
+    gctime  time.Duration      // Time spent on garbage collection since last commit
+    gcnodes uint64             // Nodes garbage collected since last commit
+    gcsize  common.StorageSize // Data storage garbage collected since last commit
+
+    nodesSize     common.StorageSize // Storage size of the nodes cache
+    preimagesSize common.StorageSize // Storage size of the preimages cache
+
+    lock sync.RWMutex
+}
+
+// cachedNode is all the information we know about a single cached node in the
+// memory database write layer.
+type cachedNode struct {
+    blob     []byte              // Cached data block of the trie node
+    parents  int                 // Number of live nodes referencing this one
+    children map[common.Hash]int // Children referenced by this node
+}
+
+// NewDatabase creates a new trie database to store ephemeral trie content before
+// it's written out to disk or garbage collected.
+func NewDatabase(diskdb ethdb.Database) *Database {
+    return &Database{
+        diskdb: diskdb,
+        nodes: map[common.Hash]*cachedNode{
+            {}: {children: make(map[common.Hash]int)},
+        },
+        preimages: make(map[common.Hash][]byte),
+    }
+}
+
+// DiskDB retrieves the persistent storage backing the trie database.
+func (db *Database) DiskDB() DatabaseReader {
+    return db.diskdb
+}
+
+// Insert writes a new trie node to the memory database if it's not yet known.
+// The method will make a copy of the slice.
+func (db *Database) Insert(hash common.Hash, blob []byte) {
+    db.lock.Lock()
+    defer db.lock.Unlock()
+
+    db.insert(hash, blob)
+}
+
+// insert is the private locked version of Insert.
+func (db *Database) insert(hash common.Hash, blob []byte) {
+    if _, ok := db.nodes[hash]; ok {
+        return
+    }
+    db.nodes[hash] = &cachedNode{
+        blob:     common.CopyBytes(blob),
+        children: make(map[common.Hash]int),
+    }
+    db.nodesSize += common.StorageSize(common.HashLength + len(blob))
+}
+
+// insertPreimage writes a new trie node pre-image to the memory database if it's
+// not yet known. The method will make a copy of the slice.
+//
+// Note, this method assumes that the database's lock is held!
+func (db *Database) insertPreimage(hash common.Hash, preimage []byte) {
+    if _, ok := db.preimages[hash]; ok {
+        return
+    }
+    db.preimages[hash] = common.CopyBytes(preimage)
+    db.preimagesSize += common.StorageSize(common.HashLength + len(preimage))
+}
+
+// Node retrieves a cached trie node from memory. If it cannot be found cached,
+// the method queries the persistent database for the content.
+func (db *Database) Node(hash common.Hash) ([]byte, error) {
+    // Retrieve the node from cache if available
+    db.lock.RLock()
+    node := db.nodes[hash]
+    db.lock.RUnlock()
+
+    if node != nil {
+        return node.blob, nil
+    }
+    // Content unavailable in memory, attempt to retrieve from disk
+    return db.diskdb.Get(hash[:])
+}
+
+// preimage retrieves a cached trie node pre-image from memory. If it cannot be
+// found cached, the method queries the persistent database for the content.
+func (db *Database) preimage(hash common.Hash) ([]byte, error) {
+    // Retrieve the preimage from cache if available
+    db.lock.RLock()
+    preimage := db.preimages[hash]
+    db.lock.RUnlock()
+
+    if preimage != nil {
+        return preimage, nil
+    }
+    // Content unavailable in memory, attempt to retrieve from disk
+    return db.diskdb.Get(db.secureKey(hash[:]))
+}
+
+// secureKey returns the database key for the preimage of key, as an ephemeral
+// buffer. The caller must not hold onto the return value because it will become
+// invalid on the next call.
+func (db *Database) secureKey(key []byte) []byte {
+    buf := append(db.seckeybuf[:0], secureKeyPrefix...)
+    buf = append(buf, key...)
+    return buf
+}
+
+// Nodes retrieves the hashes of all the nodes cached within the memory database.
+// This method is extremely expensive and should only be used to validate internal
+// states in test code.
+func (db *Database) Nodes() []common.Hash {
+    db.lock.RLock()
+    defer db.lock.RUnlock()
+
+    var hashes = make([]common.Hash, 0, len(db.nodes))
+    for hash := range db.nodes {
+        if hash != (common.Hash{}) { // Special case for "root" references/nodes
+            hashes = append(hashes, hash)
+        }
+    }
+    return hashes
+}
+
+// Reference adds a new reference from a parent node to a child node.
+func (db *Database) Reference(child common.Hash, parent common.Hash) {
+    db.lock.Lock()
+    defer db.lock.Unlock()
+
+    db.reference(child, parent)
+}
+
+// reference is the private locked version of Reference.
+func (db *Database) reference(child common.Hash, parent common.Hash) {
+    // If the node does not exist, it's a node pulled from disk, skip
+    node, ok := db.nodes[child]
+    if !ok {
+        return
+    }
+    // If the reference already exists, only duplicate for roots
+    if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) {
+        return
+    }
+    node.parents++
+    db.nodes[parent].children[child]++
+}
+
+// Dereference removes an existing reference from a parent node to a child node.
+func (db *Database) Dereference(child common.Hash, parent common.Hash) {
+    db.lock.Lock()
+    defer db.lock.Unlock()
+
+    nodes, storage, start := len(db.nodes), db.nodesSize, time.Now()
+    db.dereference(child, parent)
+
+    db.gcnodes += uint64(nodes - len(db.nodes))
+    db.gcsize += storage - db.nodesSize
+    db.gctime += time.Since(start)
+
+    log.Debug("Dereferenced trie from memory database", "nodes", nodes-len(db.nodes), "size", storage-db.nodesSize, "time", time.Since(start),
+        "gcnodes", db.gcnodes, "gcsize", db.gcsize, "gctime", db.gctime, "livenodes", len(db.nodes), "livesize", db.nodesSize)
+}
+
+// dereference is the private locked version of Dereference.
+func (db *Database) dereference(child common.Hash, parent common.Hash) {
+    // Dereference the parent-child relationship
+    node := db.nodes[parent]
+
+    node.children[child]--
+    if node.children[child] == 0 {
+        delete(node.children, child)
+    }
+    // If the child does not exist, it's a previously committed node.
+    node, ok := db.nodes[child]
+    if !ok {
+        return
+    }
+    // If there are no more references to the child, delete it and cascade
+    node.parents--
+    if node.parents == 0 {
+        for hash := range node.children {
+            db.dereference(hash, child)
+        }
+        delete(db.nodes, child)
+        db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
+    }
+}
+
+// Commit iterates over all the children of a particular node, writes them out
+// to disk, forcefully tearing down all references in both directions.
+//
+// As a side effect, all pre-images accumulated up to this point are also written.
+func (db *Database) Commit(node common.Hash, report bool) error {
+    // Create a database batch to flush persistent data out. It is important that
+    // outside code doesn't see an inconsistent state (referenced data removed from
+    // memory cache during commit but not yet in persistent storage). This is ensured
+    // by only uncaching existing data when the database write finalizes.
+    db.lock.RLock()
+
+    start := time.Now()
+    batch := db.diskdb.NewBatch()
+
+    // Move all of the accumulated preimages into a write batch
+    for hash, preimage := range db.preimages {
+        if err := batch.Put(db.secureKey(hash[:]), preimage); err != nil {
+            log.Error("Failed to commit preimage from trie database", "err", err)
+            db.lock.RUnlock()
+            return err
+        }
+        if batch.ValueSize() > ethdb.IdealBatchSize {
+            if err := batch.Write(); err != nil {
+                db.lock.RUnlock()
+                return err
+            }
+            batch.Reset()
+        }
+    }
+    // Move the trie itself into the batch, flushing if enough data is accumulated
+    nodes, storage := len(db.nodes), db.nodesSize+db.preimagesSize
+    if err := db.commit(node, batch); err != nil {
+        log.Error("Failed to commit trie from trie database", "err", err)
+        db.lock.RUnlock()
+        return err
+    }
+    // Write batch ready, unlock for readers during persistence
+    if err := batch.Write(); err != nil {
+        log.Error("Failed to write trie to disk", "err", err)
+        db.lock.RUnlock()
+        return err
+    }
+    db.lock.RUnlock()
+
+    // Write successful, clear out the flushed data
+    db.lock.Lock()
+    defer db.lock.Unlock()
+
+    db.preimages = make(map[common.Hash][]byte)
+    db.preimagesSize = 0
+
+    db.uncache(node)
+
+    logger := log.Info
+    if !report {
+        logger = log.Debug
+    }
+    logger("Persisted trie from memory database", "nodes", nodes-len(db.nodes), "size", storage-db.nodesSize, "time", time.Since(start),
+        "gcnodes", db.gcnodes, "gcsize", db.gcsize, "gctime", db.gctime, "livenodes", len(db.nodes), "livesize", db.nodesSize)
+
+    // Reset the garbage collection statistics
+    db.gcnodes, db.gcsize, db.gctime = 0, 0, 0
+
+    return nil
+}
+
+// commit is the private locked version of Commit.
+func (db *Database) commit(hash common.Hash, batch ethdb.Batch) error {
+    // If the node does not exist, it's a previously committed node
+    node, ok := db.nodes[hash]
+    if !ok {
+        return nil
+    }
+    for child := range node.children {
+        if err := db.commit(child, batch); err != nil {
+            return err
+        }
+    }
+    if err := batch.Put(hash[:], node.blob); err != nil {
+        return err
+    }
+    // If we've reached an optimal batch size, commit and start over
+    if batch.ValueSize() >= ethdb.IdealBatchSize {
+        if err := batch.Write(); err != nil {
+            return err
+        }
+        batch.Reset()
+    }
+    return nil
+}
+
+// uncache is the post-processing step of a commit operation where the already
+// persisted trie is removed from the cache. The reason behind the two-phase
+// commit is to ensure consistent data availability while moving from memory
+// to disk.
+func (db *Database) uncache(hash common.Hash) {
+    // If the node does not exist, we're done on this path
+    node, ok := db.nodes[hash]
+    if !ok {
+        return
+    }
+    // Otherwise uncache the node's subtries and remove the node itself too
+    for child := range node.children {
+        db.uncache(child)
+    }
+    delete(db.nodes, hash)
+    db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
+}
+
+// Size returns the current storage size of the memory cache in front of the
+// persistent database layer.
+func (db *Database) Size() common.StorageSize {
+    db.lock.RLock()
+    defer db.lock.RUnlock()
+
+    return db.nodesSize + db.preimagesSize
+}
--
cgit v1.2.3
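To make the intended call pattern concrete, below is a minimal sketch of how a caller might drive the new write layer. It is illustrative only, not code from the commit: the package and function names, the node hashes, and the RLP blobs are invented, and the backing ethdb.Database is assumed to be supplied by the caller (for example an in-memory database in tests). In the real integration it is the trie hasher and the state database that invoke Insert and Reference.

package triedemo

import (
	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/ethdb"
	"github.com/ethereum/go-ethereum/trie"
)

// flushDemoTrie sketches the write path introduced by this patch: nodes are
// inserted into the memory layer, wired together via reference counts, and
// only touch disk when Commit is called (or never, if dereferenced first).
func flushDemoTrie(diskdb ethdb.Database) error {
	triedb := trie.NewDatabase(diskdb)

	// Two made-up nodes standing in for real RLP-encoded trie nodes (in
	// practice the hash would be the keccak of the blob).
	childHash := common.BytesToHash([]byte("child"))
	rootHash := common.BytesToHash([]byte("root"))

	triedb.Insert(childHash, []byte("child-node-rlp")) // memory only, no disk I/O yet
	triedb.Insert(rootHash, []byte("root-node-rlp"))

	// Build the reference graph used for garbage collection: the root keeps
	// the child alive, and the metaroot (the zero hash) keeps the root alive.
	triedb.Reference(childHash, rootHash)
	triedb.Reference(rootHash, common.Hash{})

	// A trie that is no longer needed could instead be garbage collected
	// without ever touching disk:
	//
	//	triedb.Dereference(rootHash, common.Hash{})

	// Persist the root and everything reachable from it, plus any accumulated
	// preimages, to disk in one batch, then drop them from the memory cache.
	return triedb.Commit(rootHash, true)
}

Note that Commit is the only call above that performs disk I/O; everything before it is cheap in-memory bookkeeping, which is the entire point of the intermediate layer.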