author    Péter Szilágyi <peterke@gmail.com>  2018-06-21 17:28:05 +0800
committer Felix Lange <fjl@users.noreply.github.com>  2018-06-21 17:28:05 +0800
commit    d926bf2c7e3182d694c15829a37a0ca7331cd03c (patch)
tree      c2d3ddd85941a231fb05de46c36703273d11814a /trie
parent    8db8d074e2fff547e9d85169018e03f89b5975a1 (diff)
trie: cache collapsed trie nodes, not rlp blobs (#16876)
The current trie memory database/cache that we do pruning on stores trie nodes as binary RLP-encoded blobs, and also stores the node relationships/references for GC purposes. However, most trie nodes (everything apart from a value node) are in essence just collections of references. This PR switches out the RLP-encoded trie blobs for the collapsed-but-not-serialized trie nodes. This permits most of the references to be recovered from within the node data structure itself, avoiding the need to track them a second time (which is expensive memory-wise).
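To make the shape of the change concrete, here is a minimal, self-contained Go sketch of the two caching schemes. The types below (hash, shortNode, fullNode, oldEntry, newEntry, childHashes) are simplified stand-ins for illustration, not the actual go-ethereum definitions:

package main

import "fmt"

// Simplified stand-ins for the real trie types.
type hash [32]byte

type node interface{}

type shortNode struct {
	Key []byte
	Val node // either a child hash or a value
}

type fullNode [17]node // children are hashes, values or nil

// Old scheme: cache the RLP blob plus an explicit child map, tracking
// every reference a second time.
type oldEntry struct {
	blob     []byte
	children map[hash]int
}

// New scheme: cache the collapsed node itself; implicit children are
// recovered by walking it, so only external references need a map.
type newEntry struct {
	n        node
	children map[hash]uint16
}

// childHashes recovers the implicit references from the node structure.
func childHashes(n node, out *[]hash) {
	switch n := n.(type) {
	case *shortNode:
		childHashes(n.Val, out)
	case fullNode:
		for _, c := range n {
			childHashes(c, out)
		}
	case hash:
		*out = append(*out, n)
	}
}

func main() {
	child := hash{1}
	n := &shortNode{Key: []byte{1}, Val: child}

	var refs []hash
	childHashes(n, &refs)
	fmt.Printf("recovered %d reference(s) without a side map\n", len(refs))
}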
Diffstat (limited to 'trie')
-rw-r--r--  trie/database.go  274
-rw-r--r--  trie/hasher.go     30
-rw-r--r--  trie/node.go       15
-rw-r--r--  trie/trie.go        8

4 files changed, 260 insertions(+), 67 deletions(-)
diff --git a/trie/database.go b/trie/database.go
index 468c139df..88c6e9cd6 100644
--- a/trie/database.go
+++ b/trie/database.go
@@ -17,6 +17,8 @@
package trie
import (
+ "fmt"
+ "io"
"sync"
"time"
@@ -24,6 +26,7 @@ import (
"github.com/ethereum/go-ethereum/ethdb"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/metrics"
+ "github.com/ethereum/go-ethereum/rlp"
)
var (
@@ -82,25 +85,188 @@ type Database struct {
lock sync.RWMutex
}
+// rawNode is a simple binary blob used to differentiate between collapsed trie
+// nodes and already encoded RLP binary blobs (while at the same time storing
+// them in the same cache fields).
+type rawNode []byte
+
+func (n rawNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") }
+func (n rawNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") }
+func (n rawNode) fstring(ind string) string { panic("this should never end up in a live trie") }
+
+// rawFullNode represents only the useful data content of a full node, with the
+// caches and flags stripped out to minimize its data storage. This type honors
+// the same RLP encoding as the original parent.
+type rawFullNode [17]node
+
+func (n rawFullNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") }
+func (n rawFullNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") }
+func (n rawFullNode) fstring(ind string) string { panic("this should never end up in a live trie") }
+
+func (n rawFullNode) EncodeRLP(w io.Writer) error {
+ var nodes [17]node
+
+ for i, child := range n {
+ if child != nil {
+ nodes[i] = child
+ } else {
+ nodes[i] = nilValueNode
+ }
+ }
+ return rlp.Encode(w, nodes)
+}
+
+// rawShortNode represents only the useful data content of a short node, with the
+// caches and flags stripped out to minimize its data storage. This type honors
+// the same RLP encoding as the original parent.
+type rawShortNode struct {
+ Key []byte
+ Val node
+}
+
+func (n rawShortNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") }
+func (n rawShortNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") }
+func (n rawShortNode) fstring(ind string) string { panic("this should never end up in a live trie") }
+
// cachedNode is all the information we know about a single cached node in the
// memory database write layer.
type cachedNode struct {
- blob []byte // Cached data block of the trie node
- parents int // Number of live nodes referencing this one
- children map[common.Hash]int // Children referenced by this nodes
+ node node // Cached collapsed trie node, or raw rlp data
+ size uint16 // Byte size of the useful cached data
+
+ parents uint16 // Number of live nodes referencing this one
+ children map[common.Hash]uint16 // External children referenced by this node
flushPrev common.Hash // Previous node in the flush-list
flushNext common.Hash // Next node in the flush-list
}
+// rlp returns the raw rlp encoded blob of the cached node, either directly from
+// the cache, or by regenerating it from the collapsed node.
+func (n *cachedNode) rlp() []byte {
+ if node, ok := n.node.(rawNode); ok {
+ return node
+ }
+ blob, err := rlp.EncodeToBytes(n.node)
+ if err != nil {
+ panic(err)
+ }
+ return blob
+}
+
+// obj returns the decoded and expanded trie node, either directly from the cache,
+// or by regenerating it from the rlp encoded blob.
+func (n *cachedNode) obj(hash common.Hash, cachegen uint16) node {
+ if node, ok := n.node.(rawNode); ok {
+ return mustDecodeNode(hash[:], node, cachegen)
+ }
+ return expandNode(hash[:], n.node, cachegen)
+}
+
+// childs returns all the tracked children of this node, both the implicit ones
+// from inside the node as well as the explicit ones from outside the node.
+func (n *cachedNode) childs() []common.Hash {
+ children := make([]common.Hash, 0, 16)
+ for child := range n.children {
+ children = append(children, child)
+ }
+ if _, ok := n.node.(rawNode); !ok {
+ gatherChildren(n.node, &children)
+ }
+ return children
+}
+
+// gatherChildren traverses the node hierarchy of a collapsed storage node and
+// retrieves all the hashnode children.
+func gatherChildren(n node, children *[]common.Hash) {
+ switch n := n.(type) {
+ case *rawShortNode:
+ gatherChildren(n.Val, children)
+
+ case rawFullNode:
+ for i := 0; i < 16; i++ {
+ gatherChildren(n[i], children)
+ }
+ case hashNode:
+ *children = append(*children, common.BytesToHash(n))
+
+ case valueNode, nil:
+
+ default:
+ panic(fmt.Sprintf("unknown node type: %T", n))
+ }
+}
+
+// simplifyNode traverses the hierarchy of an expanded memory node and discards
+// all the internal caches, returning a node that only contains the raw data.
+func simplifyNode(n node) node {
+ switch n := n.(type) {
+ case *shortNode:
+ // Short nodes discard the flags and cascade
+ return &rawShortNode{Key: n.Key, Val: simplifyNode(n.Val)}
+
+ case *fullNode:
+ // Full nodes discard the flags and cascade
+ node := rawFullNode(n.Children)
+ for i := 0; i < len(node); i++ {
+ if node[i] != nil {
+ node[i] = simplifyNode(node[i])
+ }
+ }
+ return node
+
+ case valueNode, hashNode, rawNode:
+ return n
+
+ default:
+ panic(fmt.Sprintf("unknown node type: %T", n))
+ }
+}
+
+// expandNode traverses the node hierarchy of a collapsed storage node and converts
+// all fields and keys into expanded memory form.
+func expandNode(hash hashNode, n node, cachegen uint16) node {
+ switch n := n.(type) {
+ case *rawShortNode:
+ // Short nodes need key and child expansion
+ return &shortNode{
+ Key: compactToHex(n.Key),
+ Val: expandNode(nil, n.Val, cachegen),
+ flags: nodeFlag{
+ hash: hash,
+ gen: cachegen,
+ },
+ }
+
+ case rawFullNode:
+ // Full nodes need child expansion
+ node := &fullNode{
+ flags: nodeFlag{
+ hash: hash,
+ gen: cachegen,
+ },
+ }
+ for i := 0; i < len(node.Children); i++ {
+ if n[i] != nil {
+ node.Children[i] = expandNode(nil, n[i], cachegen)
+ }
+ }
+ return node
+
+ case valueNode, hashNode:
+ return n
+
+ default:
+ panic(fmt.Sprintf("unknown node type: %T", n))
+ }
+}
+
// NewDatabase creates a new trie database to store ephemeral trie content before
// it's written out to disk or garbage collected.
func NewDatabase(diskdb ethdb.Database) *Database {
return &Database{
- diskdb: diskdb,
- nodes: map[common.Hash]*cachedNode{
- {}: {children: make(map[common.Hash]int)},
- },
+ diskdb: diskdb,
+ nodes: map[common.Hash]*cachedNode{{}: {}},
preimages: make(map[common.Hash][]byte),
}
}
@@ -110,33 +276,46 @@ func (db *Database) DiskDB() DatabaseReader {
return db.diskdb
}
-// Insert writes a new trie node to the memory database if it's yet unknown. The
-// method will make a copy of the slice.
-func (db *Database) Insert(hash common.Hash, blob []byte) {
+// InsertBlob writes a new reference-tracked blob to the memory database if it's
+// yet unknown. This method should only be used for non-trie nodes that require
+// reference counting, since trie nodes are garbage collected directly through
+// their embedded children.
+func (db *Database) InsertBlob(hash common.Hash, blob []byte) {
db.lock.Lock()
defer db.lock.Unlock()
- db.insert(hash, blob)
+ db.insert(hash, blob, rawNode(blob))
}
-// insert is the private locked version of Insert.
-func (db *Database) insert(hash common.Hash, blob []byte) {
+// insert inserts a collapsed trie node into the memory database. This method is
+// a more generic version of InsertBlob, supporting both raw blob insertions and
+// trie node insertions. The blob must always be specified to allow proper
+// size tracking.
+func (db *Database) insert(hash common.Hash, blob []byte, node node) {
// If the node's already cached, skip
if _, ok := db.nodes[hash]; ok {
return
}
- db.nodes[hash] = &cachedNode{
- blob: common.CopyBytes(blob),
- children: make(map[common.Hash]int),
+ // Create the cached entry for this node
+ entry := &cachedNode{
+ node: simplifyNode(node),
+ size: uint16(len(blob)),
flushPrev: db.newest,
}
+ for _, child := range entry.childs() {
+ if c := db.nodes[child]; c != nil {
+ c.parents++
+ }
+ }
+ db.nodes[hash] = entry
+
// Update the flush-list endpoints
if db.oldest == (common.Hash{}) {
db.oldest, db.newest = hash, hash
} else {
db.nodes[db.newest].flushNext, db.newest = hash, hash
}
- db.nodesSize += common.StorageSize(common.HashLength + len(blob))
+ db.nodesSize += common.StorageSize(common.HashLength + entry.size)
}
// insertPreimage writes a new trie node pre-image to the memory database if it's
@@ -151,8 +330,27 @@ func (db *Database) insertPreimage(hash common.Hash, preimage []byte) {
db.preimagesSize += common.StorageSize(common.HashLength + len(preimage))
}
-// Node retrieves a cached trie node from memory. If it cannot be found cached,
-// the method queries the persistent database for the content.
+// node retrieves a cached trie node from memory, or returns nil if none can be
+// found in the memory cache.
+func (db *Database) node(hash common.Hash, cachegen uint16) node {
+ // Retrieve the node from cache if available
+ db.lock.RLock()
+ node := db.nodes[hash]
+ db.lock.RUnlock()
+
+ if node != nil {
+ return node.obj(hash, cachegen)
+ }
+ // Content unavailable in memory, attempt to retrieve from disk
+ enc, err := db.diskdb.Get(hash[:])
+ if err != nil || enc == nil {
+ return nil
+ }
+ return mustDecodeNode(hash[:], enc, cachegen)
+}
+
+// Node retrieves an encoded cached trie node from memory. If it cannot be found
+// cached, the method queries the persistent database for the content.
func (db *Database) Node(hash common.Hash) ([]byte, error) {
// Retrieve the node from cache if available
db.lock.RLock()
@@ -160,7 +358,7 @@ func (db *Database) Node(hash common.Hash) ([]byte, error) {
db.lock.RUnlock()
if node != nil {
- return node.blob, nil
+ return node.rlp(), nil
}
// Content unavailable in memory, attempt to retrieve from disk
return db.diskdb.Get(hash[:])
@@ -222,20 +420,22 @@ func (db *Database) reference(child common.Hash, parent common.Hash) {
return
}
// If the reference already exists, only duplicate for roots
- if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) {
+ if db.nodes[parent].children == nil {
+ db.nodes[parent].children = make(map[common.Hash]uint16)
+ } else if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) {
return
}
node.parents++
db.nodes[parent].children[child]++
}
-// Dereference removes an existing reference from a parent node to a child node.
-func (db *Database) Dereference(child common.Hash, parent common.Hash) {
+// Dereference removes an existing reference from a root node.
+func (db *Database) Dereference(root common.Hash) {
db.lock.Lock()
defer db.lock.Unlock()
nodes, storage, start := len(db.nodes), db.nodesSize, time.Now()
- db.dereference(child, parent)
+ db.dereference(root, common.Hash{})
db.gcnodes += uint64(nodes - len(db.nodes))
db.gcsize += storage - db.nodesSize
@@ -254,9 +454,11 @@ func (db *Database) dereference(child common.Hash, parent common.Hash) {
// Dereference the parent-child
node := db.nodes[parent]
- node.children[child]--
- if node.children[child] == 0 {
- delete(node.children, child)
+ if node.children != nil && node.children[child] > 0 {
+ node.children[child]--
+ if node.children[child] == 0 {
+ delete(node.children, child)
+ }
}
// If the child does not exist, it's a previously committed node.
node, ok := db.nodes[child]
@@ -274,11 +476,11 @@ func (db *Database) dereference(child common.Hash, parent common.Hash) {
db.nodes[node.flushNext].flushPrev = node.flushPrev
}
// Dereference all children and delete the node
- for hash := range node.children {
+ for _, hash := range node.childs() {
db.dereference(hash, child)
}
delete(db.nodes, child)
- db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
+ db.nodesSize -= common.StorageSize(common.HashLength + int(node.size))
}
}
@@ -323,7 +525,7 @@ func (db *Database) Cap(limit common.StorageSize) error {
for size > limit && oldest != (common.Hash{}) {
// Fetch the oldest referenced node and push into the batch
node := db.nodes[oldest]
- if err := batch.Put(oldest[:], node.blob); err != nil {
+ if err := batch.Put(oldest[:], node.rlp()); err != nil {
db.lock.RUnlock()
return err
}
@@ -340,7 +542,7 @@ func (db *Database) Cap(limit common.StorageSize) error {
// is the total size, including both the useful cached data (hash -> blob), as
// well as the flushlist metadata (2*hash). When flushing items from the cache,
// we need to reduce both.
- size -= common.StorageSize(3*common.HashLength + len(node.blob))
+ size -= common.StorageSize(3*common.HashLength + int(node.size))
oldest = node.flushNext
}
// Flush out any remainder data from the last batch
@@ -364,7 +566,7 @@ func (db *Database) Cap(limit common.StorageSize) error {
delete(db.nodes, db.oldest)
db.oldest = node.flushNext
- db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
+ db.nodesSize -= common.StorageSize(common.HashLength + int(node.size))
}
if db.oldest != (common.Hash{}) {
db.nodes[db.oldest].flushPrev = common.Hash{}
@@ -460,12 +662,12 @@ func (db *Database) commit(hash common.Hash, batch ethdb.Batch) error {
if !ok {
return nil
}
- for child := range node.children {
+ for _, child := range node.childs() {
if err := db.commit(child, batch); err != nil {
return err
}
}
- if err := batch.Put(hash[:], node.blob); err != nil {
+ if err := batch.Put(hash[:], node.rlp()); err != nil {
return err
}
// If we've reached an optimal batch size, commit and start over
@@ -496,11 +698,11 @@ func (db *Database) uncache(hash common.Hash) {
db.nodes[node.flushNext].flushPrev = node.flushPrev
}
// Uncache the node's subtries and remove the node itself too
- for child := range node.children {
+ for _, child := range node.childs() {
db.uncache(child)
}
delete(db.nodes, hash)
- db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
+ db.nodesSize -= common.StorageSize(common.HashLength + int(node.size))
}
// Size returns the current storage size of the memory cache in front of the
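One consequence visible in the database.go hunks above is the API change from Dereference(child, parent) to Dereference(root): callers now drop a whole stale trie from its root alone, and the children are found by walking the cached collapsed nodes. A hedged usage sketch follows; Prune, db and root are illustrative assumptions, not code from this change:

package prune

import (
	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/trie"
)

// Prune drops a state root that is no longer needed from the in-memory
// trie cache. Before this change the equivalent call was the two-argument
// form db.Dereference(root, common.Hash{}); the parent side is now implicit.
func Prune(db *trie.Database, root common.Hash) {
	db.Dereference(root)
}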
diff --git a/trie/hasher.go b/trie/hasher.go
index 47c6dd8f9..7b1d7793f 100644
--- a/trie/hasher.go
+++ b/trie/hasher.go
@@ -137,9 +137,6 @@ func (h *hasher) hashChildren(original node, db *Database) (node, node, error) {
return original, original, err
}
}
- if collapsed.Val == nil {
- collapsed.Val = valueNode(nil) // Ensure that nil children are encoded as empty strings.
- }
return collapsed, cached, nil
case *fullNode:
@@ -152,14 +149,9 @@ func (h *hasher) hashChildren(original node, db *Database) (node, node, error) {
if err != nil {
return original, original, err
}
- } else {
- collapsed.Children[i] = valueNode(nil) // Ensure that nil children are encoded as empty strings.
}
}
cached.Children[16] = n.Children[16]
- if collapsed.Children[16] == nil {
- collapsed.Children[16] = valueNode(nil)
- }
return collapsed, cached, nil
default:
@@ -192,34 +184,22 @@ func (h *hasher) store(n node, db *Database, force bool) (node, error) {
if db != nil {
// We are pooling the trie nodes into an intermediate memory cache
- db.lock.Lock()
hash := common.BytesToHash(hash)
- db.insert(hash, h.tmp)
- // Track all direct parent->child node references
- switch n := n.(type) {
- case *shortNode:
- if child, ok := n.Val.(hashNode); ok {
- db.reference(common.BytesToHash(child), hash)
- }
- case *fullNode:
- for i := 0; i < 16; i++ {
- if child, ok := n.Children[i].(hashNode); ok {
- db.reference(common.BytesToHash(child), hash)
- }
- }
- }
+
+ db.lock.Lock()
+ db.insert(hash, h.tmp, n)
db.lock.Unlock()
// Track external references from account->storage trie
if h.onleaf != nil {
switch n := n.(type) {
case *shortNode:
- if child, ok := n.Val.(valueNode); ok && child != nil {
+ if child, ok := n.Val.(valueNode); ok {
h.onleaf(child, hash)
}
case *fullNode:
for i := 0; i < 16; i++ {
- if child, ok := n.Children[i].(valueNode); ok && child != nil {
+ if child, ok := n.Children[i].(valueNode); ok {
h.onleaf(child, hash)
}
}
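Dropping the child != nil guards above is safe because, with the valueNode(nil) placeholders gone from collapsed nodes, an absent child is now a nil interface and simply fails the type assertion. A small standalone demonstration of that Go subtlety; the local node and valueNode types here are stand-ins:

package main

import "fmt"

type node interface{}
type valueNode []byte

func main() {
	// A nil-valued valueNode still carries a concrete type, so the
	// assertion succeeds; this is why the old guard was needed.
	var n node = valueNode(nil)
	child, ok := n.(valueNode)
	fmt.Println(ok, child == nil) // true true

	// A nil interface has no concrete type, so the assertion fails
	// and the onleaf callback is never reached for absent children.
	var m node
	_, ok = m.(valueNode)
	fmt.Println(ok) // false
}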
diff --git a/trie/node.go b/trie/node.go
index 02815042c..a06f1b389 100644
--- a/trie/node.go
+++ b/trie/node.go
@@ -47,9 +47,22 @@ type (
valueNode []byte
)
+// nilValueNode is used when collapsing internal trie nodes for hashing, since
+// unset children need to serialize correctly.
+var nilValueNode = valueNode(nil)
+
// EncodeRLP encodes a full node into the consensus RLP format.
func (n *fullNode) EncodeRLP(w io.Writer) error {
- return rlp.Encode(w, n.Children)
+ var nodes [17]node
+
+ for i, child := range n.Children {
+ if child != nil {
+ nodes[i] = child
+ } else {
+ nodes[i] = nilValueNode
+ }
+ }
+ return rlp.Encode(w, nodes)
}
func (n *fullNode) copy() *fullNode { copy := *n; return &copy }
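The nilValueNode substitution works because RLP encodes a nil byte slice as the empty string 0x80, which is the consensus representation of an absent child. A quick standalone check, assuming the go-ethereum rlp package:

package main

import (
	"fmt"

	"github.com/ethereum/go-ethereum/rlp"
)

func main() {
	// An absent child must occupy its slot as the RLP empty string.
	// Encoding a nil byte slice (which is what valueNode(nil) is)
	// yields exactly that: 0x80.
	enc, err := rlp.EncodeToBytes([]byte(nil))
	if err != nil {
		panic(err)
	}
	fmt.Printf("%x\n", enc) // 80
}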
diff --git a/trie/trie.go b/trie/trie.go
index 30543c549..4284e30ad 100644
--- a/trie/trie.go
+++ b/trie/trie.go
@@ -433,12 +433,10 @@ func (t *Trie) resolveHash(n hashNode, prefix []byte) (node, error) {
cacheMissCounter.Inc(1)
hash := common.BytesToHash(n)
-
- enc, err := t.db.Node(hash)
- if err != nil || enc == nil {
- return nil, &MissingNodeError{NodeHash: hash, Path: prefix}
+ if node := t.db.node(hash, t.cachegen); node != nil {
+ return node, nil
}
- return mustDecodeNode(n, enc, t.cachegen), nil
+ return nil, &MissingNodeError{NodeHash: hash, Path: prefix}
}
// Root returns the root hash of the trie.