author    Péter Szilágyi <peterke@gmail.com>  2018-06-21 17:28:05 +0800
committer Felix Lange <fjl@users.noreply.github.com>  2018-06-21 17:28:05 +0800
commit    d926bf2c7e3182d694c15829a37a0ca7331cd03c (patch)
tree      c2d3ddd85941a231fb05de46c36703273d11814a /trie
parent    8db8d074e2fff547e9d85169018e03f89b5975a1 (diff)
trie: cache collapsed trie nodes, not rlp blobs (#16876)
The current trie memory database/cache that we do pruning on stores trie nodes as binary RLP-encoded blobs, and also stores the node relationships/references for GC purposes. However, most trie nodes (everything apart from a value node) are in essence just collections of references. This PR switches out the RLP-encoded trie blobs for the collapsed-but-not-serialized trie nodes. This permits most of the references to be recovered from within the node data structure itself, avoiding the need to track them a second time (which is expensive memory-wise).
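To make the shape of the change concrete, here is a minimal, self-contained Go sketch of the two caching schemes. The types below (hash, shortNode, fullNode, oldEntry, newEntry, childHashes) are simplified stand-ins for illustration, not the actual go-ethereum definitions:

package main

import "fmt"

// Simplified stand-ins for the real trie types.
type hash [32]byte

type node interface{}

type shortNode struct {
	Key []byte
	Val node // either a child hash or a value
}

type fullNode [17]node // children are hashes, values or nil

// Old scheme: cache the RLP blob plus an explicit child map, tracking
// every reference a second time.
type oldEntry struct {
	blob     []byte
	children map[hash]int
}

// New scheme: cache the collapsed node itself; implicit children are
// recovered by walking it, so only external references need a map.
type newEntry struct {
	n        node
	children map[hash]uint16
}

// childHashes recovers the implicit references from the node structure.
func childHashes(n node, out *[]hash) {
	switch n := n.(type) {
	case *shortNode:
		childHashes(n.Val, out)
	case fullNode:
		for _, c := range n {
			childHashes(c, out)
		}
	case hash:
		*out = append(*out, n)
	}
}

func main() {
	child := hash{1}
	n := &shortNode{Key: []byte{1}, Val: child}

	var refs []hash
	childHashes(n, &refs)
	fmt.Printf("recovered %d reference(s) without a side map\n", len(refs))
}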
Diffstat (limited to 'trie')
-rw-r--r--  trie/database.go  274
-rw-r--r--  trie/hasher.go     30
-rw-r--r--  trie/node.go       15
-rw-r--r--  trie/trie.go        8

4 files changed, 260 insertions(+), 67 deletions(-)
diff --git a/trie/database.go b/trie/database.go
index 468c139df..88c6e9cd6 100644
--- a/trie/database.go
+++ b/trie/database.go
@@ -17,6 +17,8 @@
package trie
import (
+ "fmt"
+ "io"
"sync"
"time"
@@ -24,6 +26,7 @@ import (
"github.com/ethereum/go-ethereum/ethdb"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/metrics"
+ "github.com/ethereum/go-ethereum/rlp"
)
var (
@@ -82,25 +85,188 @@ type Database struct {
lock sync.RWMutex
}
+// rawNode is a simple binary blob used to differentiate between collapsed trie
+// nodes and already encoded RLP binary blobs (while at the same time storing
+// them in the same cache fields).
+type rawNode []byte
+
+func (n rawNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") }
+func (n rawNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") }
+func (n rawNode) fstring(ind string) string { panic("this should never end up in a live trie") }
+
+// rawFullNode represents only the useful data content of a full node, with the
+// caches and flags stripped out to minimize its data storage. This type honors
+// the same RLP encoding as the original parent.
+type rawFullNode [17]node
+
+func (n rawFullNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") }
+func (n rawFullNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") }
+func (n rawFullNode) fstring(ind string) string { panic("this should never end up in a live trie") }
+
+func (n rawFullNode) EncodeRLP(w io.Writer) error {
+ var nodes [17]node
+
+ for i, child := range n {
+ if child != nil {
+ nodes[i] = child
+ } else {
+ nodes[i] = nilValueNode
+ }
+ }
+ return rlp.Encode(w, nodes)
+}
+
+// rawShortNode represents only the useful data content of a short node, with the
+// caches and flags stripped out to minimize its data storage. This type honors
+// the same RLP encoding as the original parent.
+type rawShortNode struct {
+ Key []byte
+ Val node
+}
+
+func (n rawShortNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") }
+func (n rawShortNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") }
+func (n rawShortNode) fstring(ind string) string { panic("this should never end up in a live trie") }
+
// cachedNode is all the information we know about a single cached node in the
// memory database write layer.
type cachedNode struct {
- blob []byte // Cached data block of the trie node
- parents int // Number of live nodes referencing this one
- children map[common.Hash]int // Children referenced by this nodes
+ node node // Cached collapsed trie node, or raw rlp data
+ size uint16 // Byte size of the useful cached data
+
+ parents uint16 // Number of live nodes referencing this one
+ children map[common.Hash]uint16 // External children referenced by this node
flushPrev common.Hash // Previous node in the flush-list
flushNext common.Hash // Next node in the flush-list
}
+// rlp returns the raw rlp encoded blob of the cached node, either directly from
+// the cache, or by regenerating it from the collapsed node.
+func (n *cachedNode) rlp() []byte {
+ if node, ok := n.node.(rawNode); ok {
+ return node
+ }
+ blob, err := rlp.EncodeToBytes(n.node)
+ if err != nil {
+ panic(err)
+ }
+ return blob
+}
+
+// obj returns the decoded and expanded trie node, either directly from the cache,
+// or by regenerating it from the rlp encoded blob.
+func (n *cachedNode) obj(hash common.Hash, cachegen uint16) node {
+ if node, ok := n.node.(rawNode); ok {
+ return mustDecodeNode(hash[:], node, cachegen)
+ }
+ return expandNode(hash[:], n.node, cachegen)
+}
+
+// childs returns all the tracked children of this node, both the implicit ones
+// from inside the node as well as the explicit ones from outside the node.
+func (n *cachedNode) childs() []common.Hash {
+ children := make([]common.Hash, 0, 16)
+ for child := range n.children {
+ children = append(children, child)
+ }
+ if _, ok := n.node.(rawNode); !ok {
+ gatherChildren(n.node, &children)
+ }
+ return children
+}
+
+// gatherChildren traverses the node hierarchy of a collapsed storage node and
+// retrieves all the hashnode children.
+func gatherChildren(n node, children *[]common.Hash) {
+ switch n := n.(type) {
+ case *rawShortNode:
+ gatherChildren(n.Val, children)
+
+ case rawFullNode:
+ for i := 0; i < 16; i++ {
+ gatherChildren(n[i], children)
+ }
+ case hashNode:
+ *children = append(*children, common.BytesToHash(n))
+
+ case valueNode, nil:
+
+ default:
+ panic(fmt.Sprintf("unknown node type: %T", n))
+ }
+}
+
+// simplifyNode traverses the hierarchy of an expanded memory node and discards
+// all the internal caches, returning a node that only contains the raw data.
+func simplifyNode(n node) node {
+ switch n := n.(type) {
+ case *shortNode:
+ // Short nodes discard the flags and cascade
+ return &rawShortNode{Key: n.Key, Val: simplifyNode(n.Val)}
+
+ case *fullNode:
+ // Full nodes discard the flags and cascade
+ node := rawFullNode(n.Children)
+ for i := 0; i < len(node); i++ {
+ if node[i] != nil {
+ node[i] = simplifyNode(node[i])
+ }
+ }
+ return node
+
+ case valueNode, hashNode, rawNode:
+ return n
+
+ default:
+ panic(fmt.Sprintf("unknown node type: %T", n))
+ }
+}
+
+// expandNode traverses the node hierarchy of a collapsed storage node and converts
+// all fields and keys into expanded memory form.
+func expandNode(hash hashNode, n node, cachegen uint16) node {
+ switch n := n.(type) {
+ case *rawShortNode:
+ // Short nodes need key and child expansion
+ return &shortNode{
+ Key: compactToHex(n.Key),
+ Val: expandNode(nil, n.Val, cachegen),
+ flags: nodeFlag{
+ hash: hash,
+ gen: cachegen,
+ },
+ }
+
+ case rawFullNode:
+ // Full nodes need child expansion
+ node := &fullNode{
+ flags: nodeFlag{
+ hash: hash,
+ gen: cachegen,
+ },
+ }
+ for i := 0; i < len(node.Children); i++ {
+ if n[i] != nil {
+ node.Children[i] = expandNode(nil, n[i], cachegen)
+ }
+ }
+ return node
+
+ case valueNode, hashNode:
+ return n
+
+ default:
+ panic(fmt.Sprintf("unknown node type: %T", n))
+ }
+}
+
// NewDatabase creates a new trie database to store ephemeral trie content before
// it's written out to disk or garbage collected.
func NewDatabase(diskdb ethdb.Database) *Database {
return &Database{
- diskdb: diskdb,
- nodes: map[common.Hash]*cachedNode{
- {}: {children: make(map[common.Hash]int)},
- },
+ diskdb: diskdb,
+ nodes: map[common.Hash]*cachedNode{{}: {}},
preimages: make(map[common.Hash][]byte),
}
}
@@ -110,33 +276,46 @@ func (db *Database) DiskDB() DatabaseReader {
return db.diskdb
}
-// Insert writes a new trie node to the memory database if it's yet unknown. The
-// method will make a copy of the slice.
-func (db *Database) Insert(hash common.Hash, blob []byte) {
+// InsertBlob writes a new reference-tracked blob to the memory database if it's
+// yet unknown. This method should only be used for non-trie nodes that require
+// reference counting, since trie nodes are garbage collected directly through
+// their embedded children.
+func (db *Database) InsertBlob(hash common.Hash, blob []byte) {
db.lock.Lock()
defer db.lock.Unlock()
- db.insert(hash, blob)
+ db.insert(hash, blob, rawNode(blob))
}
-// insert is the private locked version of Insert.
-func (db *Database) insert(hash common.Hash, blob []byte) {
+// insert inserts a collapsed trie node into the memory database. This method is
+// a more generic version of InsertBlob, supporting both raw blob insertions and
+// trie node insertions. The blob must always be specified to allow proper
+// size tracking.
+func (db *Database) insert(hash common.Hash, blob []byte, node node) {
// If the node's already cached, skip
if _, ok := db.nodes[hash]; ok {
return
}
- db.nodes[hash] = &cachedNode{
- blob: common.CopyBytes(blob),
- children: make(map[common.Hash]int),
+ // Create the cached entry for this node
+ entry := &cachedNode{
+ node: simplifyNode(node),
+ size: uint16(len(blob)),
flushPrev: db.newest,
}
+ for _, child := range entry.childs() {
+ if c := db.nodes[child]; c != nil {
+ c.parents++
+ }
+ }
+ db.nodes[hash] = entry
+
// Update the flush-list endpoints
if db.oldest == (common.Hash{}) {
db.oldest, db.newest = hash, hash
} else {
db.nodes[db.newest].flushNext, db.newest = hash, hash
}
- db.nodesSize += common.StorageSize(common.HashLength + len(blob))
+ db.nodesSize += common.StorageSize(common.HashLength + entry.size)
}
// insertPreimage writes a new trie node pre-image to the memory database if it's
@@ -151,8 +330,27 @@ func (db *Database) insertPreimage(hash common.Hash, preimage []byte) {
db.preimagesSize += common.StorageSize(common.HashLength + len(preimage))
}
-// Node retrieves a cached trie node from memory. If it cannot be found cached,
-// the method queries the persistent database for the content.
+// node retrieves a cached trie node from memory, or returns nil if none can be
+// found in the memory cache.
+func (db *Database) node(hash common.Hash, cachegen uint16) node {
+ // Retrieve the node from cache if available
+ db.lock.RLock()
+ node := db.nodes[hash]
+ db.lock.RUnlock()
+
+ if node != nil {
+ return node.obj(hash, cachegen)
+ }
+ // Content unavailable in memory, attempt to retrieve from disk
+ enc, err := db.diskdb.Get(hash[:])
+ if err != nil || enc == nil {
+ return nil
+ }
+ return mustDecodeNode(hash[:], enc, cachegen)
+}
+
+// Node retrieves an encoded cached trie node from memory. If it cannot be found
+// cached, the method queries the persistent database for the content.
func (db *Database) Node(hash common.Hash) ([]byte, error) {
// Retrieve the node from cache if available
db.lock.RLock()
@@ -160,7 +358,7 @@ func (db *Database) Node(hash common.Hash) ([]byte, error) {
db.lock.RUnlock()
if node != nil {
- return node.blob, nil
+ return node.rlp(), nil
}
// Content unavailable in memory, attempt to retrieve from disk
return db.diskdb.Get(hash[:])
@@ -222,20 +420,22 @@ func (db *Database) reference(child common.Hash, parent common.Hash) {
return
}
// If the reference already exists, only duplicate for roots
- if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) {
+ if db.nodes[parent].children == nil {
+ db.nodes[parent].children = make(map[common.Hash]uint16)
+ } else if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) {
return
}
node.parents++
db.nodes[parent].children[child]++
}
-// Dereference removes an existing reference from a parent node to a child node.
-func (db *Database) Dereference(child common.Hash, parent common.Hash) {
+// Dereference removes an existing reference from a root node.
+func (db *Database) Dereference(root common.Hash) {
db.lock.Lock()
defer db.lock.Unlock()
nodes, storage, start := len(db.nodes), db.nodesSize, time.Now()
- db.dereference(child, parent)
+ db.dereference(root, common.Hash{})
db.gcnodes += uint64(nodes - len(db.nodes))
db.gcsize += storage - db.nodesSize
@@ -254,9 +454,11 @@ func (db *Database) dereference(child common.Hash, parent common.Hash) {
// Dereference the parent-child
node := db.nodes[parent]
- node.children[child]--
- if node.children[child] == 0 {
- delete(node.children, child)
+ if node.children != nil && node.children[child] > 0 {
+ node.children[child]--
+ if node.children[child] == 0 {
+ delete(node.children, child)
+ }
}
// If the child does not exist, it's a previously committed node.
node, ok := db.nodes[child]
@@ -274,11 +476,11 @@ func (db *Database) dereference(child common.Hash, parent common.Hash) {
db.nodes[node.flushNext].flushPrev = node.flushPrev
}
// Dereference all children and delete the node
- for hash := range node.children {
+ for _, hash := range node.childs() {
db.dereference(hash, child)
}
delete(db.nodes, child)
- db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
+ db.nodesSize -= common.StorageSize(common.HashLength + int(node.size))
}
}
@@ -323,7 +525,7 @@ func (db *Database) Cap(limit common.StorageSize) error {
for size > limit && oldest != (common.Hash{}) {
// Fetch the oldest referenced node and push into the batch
node := db.nodes[oldest]
- if err := batch.Put(oldest[:], node.blob); err != nil {
+ if err := batch.Put(oldest[:], node.rlp()); err != nil {
db.lock.RUnlock()
return err
}
@@ -340,7 +542,7 @@ func (db *Database) Cap(limit common.StorageSize) error {
// is the total size, including both the useful cached data (hash -> blob), as
// well as the flushlist metadata (2*hash). When flushing items from the cache,
// we need to reduce both.
- size -= common.StorageSize(3*common.HashLength + len(node.blob))
+ size -= common.StorageSize(3*common.HashLength + int(node.size))
oldest = node.flushNext
}
// Flush out any remainder data from the last batch
@@ -364,7 +566,7 @@ func (db *Database) Cap(limit common.StorageSize) error {
delete(db.nodes, db.oldest)
db.oldest = node.flushNext
- db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
+ db.nodesSize -= common.StorageSize(common.HashLength + int(node.size))
}
if db.oldest != (common.Hash{}) {
db.nodes[db.oldest].flushPrev = common.Hash{}
@@ -460,12 +662,12 @@ func (db *Database) commit(hash common.Hash, batch ethdb.Batch) error {
if !ok {
return nil
}
- for child := range node.children {
+ for _, child := range node.childs() {
if err := db.commit(child, batch); err != nil {
return err
}
}
- if err := batch.Put(hash[:], node.blob); err != nil {
+ if err := batch.Put(hash[:], node.rlp()); err != nil {
return err
}
// If we've reached an optimal batch size, commit and start over
@@ -496,11 +698,11 @@ func (db *Database) uncache(hash common.Hash) {
db.nodes[node.flushNext].flushPrev = node.flushPrev
}
// Uncache the node's subtries and remove the node itself too
- for child := range node.children {
+ for _, child := range node.childs() {
db.uncache(child)
}
delete(db.nodes, hash)
- db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob))
+ db.nodesSize -= common.StorageSize(common.HashLength + int(node.size))
}
// Size returns the current storage size of the memory cache in front of the
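One consequence visible in the database.go hunks above is the API change from Dereference(child, parent) to Dereference(root): callers now drop a whole stale trie from its root alone, and the children are found by walking the cached collapsed nodes. A hedged usage sketch follows; Prune, db and root are illustrative assumptions, not code from this change:

package prune

import (
	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/trie"
)

// Prune drops a state root that is no longer needed from the in-memory
// trie cache. Before this change the equivalent call was the two-argument
// form db.Dereference(root, common.Hash{}); the parent side is now implicit.
func Prune(db *trie.Database, root common.Hash) {
	db.Dereference(root)
}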
diff --git a/trie/hasher.go b/trie/hasher.go
index 47c6dd8f9..7b1d7793f 100644
--- a/trie/hasher.go
+++ b/trie/hasher.go
@@ -137,9 +137,6 @@ func (h *hasher) hashChildren(original node, db *Database) (node, node, error) {
return original, original, err
}
}
- if collapsed.Val == nil {
- collapsed.Val = valueNode(nil) // Ensure that nil children are encoded as empty strings.
- }
return collapsed, cached, nil
case *fullNode:
@@ -152,14 +149,9 @@ func (h *hasher) hashChildren(original node, db *Database) (node, node, error) {
if err != nil {
return original, original, err
}
- } else {
- collapsed.Children[i] = valueNode(nil) // Ensure that nil children are encoded as empty strings.
}
}
cached.Children[16] = n.Children[16]
- if collapsed.Children[16] == nil {
- collapsed.Children[16] = valueNode(nil)
- }
return collapsed, cached, nil
default:
@@ -192,34 +184,22 @@ func (h *hasher) store(n node, db *Database, force bool) (node, error) {
if db != nil {
// We are pooling the trie nodes into an intermediate memory cache
- db.lock.Lock()
hash := common.BytesToHash(hash)
- db.insert(hash, h.tmp)
- // Track all direct parent->child node references
- switch n := n.(type) {
- case *shortNode:
- if child, ok := n.Val.(hashNode); ok {
- db.reference(common.BytesToHash(child), hash)
- }
- case *fullNode:
- for i := 0; i < 16; i++ {
- if child, ok := n.Children[i].(hashNode); ok {
- db.reference(common.BytesToHash(child), hash)
- }
- }
- }
+
+ db.lock.Lock()
+ db.insert(hash, h.tmp, n)
db.lock.Unlock()
// Track external references from account->storage trie
if h.onleaf != nil {
switch n := n.(type) {
case *shortNode:
- if child, ok := n.Val.(valueNode); ok && child != nil {
+ if child, ok := n.Val.(valueNode); ok {
h.onleaf(child, hash)
}
case *fullNode:
for i := 0; i < 16; i++ {
- if child, ok := n.Children[i].(valueNode); ok && child != nil {
+ if child, ok := n.Children[i].(valueNode); ok {
h.onleaf(child, hash)
}
}
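Dropping the child != nil guards above is safe because, with the valueNode(nil) placeholders gone from collapsed nodes, an absent child is now a nil interface and simply fails the type assertion. A small standalone demonstration of that Go subtlety; the local node and valueNode types here are stand-ins:

package main

import "fmt"

type node interface{}
type valueNode []byte

func main() {
	// A nil-valued valueNode still carries a concrete type, so the
	// assertion succeeds; this is why the old guard was needed.
	var n node = valueNode(nil)
	child, ok := n.(valueNode)
	fmt.Println(ok, child == nil) // true true

	// A nil interface has no concrete type, so the assertion fails
	// and the onleaf callback is never reached for absent children.
	var m node
	_, ok = m.(valueNode)
	fmt.Println(ok) // false
}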
diff --git a/trie/node.go b/trie/node.go
index 02815042c..a06f1b389 100644
--- a/trie/node.go
+++ b/trie/node.go
@@ -47,9 +47,22 @@ type (
valueNode []byte
)
+// nilValueNode is used when collapsing internal trie nodes for hashing, since
+// unset children need to serialize correctly.
+var nilValueNode = valueNode(nil)
+
// EncodeRLP encodes a full node into the consensus RLP format.
func (n *fullNode) EncodeRLP(w io.Writer) error {
- return rlp.Encode(w, n.Children)
+ var nodes [17]node
+
+ for i, child := range n.Children {
+ if child != nil {
+ nodes[i] = child
+ } else {
+ nodes[i] = nilValueNode
+ }
+ }
+ return rlp.Encode(w, nodes)
}
func (n *fullNode) copy() *fullNode { copy := *n; return &copy }
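The nilValueNode substitution works because RLP encodes a nil byte slice as the empty string 0x80, which is the consensus representation of an absent child. A quick standalone check, assuming the go-ethereum rlp package:

package main

import (
	"fmt"

	"github.com/ethereum/go-ethereum/rlp"
)

func main() {
	// An absent child must occupy its slot as the RLP empty string.
	// Encoding a nil byte slice (which is what valueNode(nil) is)
	// yields exactly that: 0x80.
	enc, err := rlp.EncodeToBytes([]byte(nil))
	if err != nil {
		panic(err)
	}
	fmt.Printf("%x\n", enc) // 80
}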
diff --git a/trie/trie.go b/trie/trie.go
index 30543c549..4284e30ad 100644
--- a/trie/trie.go
+++ b/trie/trie.go
@@ -433,12 +433,10 @@ func (t *Trie) resolveHash(n hashNode, prefix []byte) (node, error) {
cacheMissCounter.Inc(1)
hash := common.BytesToHash(n)
-
- enc, err := t.db.Node(hash)
- if err != nil || enc == nil {
- return nil, &MissingNodeError{NodeHash: hash, Path: prefix}
+ if node := t.db.node(hash, t.cachegen); node != nil {
+ return node, nil
}
- return mustDecodeNode(n, enc, t.cachegen), nil
+ return nil, &MissingNodeError{NodeHash: hash, Path: prefix}
}
// Root returns the root hash of the trie.