From 64174f196feb1b7ad58eed329ef5cdbebce9834c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= Date: Mon, 25 May 2015 15:04:05 +0300 Subject: p2p/discover: add support for counting findnode failures --- p2p/discover/database.go | 17 ++++++++++++++--- p2p/discover/database_test.go | 11 +++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'p2p/discover') diff --git a/p2p/discover/database.go b/p2p/discover/database.go index 3a3f1254b..1b73c3dea 100644 --- a/p2p/discover/database.go +++ b/p2p/discover/database.go @@ -44,9 +44,10 @@ var ( nodeDBVersionKey = []byte("version") // Version of the database to flush if changes nodeDBItemPrefix = []byte("n:") // Identifier to prefix node entries with - nodeDBDiscoverRoot = ":discover" - nodeDBDiscoverPing = nodeDBDiscoverRoot + ":lastping" - nodeDBDiscoverPong = nodeDBDiscoverRoot + ":lastpong" + nodeDBDiscoverRoot = ":discover" + nodeDBDiscoverPing = nodeDBDiscoverRoot + ":lastping" + nodeDBDiscoverPong = nodeDBDiscoverRoot + ":lastpong" + nodeDBDiscoverFindFails = nodeDBDiscoverRoot + ":findfail" ) // newNodeDB creates a new node database for storing and retrieving infos about @@ -275,6 +276,16 @@ func (db *nodeDB) updateLastPong(id NodeID, instance time.Time) error { return db.storeInt64(makeKey(id, nodeDBDiscoverPong), instance.Unix()) } +// findFails retrieves the number of findnode failures since bonding. +func (db *nodeDB) findFails(id NodeID) int { + return int(db.fetchInt64(makeKey(id, nodeDBDiscoverFindFails))) +} + +// updateFindFails updates the number of findnode failures since bonding. +func (db *nodeDB) updateFindFails(id NodeID, fails int) error { + return db.storeInt64(makeKey(id, nodeDBDiscoverFindFails), int64(fails)) +} + // querySeeds retrieves a batch of nodes to be used as potential seed servers // during bootstrapping the node into the network. // diff --git a/p2p/discover/database_test.go b/p2p/discover/database_test.go index 88f5d2155..4fce164ca 100644 --- a/p2p/discover/database_test.go +++ b/p2p/discover/database_test.go @@ -93,6 +93,7 @@ func TestNodeDBFetchStore(t *testing.T) { 30303, ) inst := time.Now() + num := 314 db, _ := newNodeDB("", Version, NodeID{}) defer db.close() @@ -117,6 +118,16 @@ func TestNodeDBFetchStore(t *testing.T) { if stored := db.lastPong(node.ID); stored.Unix() != inst.Unix() { t.Errorf("pong: value mismatch: have %v, want %v", stored, inst) } + // Check fetch/store operations on a node findnode-failure object + if stored := db.findFails(node.ID); stored != 0 { + t.Errorf("find-node fails: non-existing object: %v", stored) + } + if err := db.updateFindFails(node.ID, num); err != nil { + t.Errorf("find-node fails: failed to update: %v", err) + } + if stored := db.findFails(node.ID); stored != num { + t.Errorf("find-node fails: value mismatch: have %v, want %v", stored, num) + } // Check fetch/store operations on an actual node object if stored := db.node(node.ID); stored != nil { t.Errorf("node: non-existing object: %v", stored) -- cgit v1.2.3 From 6078aa08ebf165e523cc00b67d89f0f946ba4fb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= Date: Mon, 25 May 2015 15:04:40 +0300 Subject: p2p/discover: watch find failures, evacuate on too many, rebond if failed --- p2p/discover/table.go | 55 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 8 deletions(-) (limited to 'p2p/discover') diff --git a/p2p/discover/table.go b/p2p/discover/table.go index b523a0684..c45143307 100644 --- a/p2p/discover/table.go +++ b/p2p/discover/table.go @@ -27,6 +27,7 @@ const ( nBuckets = hashBits + 1 // Number of buckets maxBondingPingPongs = 16 + maxFindnodeFailures = 5 ) type Table struct { @@ -198,7 +199,19 @@ func (tab *Table) Lookup(targetID NodeID) []*Node { asked[n.ID] = true pendingQueries++ go func() { - r, _ := tab.net.findnode(n.ID, n.addr(), targetID) + // Find potential neighbors to bond with + r, err := tab.net.findnode(n.ID, n.addr(), targetID) + if err != nil { + // Bump the failure counter to detect and evacuate non-bonded entries + fails := tab.db.findFails(n.ID) + 1 + tab.db.updateFindFails(n.ID, fails) + glog.V(logger.Detail).Infof("Bumping failures for %x: %d", n.ID[:8], fails) + + if fails > maxFindnodeFailures { + glog.V(logger.Detail).Infof("Evacuating node %x: %d findnode failures", n.ID[:8], fails) + tab.del(n) + } + } reply <- tab.bondall(r) }() } @@ -305,8 +318,15 @@ func (tab *Table) bondall(nodes []*Node) (result []*Node) { // If pinged is true, the remote node has just pinged us and one half // of the process can be skipped. func (tab *Table) bond(pinged bool, id NodeID, addr *net.UDPAddr, tcpPort uint16) (*Node, error) { - var n *Node - if n = tab.db.node(id); n == nil { + // Retrieve a previously known node and any recent findnode failures + node, fails := tab.db.node(id), 0 + if node != nil { + fails = tab.db.findFails(id) + } + // If the node is unknown (non-bonded) or failed (remotely unknown), bond from scratch + if node == nil || fails > 0 { + glog.V(logger.Detail).Infof("Bonding %x: known=%v, fails=%v", id[:8], node != nil, fails) + tab.bondmu.Lock() w := tab.bonding[id] if w != nil { @@ -325,18 +345,22 @@ func (tab *Table) bond(pinged bool, id NodeID, addr *net.UDPAddr, tcpPort uint16 delete(tab.bonding, id) tab.bondmu.Unlock() } - n = w.n + node = w.n if w.err != nil { return nil, w.err } } + // Bonding succeeded, add to the table and reset previous findnode failures tab.mutex.Lock() defer tab.mutex.Unlock() - b := tab.buckets[logdist(tab.self.sha, n.sha)] - if !b.bump(n) { - tab.pingreplace(n, b) + + b := tab.buckets[logdist(tab.self.sha, node.sha)] + if !b.bump(node) { + tab.pingreplace(node, b) } - return n, nil + tab.db.updateFindFails(id, 0) + + return node, nil } func (tab *Table) pingpong(w *bondproc, pinged bool, id NodeID, addr *net.UDPAddr, tcpPort uint16) { @@ -414,6 +438,21 @@ outer: } } +// del removes an entry from the node table (used to evacuate failed/non-bonded +// discovery peers). +func (tab *Table) del(node *Node) { + tab.mutex.Lock() + defer tab.mutex.Unlock() + + bucket := tab.buckets[logdist(tab.self.sha, node.sha)] + for i := range bucket.entries { + if bucket.entries[i].ID == node.ID { + bucket.entries = append(bucket.entries[:i], bucket.entries[i+1:]...) + return + } + } +} + func (b *bucket) bump(n *Node) bool { for i := range b.entries { if b.entries[i].ID == n.ID { -- cgit v1.2.3 From 5076170f344afb970c1adc91429ced6ce93b5989 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= Date: Mon, 25 May 2015 15:22:54 +0300 Subject: p2p/discover: permit temporary bond failures for previously known nodes --- p2p/discover/table.go | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'p2p/discover') diff --git a/p2p/discover/table.go b/p2p/discover/table.go index c45143307..ee1d58cae 100644 --- a/p2p/discover/table.go +++ b/p2p/discover/table.go @@ -324,6 +324,7 @@ func (tab *Table) bond(pinged bool, id NodeID, addr *net.UDPAddr, tcpPort uint16 fails = tab.db.findFails(id) } // If the node is unknown (non-bonded) or failed (remotely unknown), bond from scratch + var result error if node == nil || fails > 0 { glog.V(logger.Detail).Infof("Bonding %x: known=%v, fails=%v", id[:8], node != nil, fails) @@ -345,22 +346,24 @@ func (tab *Table) bond(pinged bool, id NodeID, addr *net.UDPAddr, tcpPort uint16 delete(tab.bonding, id) tab.bondmu.Unlock() } - node = w.n - if w.err != nil { - return nil, w.err + // Retrieve the bonding results + result = w.err + if result == nil { + node = w.n } } - // Bonding succeeded, add to the table and reset previous findnode failures - tab.mutex.Lock() - defer tab.mutex.Unlock() + // Even if bonding temporarily failed, give the node a chance + if node != nil { + tab.mutex.Lock() + defer tab.mutex.Unlock() - b := tab.buckets[logdist(tab.self.sha, node.sha)] - if !b.bump(node) { - tab.pingreplace(node, b) + b := tab.buckets[logdist(tab.self.sha, node.sha)] + if !b.bump(node) { + tab.pingreplace(node, b) + } + tab.db.updateFindFails(id, 0) } - tab.db.updateFindFails(id, 0) - - return node, nil + return node, result } func (tab *Table) pingpong(w *bondproc, pinged bool, id NodeID, addr *net.UDPAddr, tcpPort uint16) { -- cgit v1.2.3 From f539ed1e6647cb27d873ae12e48db66df93df34a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= Date: Mon, 25 May 2015 15:57:44 +0300 Subject: p2p/discover: force refresh if the table is empty --- p2p/discover/table.go | 54 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 13 deletions(-) (limited to 'p2p/discover') diff --git a/p2p/discover/table.go b/p2p/discover/table.go index ee1d58cae..38bdea0ca 100644 --- a/p2p/discover/table.go +++ b/p2p/discover/table.go @@ -191,6 +191,12 @@ func (tab *Table) Lookup(targetID NodeID) []*Node { result := tab.closest(target, bucketSize) tab.mutex.Unlock() + // If the result set is empty, all nodes were dropped, refresh + if len(result.entries) == 0 { + tab.refresh() + return nil + } + for { // ask the alpha closest nodes that we haven't asked yet for i := 0; i < len(result.entries) && pendingQueries < alpha; i++ { @@ -207,7 +213,7 @@ func (tab *Table) Lookup(targetID NodeID) []*Node { tab.db.updateFindFails(n.ID, fails) glog.V(logger.Detail).Infof("Bumping failures for %x: %d", n.ID[:8], fails) - if fails > maxFindnodeFailures { + if fails >= maxFindnodeFailures { glog.V(logger.Detail).Infof("Evacuating node %x: %d findnode failures", n.ID[:8], fails) tab.del(n) } @@ -232,19 +238,41 @@ func (tab *Table) Lookup(targetID NodeID) []*Node { return result.entries } -// refresh performs a lookup for a random target to keep buckets full. +// refresh performs a lookup for a random target to keep buckets full, or seeds +// the table if it is empty (initial bootstrap or discarded faulty peers). func (tab *Table) refresh() { - // The Kademlia paper specifies that the bucket refresh should - // perform a refresh in the least recently used bucket. We cannot - // adhere to this because the findnode target is a 512bit value - // (not hash-sized) and it is not easily possible to generate a - // sha3 preimage that falls into a chosen bucket. - // - // We perform a lookup with a random target instead. - var target NodeID - rand.Read(target[:]) - result := tab.Lookup(target) - if len(result) == 0 { + seed := true + + // If the discovery table is empty, seed with previously known nodes + tab.mutex.Lock() + for _, bucket := range tab.buckets { + if len(bucket.entries) > 0 { + seed = false + break + } + } + tab.mutex.Unlock() + + // If the table is not empty, try to refresh using the live entries + if !seed { + // The Kademlia paper specifies that the bucket refresh should + // perform a refresh in the least recently used bucket. We cannot + // adhere to this because the findnode target is a 512bit value + // (not hash-sized) and it is not easily possible to generate a + // sha3 preimage that falls into a chosen bucket. + // + // We perform a lookup with a random target instead. + var target NodeID + rand.Read(target[:]) + + result := tab.Lookup(target) + if len(result) == 0 { + // Lookup failed, seed after all + seed = true + } + } + + if seed { // Pick a batch of previously know seeds to lookup with seeds := tab.db.querySeeds(10) for _, seed := range seeds { -- cgit v1.2.3 From 3630432dfb486a5bf8f853900be1fe32fefae182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= Date: Mon, 25 May 2015 16:23:16 +0300 Subject: p2p/discovery: fix a cornercase loop if no seeds or bootnodes are known --- p2p/discover/table.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'p2p/discover') diff --git a/p2p/discover/table.go b/p2p/discover/table.go index 38bdea0ca..7bff0d1d0 100644 --- a/p2p/discover/table.go +++ b/p2p/discover/table.go @@ -278,12 +278,16 @@ func (tab *Table) refresh() { for _, seed := range seeds { glog.V(logger.Debug).Infoln("Seeding network with", seed) } + peers := append(tab.nursery, seeds...) + // Bootstrap the table with a self lookup - all := tab.bondall(append(tab.nursery, seeds...)) - tab.mutex.Lock() - tab.add(all) - tab.mutex.Unlock() - tab.Lookup(tab.self.ID) + if len(peers) > 0 { + tab.mutex.Lock() + tab.add(peers) + tab.mutex.Unlock() + + tab.Lookup(tab.self.ID) + } // TODO: the Kademlia paper says that we're supposed to perform // random lookups in all buckets further away than our closest neighbor. } -- cgit v1.2.3 From 612f01400f59b0b4d0db9f9ceaa38f45805ea89e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= Date: Mon, 25 May 2015 16:35:32 +0300 Subject: p2p/discover: bond with seed nodes too (runs only if findnode failed) --- p2p/discover/table.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'p2p/discover') diff --git a/p2p/discover/table.go b/p2p/discover/table.go index 7bff0d1d0..4b7ddb775 100644 --- a/p2p/discover/table.go +++ b/p2p/discover/table.go @@ -278,14 +278,11 @@ func (tab *Table) refresh() { for _, seed := range seeds { glog.V(logger.Debug).Infoln("Seeding network with", seed) } - peers := append(tab.nursery, seeds...) - - // Bootstrap the table with a self lookup - if len(peers) > 0 { - tab.mutex.Lock() - tab.add(peers) - tab.mutex.Unlock() + nodes := append(tab.nursery, seeds...) + // Bond with all the seed nodes (will pingpong only if failed recently) + bonded := tab.bondall(nodes) + if len(bonded) > 0 { tab.Lookup(tab.self.ID) } // TODO: the Kademlia paper says that we're supposed to perform -- cgit v1.2.3