
p2p/dnsdisc: re-check tree root when leaf resolution fails (#20682)

This adds logic to re-resolve the root name of a tree once several leaf
requests have failed. The change is needed to avoid a failure state in
which leaf requests keep failing for half an hour after the tree has
been updated.
Felix Lange, 5 years ago
commit 57d4898e29
2 changed files with 113 additions and 18 deletions
  1. p2p/dnsdisc/client_test.go  +57 -9
  2. p2p/dnsdisc/sync.go  +56 -9

+ 57 - 9
p2p/dnsdisc/client_test.go

@@ -19,6 +19,7 @@ package dnsdisc
 import (
 	"context"
 	"crypto/ecdsa"
+	"errors"
 	"math/rand"
 	"reflect"
 	"testing"
@@ -176,11 +177,62 @@ func TestIteratorNodeUpdates(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	// sync the original tree.
+	// Sync the original tree.
 	resolver.add(tree1.ToTXT("n"))
 	checkIterator(t, it, nodes[:25])
 
-	// Update some nodes and ensure RandomNode returns the new nodes as well.
+	// Ensure RandomNode returns the new nodes after the tree is updated.
+	updateSomeNodes(nodesSeed1, nodes)
+	tree2, _ := makeTestTree("n", nodes, nil)
+	resolver.clear()
+	resolver.add(tree2.ToTXT("n"))
+	t.Log("tree updated")
+
+	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
+	checkIterator(t, it, nodes)
+}
+
+// This test checks that the tree root is rechecked when a couple of leaf
+// requests have failed. The test is just like TestIteratorNodeUpdates, but
+// without advancing the clock by recheckInterval after the tree update.
+func TestIteratorRootRecheckOnFail(t *testing.T) {
+	var (
+		clock    = new(mclock.Simulated)
+		nodes    = testNodes(nodesSeed1, 30)
+		resolver = newMapResolver()
+		c        = NewClient(Config{
+			Resolver:        resolver,
+			Logger:          testlog.Logger(t, log.LvlTrace),
+			RecheckInterval: 20 * time.Minute,
+			RateLimit:       500,
+			// Disabling the cache is required for this test because the client doesn't
+			// notice leaf failures if all records are cached.
+			CacheLimit: 1,
+		})
+	)
+	c.clock = clock
+	tree1, url := makeTestTree("n", nodes[:25], nil)
+	it, err := c.NewIterator(url)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Sync the original tree.
+	resolver.add(tree1.ToTXT("n"))
+	checkIterator(t, it, nodes[:25])
+
+	// Ensure RandomNode returns the new nodes after the tree is updated.
+	updateSomeNodes(nodesSeed1, nodes)
+	tree2, _ := makeTestTree("n", nodes, nil)
+	resolver.clear()
+	resolver.add(tree2.ToTXT("n"))
+	t.Log("tree updated")
+
+	checkIterator(t, it, nodes)
+}
+
+// updateSomeNodes applies ENR updates to some of the given nodes.
+func updateSomeNodes(keySeed int64, nodes []*enode.Node) {
 	keys := testKeys(nodesSeed1, len(nodes))
 	for i, n := range nodes[:len(nodes)/2] {
 		r := n.Record()
@@ -190,11 +242,6 @@ func TestIteratorNodeUpdates(t *testing.T) {
 		n2, _ := enode.New(enode.ValidSchemes, r)
 		nodes[i] = n2
 	}
-	tree2, _ := makeTestTree("n", nodes, nil)
-	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
-	resolver.clear()
-	resolver.add(tree2.ToTXT("n"))
-	checkIterator(t, it, nodes)
 }
 
 // This test verifies that randomIterator re-checks the root of the tree to catch
@@ -230,9 +277,10 @@ func TestIteratorLinkUpdates(t *testing.T) {
 	// Add link to tree3, remove link to tree2.
 	tree1, _ = makeTestTree("t1", nodes[:10], []string{url3})
 	resolver.add(tree1.ToTXT("t1"))
-	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
 	t.Log("tree1 updated")
 
+	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
+
 	var wantNodes []*enode.Node
 	wantNodes = append(wantNodes, tree1.Nodes()...)
 	wantNodes = append(wantNodes, tree3.Nodes()...)
@@ -345,5 +393,5 @@ func (mr mapResolver) LookupTXT(ctx context.Context, name string) ([]string, err
 	if record, ok := mr[name]; ok {
 		return []string{record}, nil
 	}
-	return nil, nil
+	return nil, errors.New("not found")
 }
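
The tests above drive time with the simulated clock from common/mclock instead of real sleeps; clock.Run advances the simulated time and fires any due timers. A standalone sketch of that pattern (not part of the patch; the durations are arbitrary):

package main

import (
	"fmt"
	"time"

	"github.com/ethereum/go-ethereum/common/mclock"
)

func main() {
	clock := new(mclock.Simulated)
	start := clock.Now()

	// Schedule a timer, then jump the simulated clock past its deadline.
	timer := clock.NewTimer(20 * time.Minute)
	clock.Run(20*time.Minute + time.Second) // returns immediately, no real waiting

	<-timer.C()
	fmt.Println("simulated time elapsed:", time.Duration(clock.Now()-start))
}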

+ 56 - 9
p2p/dnsdisc/sync.go

@@ -25,15 +25,22 @@ import (
 	"github.com/ethereum/go-ethereum/p2p/enode"
 )
 
+const (
+	rootRecheckFailCount = 5 // update root if this many leaf requests fail
+)
+
 // clientTree is a full tree being synced.
 type clientTree struct {
 	c   *Client
 	loc *linkEntry // link to this tree
 
 	lastRootCheck mclock.AbsTime // last revalidation of root
-	root          *rootEntry
-	enrs          *subtreeSync
-	links         *subtreeSync
+	leafFailCount int
+	rootFailCount int
+
+	root  *rootEntry
+	enrs  *subtreeSync
+	links *subtreeSync
 
 	lc         *linkCache          // tracks all links between all trees
 	curLinks   map[string]struct{} // links contained in this tree
@@ -46,7 +53,7 @@ func newClientTree(c *Client, lc *linkCache, loc *linkEntry) *clientTree {
 
 // syncAll retrieves all entries of the tree.
 func (ct *clientTree) syncAll(dest map[string]entry) error {
-	if err := ct.updateRoot(); err != nil {
+	if err := ct.updateRoot(context.Background()); err != nil {
 		return err
 	}
 	if err := ct.links.resolveAll(dest); err != nil {
@@ -60,12 +67,20 @@ func (ct *clientTree) syncAll(dest map[string]entry) error {
 
 // syncRandom retrieves a single entry of the tree. The Node return value
 // is non-nil if the entry was a node.
-func (ct *clientTree) syncRandom(ctx context.Context) (*enode.Node, error) {
+func (ct *clientTree) syncRandom(ctx context.Context) (n *enode.Node, err error) {
 	if ct.rootUpdateDue() {
-		if err := ct.updateRoot(); err != nil {
+		if err := ct.updateRoot(ctx); err != nil {
 			return nil, err
 		}
 	}
+
+	// Update fail counter for leaf request errors.
+	defer func() {
+		if err != nil {
+			ct.leafFailCount++
+		}
+	}()
+
 	// Link tree sync has priority, run it to completion before syncing ENRs.
 	if !ct.links.done() {
 		err := ct.syncNextLink(ctx)
@@ -138,15 +153,22 @@ func removeHash(h []string, index int) []string {
 }
 
 // updateRoot ensures that the given tree has an up-to-date root.
-func (ct *clientTree) updateRoot() error {
+func (ct *clientTree) updateRoot(ctx context.Context) error {
+	if !ct.slowdownRootUpdate(ctx) {
+		return ctx.Err()
+	}
+
 	ct.lastRootCheck = ct.c.clock.Now()
-	ctx, cancel := context.WithTimeout(context.Background(), ct.c.cfg.Timeout)
+	ctx, cancel := context.WithTimeout(ctx, ct.c.cfg.Timeout)
 	defer cancel()
 	root, err := ct.c.resolveRoot(ctx, ct.loc)
 	if err != nil {
+		ct.rootFailCount++
 		return err
 	}
 	ct.root = &root
+	ct.rootFailCount = 0
+	ct.leafFailCount = 0
 
 	// Invalidate subtrees if changed.
 	if ct.links == nil || root.lroot != ct.links.root {
@@ -161,7 +183,32 @@ func (ct *clientTree) updateRoot() error {
 
 // rootUpdateDue returns true when a root update is needed.
 func (ct *clientTree) rootUpdateDue() bool {
-	return ct.root == nil || time.Duration(ct.c.clock.Now()-ct.lastRootCheck) > ct.c.cfg.RecheckInterval
+	tooManyFailures := ct.leafFailCount > rootRecheckFailCount
+	scheduledCheck := ct.c.clock.Now().Sub(ct.lastRootCheck) > ct.c.cfg.RecheckInterval
+	return ct.root == nil || tooManyFailures || scheduledCheck
+}
+
+// slowdownRootUpdate applies a delay to root resolution if it is tried
+// too frequently. This avoids busy polling when the client is offline.
+// Returns true if the timeout passed, false if sync was canceled.
+func (ct *clientTree) slowdownRootUpdate(ctx context.Context) bool {
+	var delay time.Duration
+	switch {
+	case ct.rootFailCount > 20:
+		delay = 10 * time.Second
+	case ct.rootFailCount > 5:
+		delay = 5 * time.Second
+	default:
+		return true
+	}
+	timeout := ct.c.clock.NewTimer(delay)
+	defer timeout.Stop()
+	select {
+	case <-timeout.C():
+		return true
+	case <-ctx.Done():
+		return false
+	}
 }
 
 // subtreeSync is the sync of an ENR or link subtree.
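
Taken together, the patch introduces two counters: leafFailCount forces an early root recheck once more than rootRecheckFailCount leaf lookups have failed, and rootFailCount backs off repeated root lookups while the client appears to be offline. A standalone toy model of just those thresholds (not go-ethereum code; it only restates the constants visible in the diff and ignores the ct.root == nil case):

package main

import (
	"fmt"
	"time"
)

const rootRecheckFailCount = 5 // from the patch: recheck the root after this many failed leaf requests

type treeState struct {
	leafFailCount int
	rootFailCount int
}

// rootUpdateDue mirrors the new check: a root refresh happens on the regular
// schedule, or early once too many leaf lookups have failed.
func (s *treeState) rootUpdateDue(sinceLastCheck, recheckInterval time.Duration) bool {
	return s.leafFailCount > rootRecheckFailCount || sinceLastCheck > recheckInterval
}

// rootRetryDelay mirrors slowdownRootUpdate: repeated root failures back off
// to 5s, then 10s, between attempts.
func (s *treeState) rootRetryDelay() time.Duration {
	switch {
	case s.rootFailCount > 20:
		return 10 * time.Second
	case s.rootFailCount > 5:
		return 5 * time.Second
	default:
		return 0
	}
}

func main() {
	s := &treeState{leafFailCount: 6}
	fmt.Println(s.rootUpdateDue(time.Minute, 30*time.Minute)) // true: leaf failures force an early recheck
	s.rootFailCount = 7
	fmt.Println(s.rootRetryDelay()) // 5s back-off before the next root attempt
}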