Skip to content

Commit

Permalink
[FAB-18378] Log warning when peer is lagging behind and cannot catch up
Browse files Browse the repository at this point in the history
Fabric 2.2 disabled gossip state transfer by default.

When a peer is a gossip follower, it does not pull blocks from the ordering service,
and relies on the leader peer of its organization to forward blocks directly or indirectly.

However, if a peer misses a block and the rest of the peers in its organization advance too much,
it will never catch up because its state transfer is disabled.

This commit makes the peer log a warning if it is lagging behind and its configuration
might prevent it from obtaining blocks via either state transfer or the ordering service.

Change-Id: Ie7bc67cb13cb2d8e7ffab638a49e0365535865c2
Signed-off-by: Yacov Manevich <yacovm@il.ibm.com>
(cherry picked from commit 5f61ff0)
  • Loading branch information
yacovm authored and Brett Logan committed Dec 14, 2020
1 parent d5d9965 commit 73b39dc
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 3 deletions.
6 changes: 6 additions & 0 deletions gossip/state/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ type StateConfig struct {
StateBlockBufferSize int
StateChannelSize int
StateEnabled bool
UseLeaderElection bool
OrgLeader bool
}

func GlobalConfig() *StateConfig {
Expand Down Expand Up @@ -68,4 +70,8 @@ func (c *StateConfig) loadStateConfig() {
if viper.IsSet("peer.gossip.state.enabled") {
c.StateEnabled = viper.GetBool("peer.gossip.state.enabled")
}
// The below two configuration parameters are used for straggler() which warns
// if our peer is lagging behind the rest and has no way to catch up.
c.UseLeaderElection = viper.GetBool("peer.gossip.useLeaderElection")
c.OrgLeader = viper.GetBool("peer.gossip.orgLeader")
}
22 changes: 20 additions & 2 deletions gossip/state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ const (
defAntiEntropyStateResponseTimeout = 3 * time.Second
defAntiEntropyBatchSize = 10

defChannelBufferSize = 100
defAntiEntropyMaxRetries = 3
defChannelBufferSize = 100
defAntiEntropyMaxRetries = 3
stragglerWarningThreshold = 100

defMaxBlockDistance = 20

Expand Down Expand Up @@ -766,6 +767,13 @@ func (s *GossipStateProviderImpl) addPayload(payload *proto.Payload, blockingMod
}

if !blockingMode && payload.SeqNum-height >= uint64(s.config.StateBlockBufferSize) {
if s.straggler(height, payload) {
s.logger.Warningf("[%s] Current block height (%d) is too far behind other peers at height (%d) to be able to receive blocks "+
"without state transfer which is disabled in the configuration "+
"(peer.gossip.state.enabled = false). Consider enabling it or setting the peer explicitly to be a leader (peer.gossip.orgLeader = true) "+
"in order to pull blocks directly from the ordering service.",
s.chainID, height, payload.SeqNum+1)
}
return errors.Errorf("Ledger height is at %d, cannot enqueue block with sequence of %d", height, payload.SeqNum)
}

Expand All @@ -778,6 +786,16 @@ func (s *GossipStateProviderImpl) addPayload(payload *proto.Payload, blockingMod
return nil
}

// straggler reports whether this peer appears to be lagging behind with no
// configured way to catch up. It returns true only when all of the following hold:
//   - state transfer is disabled (config.StateEnabled is false);
//   - the received payload's sequence number exceeds currHeight by more than
//     stragglerWarningThreshold;
//   - the peer uses leader election, or is not explicitly configured as an
//     org leader.
func (s *GossipStateProviderImpl) straggler(currHeight uint64, receivedPayload *proto.Payload) bool {
	cfg := s.config
	transferEnabled := cfg.StateEnabled
	lagging := receivedPayload.SeqNum > currHeight+stragglerWarningThreshold

	switch {
	case transferEnabled:
		// Blocks can still be requested from other peers via state transfer.
		return false
	case !lagging:
		// The received block is within the warning threshold of our height.
		return false
	case !cfg.UseLeaderElection && cfg.OrgLeader:
		// Leader election is off and the peer is explicitly set as an org leader,
		// so it is not treated as depending on other peers for blocks.
		return false
	}
	return true
}

func (s *GossipStateProviderImpl) commitBlock(block *common.Block, pvtData util.PvtDataCollections) error {

t1 := time.Now()
Expand Down
55 changes: 55 additions & 0 deletions gossip/state/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,61 @@ func newBootNode(id int, committer committer.Committer, acceptor peerIdentityAcc
return newPeerNodeWithGossipWithValidatorWithMetrics(logger, id, committer, acceptor, nil, v, gossipMetrics)
}

// TestStraggler checks straggler() against a table of configurations and
// height/sequence pairs. Fields omitted from a case keep their zero value,
// so a case without `expected` asserts that straggler() returns false.
func TestStraggler(t *testing.T) {
	for _, testCase := range []struct {
		stateEnabled   bool
		orgLeader      bool
		leaderElection bool
		height         uint64
		receivedSeq    uint64
		expected       bool
	}{
		{
			// Far behind, state transfer disabled, leader election in use.
			height:         100,
			receivedSeq:    300,
			leaderElection: true,
			expected:       true,
		},
		{
			// Far behind, state transfer disabled, not an org leader.
			height:      100,
			receivedSeq: 300,
			expected:    true,
		},
		{
			// Explicitly configured as an org leader without leader election.
			height:      100,
			receivedSeq: 300,
			orgLeader:   true,
		},
		{
			// Received sequence is within the warning threshold.
			height:         100,
			receivedSeq:    105,
			leaderElection: true,
		},
		{
			// State transfer is enabled.
			height:         100,
			receivedSeq:    300,
			leaderElection: true,
			stateEnabled:   true,
		},
	} {
		description := fmt.Sprintf("%+v", testCase)
		t.Run(description, func(t *testing.T) {
			s := &GossipStateProviderImpl{
				config: &StateConfig{
					StateEnabled:      testCase.stateEnabled,
					OrgLeader:         testCase.orgLeader,
					UseLeaderElection: testCase.leaderElection,
				},
			}

			got := s.straggler(testCase.height, &proto.Payload{
				SeqNum: testCase.receivedSeq,
			})
			// Compare against the expectation; previously the result was
			// discarded and the test could never fail.
			if got != testCase.expected {
				t.Errorf("straggler() = %v, expected %v", got, testCase.expected)
			}
		})
	}
}

func TestNilDirectMsg(t *testing.T) {
mc := &mockCommitter{Mock: &mock.Mock{}}
mc.On("LedgerHeight", mock.Anything).Return(uint64(1), nil)
Expand Down
8 changes: 7 additions & 1 deletion sampleconfig/core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,13 @@ peer:
# indicates whether state transfer is enabled or not
# default value is true, i.e. state transfer is active
# and takes care to sync up missing blocks allowing
# lagging peer to catch up to speed with rest network
# a lagging peer to catch up with the rest of the network.
# Keep in mind that when peer.gossip.useLeaderElection is true
# and there are several peers in the organization,
# or peer.gossip.useLeaderElection is false and
# peer.gossip.orgLeader is also false, the peer's ledger may lag behind
# the rest of the peers and will never catch up due to state transfer
# being disabled.
enabled: false
# checkInterval interval to check whether peer is lagging behind enough to
# request blocks via state transfer from another peer.
Expand Down

0 comments on commit 73b39dc

Please sign in to comment.