@@ -547,24 +547,24 @@ func (r *raft) send(m pb.Message) {
547
547
}
548
548
}
549
549
550
- // sendAppend sends an append RPC with new entries (if any) and the
551
- // current commit index to the given peer.
552
- func (r * raft ) sendAppend (to uint64 ) {
553
- r .maybeSendAppend (to , true )
554
- }
555
-
556
- // maybeSendAppend sends an append RPC with new entries to the given peer,
557
- // if necessary. Returns true if a message was sent. The sendIfEmpty
558
- // argument controls whether messages with no entries will be sent
559
- // ("empty" messages are useful to convey updated Commit indexes, but
560
- // are undesirable when we're sending multiple messages in a batch).
550
+ // maybeSendAppend sends an append RPC with log entries (if any) that are not
551
+ // yet known to be replicated in the given peer's log, as well as the current
552
+ // commit index. Usually it sends a MsgApp message, but in some cases (e.g. the
553
+ // log has been compacted) it can send a MsgSnap.
554
+ //
555
+ // In some cases, the MsgApp message can have zero entries, and yet be sent.
556
+ // When the follower log is not fully up-to-date, we must send a MsgApp
557
+ // periodically so that eventually the flow is either accepted or rejected. Not
558
+ // doing so can result in a replication stall, in cases when a MsgApp is dropped.
561
559
//
562
- // TODO(pav-kv): make invocation of maybeSendAppend stateless. The Progress
563
- // struct contains all the state necessary for deciding whether to send a
564
- // message.
565
- func (r * raft ) maybeSendAppend (to uint64 , sendIfEmpty bool ) bool {
560
+ // Returns true if a message was sent, or false otherwise. A message is not sent
561
+ // if the follower log and commit index are up-to-date, the flow is paused (for
562
+ // reasons like in-flight limits), or the message could not be constructed.
563
+ func (r * raft ) maybeSendAppend (to uint64 ) bool {
566
564
pr := r .trk .Progress [to ]
567
- if pr .IsPaused () {
565
+
566
+ last , commit := r .raftLog .lastIndex (), r .raftLog .committed
567
+ if ! pr .ShouldSendMsgApp (last , commit ) {
568
568
return false
569
569
}
570
570
@@ -576,35 +576,25 @@ func (r *raft) maybeSendAppend(to uint64, sendIfEmpty bool) bool {
576
576
return r .maybeSendSnapshot (to , pr )
577
577
}
578
578
579
- var ents []pb.Entry
580
- // In a throttled StateReplicate only send empty MsgApp, to ensure progress.
581
- // Otherwise, if we had a full Inflights and all inflight messages were in
582
- // fact dropped, replication to that follower would stall. Instead, an empty
583
- // MsgApp will eventually reach the follower (heartbeats responses prompt the
584
- // leader to send an append), allowing it to be acked or rejected, both of
585
- // which will clear out Inflights.
586
- if pr .State != tracker .StateReplicate || ! pr .Inflights .Full () {
587
- ents , err = r .raftLog .entries (pr .Next , r .maxMsgSize )
588
- }
589
- if len (ents ) == 0 && ! sendIfEmpty {
590
- return false
591
- }
592
- // TODO(pav-kv): move this check up to where err is returned.
593
- if err != nil { // send a snapshot if we failed to get the entries
594
- return r .maybeSendSnapshot (to , pr )
579
+ var entries []pb.Entry
580
+ if pr .CanSendEntries (last ) {
581
+ if entries , err = r .raftLog .entries (pr .Next , r .maxMsgSize ); err != nil {
582
+ // Send a snapshot if we failed to get the entries.
583
+ return r .maybeSendSnapshot (to , pr )
584
+ }
595
585
}
596
586
597
- // Send the actual MsgApp otherwise , and update the progress accordingly.
587
+ // Send the MsgApp, and update the progress accordingly.
598
588
r .send (pb.Message {
599
589
To : to ,
600
590
Type : pb .MsgApp ,
601
591
Index : prevIndex ,
602
592
LogTerm : prevTerm ,
603
- Entries : ents ,
604
- Commit : r . raftLog . committed ,
593
+ Entries : entries ,
594
+ Commit : commit ,
605
595
})
606
- pr .SentEntries (len (ents ), uint64 (payloadsSize (ents )))
607
- pr .SentCommit (r . raftLog . committed )
596
+ pr .SentEntries (len (entries ), uint64 (payloadsSize (entries )))
597
+ pr .SentCommit (commit )
608
598
return true
609
599
}
610
600
@@ -662,7 +652,7 @@ func (r *raft) bcastAppend() {
662
652
if id == r .id {
663
653
return
664
654
}
665
- r .sendAppend (id )
655
+ r .maybeSendAppend (id )
666
656
})
667
657
}
668
658
@@ -1414,7 +1404,7 @@ func stepLeader(r *raft, m pb.Message) error {
1414
1404
if pr .State == tracker .StateReplicate {
1415
1405
pr .BecomeProbe ()
1416
1406
}
1417
- r .sendAppend (m .From )
1407
+ r .maybeSendAppend (m .From )
1418
1408
}
1419
1409
} else {
1420
1410
// We want to update our tracking if the response updates our
@@ -1450,21 +1440,13 @@ func stepLeader(r *raft, m pb.Message) error {
1450
1440
1451
1441
if r .maybeCommit () {
1452
1442
r .bcastAppend ()
1453
- } else if r .id != m .From && pr .CanBumpCommit (r .raftLog .committed ) {
1454
- // This node may be missing the latest commit index, so send it.
1455
- // NB: this is not strictly necessary because the periodic heartbeat
1456
- // messages deliver commit indices too. However, a message sent now
1457
- // may arrive earlier than the next heartbeat fires.
1458
- r .sendAppend (m .From )
1459
1443
}
1460
- // We've updated flow control information above, which may
1461
- // allow us to send multiple (size-limited) in-flight messages
1462
- // at once (such as when transitioning from probe to
1463
- // replicate, or when freeTo() covers multiple messages). If
1464
- // we have more entries to send, send as many messages as we
1465
- // can (without sending empty messages for the commit index)
1444
+ // We've updated flow control information above, which may allow us to
1445
+ // send multiple (size-limited) in-flight messages at once (such as when
1446
+ // transitioning from probe to replicate, or when freeTo() covers
1447
+ // multiple messages). Send as many messages as we can.
1466
1448
if r .id != m .From {
1467
- for r .maybeSendAppend (m .From , false /* sendIfEmpty */ ) {
1449
+ for r .maybeSendAppend (m .From ) {
1468
1450
}
1469
1451
}
1470
1452
// Transfer leadership is in progress.
@@ -1476,24 +1458,8 @@ func stepLeader(r *raft, m pb.Message) error {
1476
1458
}
1477
1459
case pb .MsgHeartbeatResp :
1478
1460
pr .RecentActive = true
1479
- pr .MsgAppFlowPaused = false
1480
-
1481
- // NB: if the follower is paused (full Inflights), this will still send an
1482
- // empty append, allowing it to recover from situations in which all the
1483
- // messages that filled up Inflights in the first place were dropped. Note
1484
- // also that the outgoing heartbeat already communicated the commit index.
1485
- //
1486
- // If the follower is fully caught up but also in StateProbe (as can happen
1487
- // if ReportUnreachable was called), we also want to send an append (it will
1488
- // be empty) to allow the follower to transition back to StateReplicate once
1489
- // it responds.
1490
- //
1491
- // Note that StateSnapshot typically satisfies pr.Match < lastIndex, but
1492
- // `pr.Paused()` is always true for StateSnapshot, so sendAppend is a
1493
- // no-op.
1494
- if pr .Match < r .raftLog .lastIndex () || pr .State == tracker .StateProbe {
1495
- r .sendAppend (m .From )
1496
- }
1461
+ pr .PauseMsgAppProbes (false )
1462
+ r .maybeSendAppend (m .From )
1497
1463
1498
1464
case pb .MsgSnapStatus :
1499
1465
if pr .State != tracker .StateSnapshot {
@@ -1549,7 +1515,8 @@ func stepLeader(r *raft, m pb.Message) error {
1549
1515
r .sendTimeoutNow (leadTransferee )
1550
1516
r .logger .Infof ("%x sends MsgTimeoutNow to %x immediately as %x already has up-to-date log" , r .id , leadTransferee , leadTransferee )
1551
1517
} else {
1552
- r .sendAppend (leadTransferee )
1518
+ pr .PauseMsgAppProbes (false )
1519
+ r .maybeSendAppend (leadTransferee )
1553
1520
}
1554
1521
}
1555
1522
return nil
@@ -1880,21 +1847,14 @@ func (r *raft) switchToConfig(cfg tracker.Config, trk tracker.ProgressMap) pb.Co
1880
1847
return cs
1881
1848
}
1882
1849
1883
- if r .maybeCommit () {
1884
- // If the configuration change means that more entries are committed now,
1885
- // broadcast/append to everyone in the updated config.
1886
- r .bcastAppend ()
1887
- } else {
1888
- // Otherwise, still probe the newly added replicas; there's no reason to
1889
- // let them wait out a heartbeat interval (or the next incoming
1890
- // proposal).
1891
- r .trk .Visit (func (id uint64 , pr * tracker.Progress ) {
1892
- if id == r .id {
1893
- return
1894
- }
1895
- r .maybeSendAppend (id , false /* sendIfEmpty */ )
1896
- })
1897
- }
1850
+ r .maybeCommit ()
1851
+ // If the configuration change means that more entries are committed now,
1852
+ // broadcast/append to everyone in the updated config.
1853
+ //
1854
+ // Otherwise, still probe the newly added replicas; there's no reason to let
1855
+ // them wait out a heartbeat interval (or the next incoming proposal).
1856
+ r .bcastAppend ()
1857
+
1898
1858
// If the leadTransferee was removed or demoted, abort the leadership transfer.
1899
1859
if _ , tOK := r .trk .Config .Voters .IDs ()[r .leadTransferee ]; ! tOK && r .leadTransferee != 0 {
1900
1860
r .abortLeaderTransfer ()
0 commit comments