Skip to content

Commit

Permalink
[pick_first] fix bug that caused us to stop triggering connection att…
Browse files Browse the repository at this point in the history
…empts
  • Loading branch information
markdroth committed Feb 11, 2025
1 parent 8fb4bd5 commit b5b51f3
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 6 deletions.
14 changes: 9 additions & 5 deletions src/core/load_balancing/pick_first/pick_first.cc
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ class PickFirst final : public LoadBalancingPolicy {
void RequestConnectionWithTimer();

bool seen_transient_failure() const { return seen_transient_failure_; }
void set_seen_transient_failure() { seen_transient_failure_ = true; }

private:
// This method will be invoked once soon after instantiation to report
Expand Down Expand Up @@ -826,9 +827,7 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange(
connectivity_state_ = new_state;
connectivity_status_ = std::move(status);
// Make sure we note when a subchannel has seen TRANSIENT_FAILURE.
bool prev_seen_transient_failure = seen_transient_failure_;
if (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE) {
seen_transient_failure_ = true;
subchannel_list_->last_failure_ = connectivity_status_;
}
// If this is the initial connectivity state update for this subchannel,
Expand Down Expand Up @@ -873,6 +872,8 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange(
// Otherwise, process connectivity state change.
switch (*connectivity_state_) {
case GRPC_CHANNEL_TRANSIENT_FAILURE: {
bool prev_seen_transient_failure =
std::exchange(seen_transient_failure_, true);
// If this is the first failure we've seen on this subchannel,
// then we're still in the Happy Eyeballs pass.
if (!prev_seen_transient_failure && seen_transient_failure_) {
Expand Down Expand Up @@ -1067,6 +1068,7 @@ void PickFirst::SubchannelList::StartConnectingNextSubchannel() {
sc->RequestConnectionWithTimer();
return;
}
sc->set_seen_transient_failure();
}
// If we didn't find a subchannel to request a connection on, check to
// see if the Happy Eyeballs pass is complete.
Expand Down Expand Up @@ -1150,6 +1152,7 @@ class OldPickFirst final : public LoadBalancingPolicy {
void ShutdownLocked();

bool seen_transient_failure() const { return seen_transient_failure_; }
void set_seen_transient_failure() { seen_transient_failure_ = true; }

private:
// Watcher for subchannel connectivity state.
Expand Down Expand Up @@ -1720,10 +1723,8 @@ void OldPickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange(
ProcessUnselectedReadyLocked();
return;
}
// Make sure we note when a subchannel has seen TRANSIENT_FAILURE.
bool prev_seen_transient_failure = seen_transient_failure_;
// Record status for TRANSIENT_FAILURE state.
if (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE) {
seen_transient_failure_ = true;
subchannel_list_->last_failure_ = connectivity_status_;
}
// If this is the initial connectivity state update for this subchannel,
Expand Down Expand Up @@ -1752,6 +1753,8 @@ void OldPickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange(
// Otherwise, process connectivity state change.
switch (*connectivity_state_) {
case GRPC_CHANNEL_TRANSIENT_FAILURE: {
bool prev_seen_transient_failure =
std::exchange(seen_transient_failure_, true);
// If this is the first failure we've seen on this subchannel,
// then we're still in the Happy Eyeballs pass.
if (!prev_seen_transient_failure && seen_transient_failure_) {
Expand Down Expand Up @@ -2004,6 +2007,7 @@ void OldPickFirst::SubchannelList::StartConnectingNextSubchannel() {
sc->RequestConnectionWithTimer();
return;
}
sc->set_seen_transient_failure();
}
// If we didn't find a subchannel to request a connection on, check to
// see if the Happy Eyeballs pass is complete.
Expand Down
62 changes: 61 additions & 1 deletion test/core/load_balancing/pick_first_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1172,7 +1172,7 @@ TEST_F(PickFirstTest, AddressUpdateRetainsSelectedAddress) {
// This exercizes a bug seen in the wild that caused a crash. For
// details, see https://github.com/grpc/grpc/pull/38144.
TEST_F(PickFirstTest, SubchannelNotificationAfterShutdown) {
// Send an update containing one address.
// Send an update containing two addresses.
constexpr std::array<absl::string_view, 2> kAddresses = {
"ipv4:127.0.0.1:443", "ipv4:127.0.0.1:444"};
absl::Status status = ApplyUpdate(
Expand Down Expand Up @@ -1226,6 +1226,66 @@ TEST_F(PickFirstTest, SubchannelNotificationAfterShutdown) {
subchannel->SetConnectivityState(GRPC_CHANNEL_IDLE);
}

// This exercizes a bug seen in the wild that caused us to silently stop
// triggering connection attempts at the end of the Happy Eyeballs pass.
TEST_F(PickFirstTest,
SubchannelInitiallyReportsTransientFailureButIsIdleForHappyEyeballs) {
constexpr std::array<absl::string_view, 2> kAddresses = {
"ipv4:127.0.0.1:443", "ipv4:127.0.0.1:444"};
// Pre-initialize the second subchannel to report TRANSIENT_FAILURE as
// its initial state.
auto* subchannel2 = CreateSubchannel(kAddresses[1]);
subchannel2->SetConnectivityState(GRPC_CHANNEL_TRANSIENT_FAILURE,
absl::UnavailableError("failed to connect"),
/*validate_state_transition=*/false);
// Send an update containing two addresses.
absl::Status status = ApplyUpdate(
BuildUpdate(kAddresses, MakePickFirstConfig(false)), lb_policy());
EXPECT_TRUE(status.ok()) << status;
// LB policy should have created a subchannel for the first address.
auto* subchannel = FindSubchannel(kAddresses[0]);
ASSERT_NE(subchannel, nullptr);
// When the LB policy receives the first subchannel's initial connectivity
// state notification (IDLE), it will request a connection.
EXPECT_TRUE(subchannel->ConnectionRequested());
// This causes the subchannel to start to connect, so it reports CONNECTING.
subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING);
// LB policy should have reported CONNECTING state.
ExpectConnectingUpdate();
// Second subchannel finishes backoff.
subchannel2->SetConnectivityState(GRPC_CHANNEL_IDLE);
// No connection attempt triggered on the second subchannel yet.
EXPECT_FALSE(subchannel2->ConnectionRequested());
// Now the Happy Eyeballs timer fires.
IncrementTimeBy(Duration::Milliseconds(250));
// This triggers a connection attempt on the second subchannel.
EXPECT_TRUE(subchannel2->ConnectionRequested());
// This causes the subchannel to start to connect, so it reports CONNECTING.
subchannel2->SetConnectivityState(GRPC_CHANNEL_CONNECTING);
// LB policy should have reported CONNECTING state.
ExpectConnectingUpdate();
// Second subchannel fails immediately.
subchannel2->SetConnectivityState(GRPC_CHANNEL_TRANSIENT_FAILURE,
absl::UnavailableError("ugh"));
// Second subchannel finishes backoff.
subchannel2->SetConnectivityState(GRPC_CHANNEL_IDLE);
// Now the first subchannel fails.
subchannel->SetConnectivityState(GRPC_CHANNEL_TRANSIENT_FAILURE,
absl::UnavailableError("ugh2"));
// This should trigger an immediate re-attempt on the second subchannel.
EXPECT_TRUE(subchannel2->ConnectionRequested());
// Subchannel should report CONNECTING.
subchannel2->SetConnectivityState(GRPC_CHANNEL_CONNECTING);
// The LB policy should request re-resolution.
ExpectReresolutionRequest();
// The LB policy will report TRANSIENT_FAILURE.
WaitForConnectionFailed([&](const absl::Status& status) {
EXPECT_EQ(status, absl::UnavailableError(
"failed to connect to all addresses; "
"last error: UNAVAILABLE: ugh2"));
});
}

TEST_F(PickFirstTest, WithShuffle) {
constexpr std::array<absl::string_view, 6> kAddresses = {
"ipv4:127.0.0.1:443", "ipv4:127.0.0.1:444", "ipv4:127.0.0.1:445",
Expand Down

0 comments on commit b5b51f3

Please sign in to comment.