diff --git a/src/core/load_balancing/pick_first/pick_first.cc b/src/core/load_balancing/pick_first/pick_first.cc index 8780c4911aed8b..7c017d0956b9e5 100644 --- a/src/core/load_balancing/pick_first/pick_first.cc +++ b/src/core/load_balancing/pick_first/pick_first.cc @@ -224,6 +224,7 @@ class PickFirst final : public LoadBalancingPolicy { void RequestConnectionWithTimer(); bool seen_transient_failure() const { return seen_transient_failure_; } + void set_seen_transient_failure() { seen_transient_failure_ = true; } private: // This method will be invoked once soon after instantiation to report @@ -826,9 +827,7 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( connectivity_state_ = new_state; connectivity_status_ = std::move(status); // Make sure we note when a subchannel has seen TRANSIENT_FAILURE. - bool prev_seen_transient_failure = seen_transient_failure_; if (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE) { - seen_transient_failure_ = true; subchannel_list_->last_failure_ = connectivity_status_; } // If this is the initial connectivity state update for this subchannel, @@ -873,6 +872,8 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( // Otherwise, process connectivity state change. switch (*connectivity_state_) { case GRPC_CHANNEL_TRANSIENT_FAILURE: { + bool prev_seen_transient_failure = + std::exchange(seen_transient_failure_, true); // If this is the first failure we've seen on this subchannel, // then we're still in the Happy Eyeballs pass. if (!prev_seen_transient_failure && seen_transient_failure_) { @@ -1067,6 +1068,7 @@ void PickFirst::SubchannelList::StartConnectingNextSubchannel() { sc->RequestConnectionWithTimer(); return; } + sc->set_seen_transient_failure(); } // If we didn't find a subchannel to request a connection on, check to // see if the Happy Eyeballs pass is complete. @@ -1150,6 +1152,7 @@ class OldPickFirst final : public LoadBalancingPolicy { void ShutdownLocked(); bool seen_transient_failure() const { return seen_transient_failure_; } + void set_seen_transient_failure() { seen_transient_failure_ = true; } private: // Watcher for subchannel connectivity state. @@ -1720,10 +1723,8 @@ void OldPickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( ProcessUnselectedReadyLocked(); return; } - // Make sure we note when a subchannel has seen TRANSIENT_FAILURE. - bool prev_seen_transient_failure = seen_transient_failure_; + // Record status for TRANSIENT_FAILURE state. if (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE) { - seen_transient_failure_ = true; subchannel_list_->last_failure_ = connectivity_status_; } // If this is the initial connectivity state update for this subchannel, @@ -1752,6 +1753,8 @@ void OldPickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( // Otherwise, process connectivity state change. switch (*connectivity_state_) { case GRPC_CHANNEL_TRANSIENT_FAILURE: { + bool prev_seen_transient_failure = + std::exchange(seen_transient_failure_, true); // If this is the first failure we've seen on this subchannel, // then we're still in the Happy Eyeballs pass. if (!prev_seen_transient_failure && seen_transient_failure_) { @@ -2004,6 +2007,7 @@ void OldPickFirst::SubchannelList::StartConnectingNextSubchannel() { sc->RequestConnectionWithTimer(); return; } + sc->set_seen_transient_failure(); } // If we didn't find a subchannel to request a connection on, check to // see if the Happy Eyeballs pass is complete. diff --git a/test/core/load_balancing/pick_first_test.cc b/test/core/load_balancing/pick_first_test.cc index a195235d185abd..bb974857020a04 100644 --- a/test/core/load_balancing/pick_first_test.cc +++ b/test/core/load_balancing/pick_first_test.cc @@ -1172,7 +1172,7 @@ TEST_F(PickFirstTest, AddressUpdateRetainsSelectedAddress) { // This exercizes a bug seen in the wild that caused a crash. For // details, see https://github.com/grpc/grpc/pull/38144. TEST_F(PickFirstTest, SubchannelNotificationAfterShutdown) { - // Send an update containing one address. + // Send an update containing two addresses. constexpr std::array kAddresses = { "ipv4:127.0.0.1:443", "ipv4:127.0.0.1:444"}; absl::Status status = ApplyUpdate( @@ -1226,6 +1226,66 @@ TEST_F(PickFirstTest, SubchannelNotificationAfterShutdown) { subchannel->SetConnectivityState(GRPC_CHANNEL_IDLE); } +// This exercizes a bug seen in the wild that caused us to silently stop +// triggering connection attempts at the end of the Happy Eyeballs pass. +TEST_F(PickFirstTest, + SubchannelInitiallyReportsTransientFailureButIsIdleForHappyEyeballs) { + constexpr std::array kAddresses = { + "ipv4:127.0.0.1:443", "ipv4:127.0.0.1:444"}; + // Pre-initialize the second subchannel to report TRANSIENT_FAILURE as + // its initial state. + auto* subchannel2 = CreateSubchannel(kAddresses[1]); + subchannel2->SetConnectivityState(GRPC_CHANNEL_TRANSIENT_FAILURE, + absl::UnavailableError("failed to connect"), + /*validate_state_transition=*/false); + // Send an update containing two addresses. + absl::Status status = ApplyUpdate( + BuildUpdate(kAddresses, MakePickFirstConfig(false)), lb_policy()); + EXPECT_TRUE(status.ok()) << status; + // LB policy should have created a subchannel for the first address. + auto* subchannel = FindSubchannel(kAddresses[0]); + ASSERT_NE(subchannel, nullptr); + // When the LB policy receives the first subchannel's initial connectivity + // state notification (IDLE), it will request a connection. + EXPECT_TRUE(subchannel->ConnectionRequested()); + // This causes the subchannel to start to connect, so it reports CONNECTING. + subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING); + // LB policy should have reported CONNECTING state. + ExpectConnectingUpdate(); + // Second subchannel finishes backoff. + subchannel2->SetConnectivityState(GRPC_CHANNEL_IDLE); + // No connection attempt triggered on the second subchannel yet. + EXPECT_FALSE(subchannel2->ConnectionRequested()); + // Now the Happy Eyeballs timer fires. + IncrementTimeBy(Duration::Milliseconds(250)); + // This triggers a connection attempt on the second subchannel. + EXPECT_TRUE(subchannel2->ConnectionRequested()); + // This causes the subchannel to start to connect, so it reports CONNECTING. + subchannel2->SetConnectivityState(GRPC_CHANNEL_CONNECTING); + // LB policy should have reported CONNECTING state. + ExpectConnectingUpdate(); + // Second subchannel fails immediately. + subchannel2->SetConnectivityState(GRPC_CHANNEL_TRANSIENT_FAILURE, + absl::UnavailableError("ugh")); + // Second subchannel finishes backoff. + subchannel2->SetConnectivityState(GRPC_CHANNEL_IDLE); + // Now the first subchannel fails. + subchannel->SetConnectivityState(GRPC_CHANNEL_TRANSIENT_FAILURE, + absl::UnavailableError("ugh2")); + // This should trigger an immediate re-attempt on the second subchannel. + EXPECT_TRUE(subchannel2->ConnectionRequested()); + // Subchannel should report CONNECTING. + subchannel2->SetConnectivityState(GRPC_CHANNEL_CONNECTING); + // The LB policy should request re-resolution. + ExpectReresolutionRequest(); + // The LB policy will report TRANSIENT_FAILURE. + WaitForConnectionFailed([&](const absl::Status& status) { + EXPECT_EQ(status, absl::UnavailableError( + "failed to connect to all addresses; " + "last error: UNAVAILABLE: ugh2")); + }); +} + TEST_F(PickFirstTest, WithShuffle) { constexpr std::array kAddresses = { "ipv4:127.0.0.1:443", "ipv4:127.0.0.1:444", "ipv4:127.0.0.1:445",