Skip to content

Commit 4376b6d

Browse files
committed
Removed mocked response and used a new excluded exception in CB
Signed-off-by: Nikhil Suri <[email protected]>
1 parent bcd6760 commit 4376b6d

File tree

5 files changed

+157
-313
lines changed

5 files changed

+157
-313
lines changed

src/databricks/sql/exc.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,3 +131,19 @@ class CursorAlreadyClosedError(RequestError):
131131
class TelemetryRateLimitError(Exception):
132132
"""Raised when telemetry endpoint returns 429 or 503, indicating rate limiting or service unavailable.
133133
This exception is used exclusively by the circuit breaker to track telemetry rate limiting events."""
134+
135+
136+
class TelemetryNonRateLimitError(Exception):
137+
"""Wrapper for telemetry errors that should NOT trigger circuit breaker.
138+
139+
This exception wraps non-rate-limiting errors (network errors, timeouts, server errors, etc.)
140+
and is excluded from circuit breaker failure counting. Only TelemetryRateLimitError should
141+
open the circuit breaker.
142+
143+
Attributes:
144+
original_exception: The actual exception that occurred
145+
"""
146+
147+
def __init__(self, original_exception: Exception):
148+
self.original_exception = original_exception
149+
super().__init__(f"Non-rate-limit telemetry error: {original_exception}")

src/databricks/sql/telemetry/circuit_breaker_manager.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import pybreaker
1414
from pybreaker import CircuitBreaker, CircuitBreakerError, CircuitBreakerListener
1515

16-
from databricks.sql.exc import TelemetryRateLimitError
16+
from databricks.sql.exc import TelemetryNonRateLimitError
1717

1818
logger = logging.getLogger(__name__)
1919

@@ -100,6 +100,9 @@ def get_circuit_breaker(cls, host: str) -> CircuitBreaker:
100100
fail_max=MINIMUM_CALLS,
101101
reset_timeout=RESET_TIMEOUT,
102102
name=f"{NAME_PREFIX}-{host}",
103+
exclude=[
104+
TelemetryNonRateLimitError
105+
], # Don't count these as failures
103106
)
104107
# Add state change listener for logging
105108
breaker.add_listener(CircuitBreakerStateListener())

src/databricks/sql/telemetry/telemetry_push_client.py

Lines changed: 93 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@
1818

1919
from databricks.sql.common.unified_http_client import UnifiedHttpClient
2020
from databricks.sql.common.http import HttpMethod
21-
from databricks.sql.exc import TelemetryRateLimitError, RequestError
21+
from databricks.sql.exc import (
22+
TelemetryRateLimitError,
23+
TelemetryNonRateLimitError,
24+
RequestError,
25+
)
2226
from databricks.sql.telemetry.circuit_breaker_manager import CircuitBreakerManager
2327

2428
logger = logging.getLogger(__name__)
@@ -85,124 +89,113 @@ def __init__(self, delegate: ITelemetryPushClient, host: str):
8589
host,
8690
)
8791

88-
def _create_mock_success_response(self) -> BaseHTTPResponse:
89-
"""
90-
Create a mock success response for when circuit breaker is open.
91-
92-
This allows telemetry to fail silently without raising exceptions.
93-
"""
94-
# Create a simple object that mimics BaseHTTPResponse interface
95-
class _MockTelemetryResponse:
96-
"""Simple response object for silently handling circuit breaker state."""
97-
98-
status = 200
99-
# Include all required fields for TelemetryResponse dataclass
100-
data = b'{"numProtoSuccess": 0, "numSuccess": 0, "numRealtimeSuccess": 0, "errors": []}'
101-
102-
def close(self):
103-
pass
104-
105-
return _MockTelemetryResponse()
106-
107-
def request(
92+
def _make_request_and_check_status(
10893
self,
10994
method: HttpMethod,
11095
url: str,
111-
headers: Optional[Dict[str, str]] = None,
96+
headers: Optional[Dict[str, str]],
11297
**kwargs,
11398
) -> BaseHTTPResponse:
11499
"""
115-
Make an HTTP request with circuit breaker protection.
100+
Make the request and check response status.
101+
102+
Raises TelemetryRateLimitError for 429/503 (circuit breaker counts these).
103+
Wraps other errors in TelemetryNonRateLimitError (circuit breaker excludes these).
116104
117-
Circuit breaker only opens for 429/503 responses (rate limiting).
118-
If circuit breaker is open, silently drops the telemetry request.
119-
Other errors fail silently without triggering circuit breaker.
105+
Args:
106+
method: HTTP method
107+
url: Request URL
108+
headers: Request headers
109+
**kwargs: Additional request parameters
110+
111+
Returns:
112+
HTTP response
113+
114+
Raises:
115+
TelemetryRateLimitError: For 429/503 status codes (circuit breaker counts)
116+
TelemetryNonRateLimitError: For other errors (circuit breaker excludes)
120117
"""
118+
try:
119+
response = self._delegate.request(method, url, headers, **kwargs)
121120

122-
def _make_request_and_check_status():
123-
"""
124-
Function that makes the request and checks response status.
121+
# Check for rate limiting or service unavailable
122+
if response.status in [429, 503]:
123+
logger.warning(
124+
"Telemetry endpoint returned %d for host %s, triggering circuit breaker",
125+
response.status,
126+
self._host,
127+
)
128+
raise TelemetryRateLimitError(
129+
f"Telemetry endpoint rate limited or unavailable: {response.status}"
130+
)
125131

126-
Raises TelemetryRateLimitError ONLY for 429/503 so circuit breaker counts them as failures.
127-
For all other errors, returns mock success response so circuit breaker does NOT count them.
132+
return response
128133

129-
This ensures circuit breaker only opens for rate limiting, not for network errors,
130-
timeouts, or server errors.
131-
"""
132-
try:
133-
response = self._delegate.request(method, url, headers, **kwargs)
134+
except Exception as e:
135+
# Don't catch TelemetryRateLimitError - let it propagate to circuit breaker
136+
if isinstance(e, TelemetryRateLimitError):
137+
raise
138+
139+
# Check if it's a RequestError with rate limiting status code (exhausted retries)
140+
if isinstance(e, RequestError):
141+
http_code = (
142+
e.context.get("http-code")
143+
if hasattr(e, "context") and e.context
144+
else None
145+
)
134146

135-
# Check for rate limiting or service unavailable in successful response
136-
# (case where urllib3 returns response without exhausting retries)
137-
if response.status in [429, 503]:
147+
if http_code in [429, 503]:
138148
logger.warning(
139-
"Telemetry endpoint returned %d for host %s, triggering circuit breaker",
140-
response.status,
149+
"Telemetry retries exhausted with status %d for host %s, triggering circuit breaker",
150+
http_code,
141151
self._host,
142152
)
143153
raise TelemetryRateLimitError(
144-
f"Telemetry endpoint rate limited or unavailable: {response.status}"
154+
f"Telemetry rate limited after retries: {http_code}"
145155
)
146156

147-
return response
148-
149-
except Exception as e:
150-
# Don't catch TelemetryRateLimitError - let it propagate to circuit breaker
151-
if isinstance(e, TelemetryRateLimitError):
152-
raise
153-
154-
# Check if it's a RequestError with rate limiting status code (exhausted retries)
155-
if isinstance(e, RequestError):
156-
http_code = (
157-
e.context.get("http-code")
158-
if hasattr(e, "context") and e.context
159-
else None
160-
)
157+
# NOT rate limiting (500 errors, network errors, timeouts, etc.)
158+
# Wrap in TelemetryNonRateLimitError so circuit breaker excludes it
159+
logger.debug(
160+
"Non-rate-limit telemetry error for host %s: %s, wrapping to exclude from circuit breaker",
161+
self._host,
162+
e,
163+
)
164+
raise TelemetryNonRateLimitError(e) from e
161165

162-
if http_code in [429, 503]:
163-
logger.warning(
164-
"Telemetry retries exhausted with status %d for host %s, triggering circuit breaker",
165-
http_code,
166-
self._host,
167-
)
168-
raise TelemetryRateLimitError(
169-
f"Telemetry rate limited after retries: {http_code}"
170-
)
171-
172-
# NOT rate limiting (500 errors, network errors, timeouts, etc.)
173-
# Return mock success response so circuit breaker does NOT see this as a failure
174-
logger.debug(
175-
"Non-rate-limit telemetry error for host %s: %s, failing silently",
176-
self._host,
177-
e,
178-
)
179-
return self._create_mock_success_response()
166+
def request(
167+
self,
168+
method: HttpMethod,
169+
url: str,
170+
headers: Optional[Dict[str, str]] = None,
171+
**kwargs,
172+
) -> BaseHTTPResponse:
173+
"""
174+
Make an HTTP request with circuit breaker protection.
180175
176+
Circuit breaker only opens for TelemetryRateLimitError (429/503 responses).
177+
Other errors are wrapped in TelemetryNonRateLimitError and excluded from circuit breaker.
178+
All exceptions propagate to caller (TelemetryClient callback handles them).
179+
"""
181180
try:
182181
# Use circuit breaker to protect the request
183-
# The inner function will raise TelemetryRateLimitError for 429/503
184-
# which the circuit breaker will count as a failure
185-
return self._circuit_breaker.call(_make_request_and_check_status)
186-
187-
except Exception as e:
188-
# All telemetry errors are consumed and return mock success
189-
# Log appropriate message based on exception type
190-
if isinstance(e, CircuitBreakerError):
191-
logger.debug(
192-
"Circuit breaker is open for host %s, dropping telemetry request",
193-
self._host,
194-
)
195-
elif isinstance(e, TelemetryRateLimitError):
196-
logger.debug(
197-
"Telemetry rate limited for host %s (already counted by circuit breaker): %s",
198-
self._host,
199-
e,
200-
)
201-
else:
202-
logger.debug(
203-
"Unexpected telemetry error for host %s: %s, failing silently",
204-
self._host,
205-
e,
206-
)
207-
208-
return self._create_mock_success_response()
182+
# TelemetryRateLimitError will trigger circuit breaker
183+
# TelemetryNonRateLimitError is excluded from circuit breaker
184+
return self._circuit_breaker.call(
185+
self._make_request_and_check_status,
186+
method,
187+
url,
188+
headers,
189+
**kwargs,
190+
)
191+
192+
except TelemetryNonRateLimitError as e:
193+
# Unwrap and re-raise original exception
194+
# Circuit breaker didn't count this, but caller should handle it
195+
logger.debug(
196+
"Non-rate-limit telemetry error for host %s, re-raising original: %s",
197+
self._host,
198+
e.original_exception,
199+
)
200+
raise e.original_exception from e
201+
# All other exceptions (TelemetryRateLimitError, CircuitBreakerError) propagate as-is

0 commit comments

Comments
 (0)