From d8cb55d85d38181499e1cf6d409e69f3d81d1ba7 Mon Sep 17 00:00:00 2001 From: Tyler North Date: Mon, 21 Jul 2025 20:05:13 -0700 Subject: [PATCH 1/4] fix: add support for non-latin1 characters in wsgi module --- .../instrumentation/wsgi/__init__.py | 16 +++++++++++++++- .../tests/test_wsgi_middleware.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py index ecbc256287..1f93ac156e 100644 --- a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py @@ -219,6 +219,7 @@ def response_hook(span: Span, environ: WSGIEnvironment, status: str, response_he import functools import wsgiref.util as wsgiref_util +from urllib.parse import quote from timeit import default_timer from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, TypeVar, cast @@ -371,7 +372,20 @@ def collect_request_attributes( else: # old semconv v1.20.0 if _report_old(sem_conv_opt_in_mode): - result[HTTP_URL] = redact_url(wsgiref_util.request_uri(environ)) + try: + result[HTTP_URL] = redact_url(wsgiref_util.request_uri(environ)) + except UnicodeEncodeError: + # The underlying wsgiref library seems to hardcode latin1 into this call + # This can cause issues for some characters and you can hit decode errors + path_info = quote(environ.get("PATH_INFO", ""), safe="/;=,", encoding="utf-8", errors="replace") + scheme = environ.get("wsgi.url_scheme", "http") + host = environ.get("HTTP_HOST", environ.get("SERVER_NAME", "localhost")) + url = f"{scheme}://{host}{path_info}" + + if environ.get("QUERY_STRING"): + url += f"?{environ['QUERY_STRING']}" + + result[HTTP_URL] = redact_url(url) remote_addr = environ.get("REMOTE_ADDR") if remote_addr: diff --git a/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py b/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py index 5a6e2d21f7..108cdf95f6 100644 --- a/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py +++ b/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py @@ -831,6 +831,21 @@ def test_remove_sensitive_params(self): expected.items(), ) + def test_unicode_path_info_is_utf8_encoded(self): + self.environ["HTTP_HOST"] = "mock" + self.environ["PATH_INFO"] = "/заказ" + self.environ["QUERY_STRING"] = "foo=bar" + + expected = { + HTTP_URL: "http://mock/%D0%B7%D0%B0%D0%BA%D0%B0%D0%B7?foo=bar", + NET_HOST_PORT: 80, + } + + self.assertGreaterEqual( + otel_wsgi.collect_request_attributes(self.environ).items(), + expected.items(), + ) + class TestWsgiMiddlewareWithTracerProvider(WsgiTestBase): def validate_response( From 96c8d16495eae8af822d5b0489a74b2641c15f73 Mon Sep 17 00:00:00 2001 From: Tyler North Date: Mon, 21 Jul 2025 20:12:04 -0700 Subject: [PATCH 2/4] fix: pre-commit run --- .../instrumentation/wsgi/__init__.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py index 1f93ac156e..35ef7e1aad 100644 --- a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py @@ -219,9 +219,9 @@ def response_hook(span: Span, environ: WSGIEnvironment, status: str, response_he import functools import wsgiref.util as wsgiref_util -from urllib.parse import quote from timeit import default_timer from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, TypeVar, cast +from urllib.parse import quote from opentelemetry import context, trace from opentelemetry.instrumentation._semconv import ( @@ -373,13 +373,22 @@ def collect_request_attributes( # old semconv v1.20.0 if _report_old(sem_conv_opt_in_mode): try: - result[HTTP_URL] = redact_url(wsgiref_util.request_uri(environ)) + result[HTTP_URL] = redact_url( + wsgiref_util.request_uri(environ) + ) except UnicodeEncodeError: # The underlying wsgiref library seems to hardcode latin1 into this call # This can cause issues for some characters and you can hit decode errors - path_info = quote(environ.get("PATH_INFO", ""), safe="/;=,", encoding="utf-8", errors="replace") + path_info = quote( + environ.get("PATH_INFO", ""), + safe="/;=,", + encoding="utf-8", + errors="replace", + ) scheme = environ.get("wsgi.url_scheme", "http") - host = environ.get("HTTP_HOST", environ.get("SERVER_NAME", "localhost")) + host = environ.get( + "HTTP_HOST", environ.get("SERVER_NAME", "localhost") + ) url = f"{scheme}://{host}{path_info}" if environ.get("QUERY_STRING"): From b0a5c0af00a74fa63ee5d3bba0233f8e6a1faed8 Mon Sep 17 00:00:00 2001 From: Tyler North Date: Tue, 22 Jul 2025 21:02:59 -0700 Subject: [PATCH 3/4] fix: try to mock what underlying libraries are doing --- .../instrumentation/wsgi/__init__.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py index 35ef7e1aad..8c66e05f43 100644 --- a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py @@ -372,29 +372,28 @@ def collect_request_attributes( else: # old semconv v1.20.0 if _report_old(sem_conv_opt_in_mode): + path_info = environ.get("PATH_INFO", "") + print("Path info", path_info) try: result[HTTP_URL] = redact_url( wsgiref_util.request_uri(environ) ) - except UnicodeEncodeError: - # The underlying wsgiref library seems to hardcode latin1 into this call - # This can cause issues for some characters and you can hit decode errors - path_info = quote( - environ.get("PATH_INFO", ""), - safe="/;=,", - encoding="utf-8", - errors="replace", + except UnicodeEncodeError as e: + url = wsgiref_util.application_uri(environ) + path = environ.get("PATH_INFO", "") + # Taken from repercent_broken_unicode function in django/utils/encoding + repercent = quote( + path[e.start : e.end], safe=b"/#%[]=:;$&()+,!?*@'~" ) - scheme = environ.get("wsgi.url_scheme", "http") - host = environ.get( - "HTTP_HOST", environ.get("SERVER_NAME", "localhost") - ) - url = f"{scheme}://{host}{path_info}" - + path = path[: e.start] + repercent.encode().decode() + # Most of this taken directly from original wsgiref library https://github.com/python/cpython/blob/bbe589f93ccaf32eb95fd9d1f8f3dc9a536e8db1/Lib/wsgiref/util.py#L61 + if not environ.get("SCRIPT_NAME"): + url += path[1:] + else: + url += path if environ.get("QUERY_STRING"): - url += f"?{environ['QUERY_STRING']}" - - result[HTTP_URL] = redact_url(url) + url += "?" + environ["QUERY_STRING"] + result[HTTP_URL] = url remote_addr = environ.get("REMOTE_ADDR") if remote_addr: From b60a0f299d67ca58dbe036f06e665d2b0f8aedc6 Mon Sep 17 00:00:00 2001 From: Tyler North Date: Wed, 23 Jul 2025 08:06:38 -0700 Subject: [PATCH 4/4] fix: remove random print --- .../src/opentelemetry/instrumentation/wsgi/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py index 8c66e05f43..7c16ff2b9b 100644 --- a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py @@ -373,7 +373,6 @@ def collect_request_attributes( # old semconv v1.20.0 if _report_old(sem_conv_opt_in_mode): path_info = environ.get("PATH_INFO", "") - print("Path info", path_info) try: result[HTTP_URL] = redact_url( wsgiref_util.request_uri(environ)