Merge branch 'release/2.026.21' into release/2.026.23

kobo-bot[bot] · kobo-bot[bot] · commit 228312859887 · 2026-06-12T12:26:53.000Z
diff --git a/dependencies/pip/dev_requirements.txt b/dependencies/pip/dev_requirements.txt
@@ -264,7 +264,7 @@ flake8-quotes==3.4.0
     # via -r dependencies/pip/dev_requirements.in
 flower==2.0.1
     # via -r dependencies/pip/requirements.in
-formpack @ git+https://github.com/kobotoolbox/formpack.git@0b20c04f957638df2caca3777419a933e53d9f4f#egg=formpack
+formpack @ git+https://github.com/kobotoolbox/formpack.git@7e92685993f76a14b7772363edd9882dc50da18f#egg=formpack
     # via -r dependencies/pip/requirements.in
 freezegun==1.5.5
     # via -r dependencies/pip/dev_requirements.in
diff --git a/dependencies/pip/requirements.in b/dependencies/pip/requirements.in
@@ -2,7 +2,7 @@
 # https://github.com/bndr/pipreqs is a handy utility, too.
 
 # formpack
-git+https://github.com/kobotoolbox/formpack.git@0b20c04f957638df2caca3777419a933e53d9f4f#egg=formpack
+git+https://github.com/kobotoolbox/formpack.git@7e92685993f76a14b7772363edd9882dc50da18f#egg=formpack
 
 # More up-to-date version of django-digest than PyPI seems to have.
 # Also, python-digest is an unlisted dependency thereof.
diff --git a/dependencies/pip/requirements.txt b/dependencies/pip/requirements.txt
@@ -210,7 +210,7 @@ fido2==2.2.0
     # via django-allauth
 flower==2.0.1
     # via -r dependencies/pip/requirements.in
-formpack @ git+https://github.com/kobotoolbox/formpack.git@0b20c04f957638df2caca3777419a933e53d9f4f#egg=formpack
+formpack @ git+https://github.com/kobotoolbox/formpack.git@7e92685993f76a14b7772363edd9882dc50da18f#egg=formpack
     # via -r dependencies/pip/requirements.in
 frozenlist==1.8.0
     # via
diff --git a/kobo/apps/languages/models/transcription.py b/kobo/apps/languages/models/transcription.py
@@ -28,11 +28,22 @@ def get_language_code(self, value: str) -> str:
                 service__code=self.code, region__code=value
             )
         except TranscriptionServiceLanguageM2M.DoesNotExist as err:
-            # Fall back on language itself and let the service detect the region.
+            # Check if value is itself a language code (e.g. 'en')
             if self.language_set.filter(code=value).exists():
                 return value
-            else:
-                raise LanguageNotSupported from err
+
+            # `value` may be a region code (e.g. 'fr-BE') that is not explicitly
+            # in the DB. Strip the region suffix to get the parent language code
+            # (e.g. 'fr') and check whether that base language is supported.
+            # If it is, return the original region code unchanged so that
+            # Google STT receives the correct regional hint.
+            parent_code = value.split('-')[0]
+            if parent_code != value and self.language_set.filter(
+                code=parent_code
+            ).exists():
+                return value
+
+            raise LanguageNotSupported from err
         else:
             return (
                 through_obj.mapping_code if through_obj.mapping_code else value
@@ -61,16 +72,25 @@ def get_configuration(self, value: str) -> TranscriptionServiceConfig:
         if through_obj:
             return self._build_config(through_obj)
 
-        if not self.language_set.filter(code=value).exists():
+        # `value` is a language code (e.g. 'fr') or a region-specific code
+        # (e.g. 'fr-BE'). If the exact code is not configured, fall back to the
+        # parent language ('fr') and use its stored configuration (model, location,
+        # etc.), so region-specific variants inherit their parent language settings
+        parent_code = value.split('-')[0]
+        lang_code = value if self.language_set.filter(code=value).exists() else (
+            parent_code if self.language_set.filter(code=parent_code).exists()
+            else None
+        )
+        if lang_code is None:
             raise LanguageNotSupported
 
-        candidates = list(queryset.filter(language__code=value))
+        candidates = list(queryset.filter(language__code=lang_code))
         if len(candidates) == 1:
             return self._build_config(candidates[0])
 
         if len(candidates) > 1:
             return self._build_config(
-                self._get_default_candidate(value, candidates)
+                self._get_default_candidate(lang_code, candidates)
             )
 
         raise LanguageNotSupported
diff --git a/kobo/apps/subsequences/integrations/google/base.py b/kobo/apps/subsequences/integrations/google/base.py
@@ -7,6 +7,7 @@
 import constance
 from django.conf import settings
 from django.core.cache import cache
+from google.api_core import client_options
 from google.api_core.operation import Operation
 from google.cloud import storage
 from googleapiclient import discovery
@@ -49,6 +50,9 @@ def __init__(self, submission: dict, asset: 'kpi.models.Asset', *args, **kwargs)
     def adapt_response(self, results: Any) -> str:
         pass
 
+    def get_client_options(self) -> Any:
+        return None
+
     @abstractmethod
     def begin_google_operation(
         self,
@@ -75,13 +79,8 @@ def handle_google_operation(
         # Fetch the latest update from Google API, but do not resend the same operation.
         cache_key = self._get_cache_key(xpath, source_lang, target_lang)
         if operation_name := cache.get(cache_key):
-            google_service = discovery.build(
-                self.API_NAME, self.API_VERSION, credentials=self.credentials
-            )
-            resource_path = self.API_RESOURCE.split('.')
-            for subresource in resource_path:
-                google_service = getattr(google_service, subresource)()
-            operation = google_service.get(name=operation_name).execute()
+            resource = self._get_discovery_resource()
+            operation = resource.get(name=operation_name).execute()
             if not (
                 operation.get('done') or operation.get('state') == 'SUCCEEDED'
             ):
@@ -117,15 +116,26 @@ def cancel_google_operation(self, operation_name: str) -> None:
         """
         Cancel a previously started Google long-running operation
         """
+        resource = self._get_discovery_resource()
+        resource.cancel(name=operation_name, body={}).execute()
+
+    def _get_discovery_resource(self):
+        opts = self.get_client_options()
+        if opts and opts.api_endpoint and not opts.api_endpoint.startswith('http'):
+            opts = client_options.ClientOptions(
+                api_endpoint=f'https://{opts.api_endpoint}'
+            )
+
         google_service = discovery.build(
             self.API_NAME,
             self.API_VERSION,
             credentials=self.credentials,
+            client_options=opts,
         )
         resource = google_service
         for subresource in self.API_RESOURCE.split('.'):
             resource = getattr(resource, subresource)()
-        resource.cancel(name=operation_name, body={}).execute()
+        return resource
 
     @abstractmethod
     def process_data(
diff --git a/kobo/apps/subsequences/integrations/google/google_transcribe.py b/kobo/apps/subsequences/integrations/google/google_transcribe.py
@@ -40,11 +40,17 @@
     TranscriptionResultNotFound
 )
 from .base import GoogleService
+from .locations import get_speech_location
 
 # https://cloud.google.com/speech-to-text/docs/quotas
 ASYNC_MAX_LENGTH = timedelta(minutes=479)
-DEFAULT_SPEECH_LOCATION = 'global'
-DEFAULT_SPEECH_MODEL = 'long'
+
+# Fallback STT model used when a language has no `model_code` set in the
+# `TranscriptionServiceLanguageM2M` database table. 'chirp_3' is chosen over
+# 'long' because it is available for every language in the 'us' and 'eu'
+# multi-region endpoints, and it supports all recognition features
+# (e.g. enable_automatic_punctuation)
+DEFAULT_SPEECH_MODEL = 'chirp_3'
 
 
 class GoogleTranscriptionService(GoogleService):
@@ -58,6 +64,7 @@ def __init__(self, submission: dict, asset: 'kpi.models.Asset', *args, **kwargs)
         class. It uses Google Cloud Speech-to-Text v2 batch API.
         """
         super().__init__(submission=submission, asset=asset, *args, **kwargs)
+        self.speech_location = get_speech_location()
 
     def adapt_response(self, response: Union[dict, list]) -> str:
         """
@@ -102,7 +109,6 @@ def begin_google_operation(
         target_lang: str,
         content: Any,
         *,
-        location_code: str | None = None,
         model_code: str | None = None,
     ) -> tuple[object, int]:
         """
@@ -115,27 +121,31 @@ def begin_google_operation(
                 'Audio file of duration %s is too long.' % duration
             )
 
-        speech_location = location_code or DEFAULT_SPEECH_LOCATION
         speech_model = model_code or DEFAULT_SPEECH_MODEL
-        speech_client = self._get_speech_client(speech_location)
+        speech_client = self._get_speech_client(self.speech_location)
         input_path, output_prefix = self._get_batch_paths(xpath, source_lang)
 
         logging.info(
             'Starting Google automatic transcription for '
             f'{self.submission_root_uuid=}, {xpath=}, {source_lang=}, '
-            f'{speech_location=}, {speech_model=}'
+            f'{self.speech_location=}, {speech_model=}'
         )
         self._cleanup_batch_files(xpath, source_lang)
         gcs_input_uri = self.store_file(flac_content, input_path)
 
         request = speech.BatchRecognizeRequest(
-            recognizer=self._get_recognizer_name(speech_location),
+            recognizer=self._get_recognizer_name(self.speech_location),
             config=speech.RecognitionConfig(
                 auto_decoding_config=speech.AutoDetectDecodingConfig(),
                 language_codes=[source_lang],
                 model=speech_model,
                 features=speech.RecognitionFeatures(
-                    enable_automatic_punctuation=True
+                    # chirp_3, chirp_2, and chirp support automatic punctuation
+                    # for all languages. 'long' does not support it for several
+                    # languages, including the 6 legacy African languages
+                    # (Kinyarwanda, Swati, Southern Sotho, Tswana, Tsonga, Venda);
+                    # requests that enable it with 'long' fail with a 400 error.
+                    enable_automatic_punctuation=(speech_model != 'long'),
                 ),
             ),
             files=[speech.BatchRecognizeFileMetadata(uri=gcs_input_uri)],
@@ -152,6 +162,11 @@ def begin_google_operation(
     def counter_name(self):
         return 'google_asr_seconds'
 
+    def get_client_options(self):
+        return client_options.ClientOptions(
+            api_endpoint=f'{self.speech_location}-speech.googleapis.com'
+        )
+
     def get_converted_audio(
         self, xpath: str, submission_uuid: int, user: object
     ) -> Union[bytes, tuple[bytes, timedelta]]:
@@ -227,7 +242,6 @@ def process_data(
                     source_lang=source_language,
                     target_lang=None,
                     content=converted_audio,
-                    location_code=language_config.location_code,
                     model_code=language_config.model_code,
                 )
             except AudioTooLongError as err:
@@ -306,7 +320,6 @@ def process_data(
             # read the batch result after Google reports completion
             operation_payload = self._get_operation_payload(
                 operation_name,
-                language_config.location_code,
             )
             if not operation_payload.get('done'):
                 raise SubsequenceTimeoutError
@@ -449,12 +462,12 @@ def _get_speech_client(self, location: str):
         """
         Create a Speech client bound to the configured regional endpoint
         """
-        client_kwargs = {'credentials': self.credentials}
-        if location != DEFAULT_SPEECH_LOCATION:
-            client_kwargs['client_options'] = client_options.ClientOptions(
+        return speech.SpeechClient(
+            credentials=self.credentials,
+            client_options=client_options.ClientOptions(
                 api_endpoint=f'{location}-speech.googleapis.com'
-            )
-        return speech.SpeechClient(**client_kwargs)
+            ),
+        )
 
     def _get_recognizer_name(self, location: str) -> str:
         """
@@ -468,14 +481,11 @@ def _get_recognizer_name(self, location: str) -> str:
     def _get_operation_payload(
         self,
         operation_name: str,
-        location_code: str | None = None,
     ) -> dict:
         """
         Poll the Google long-running operation backing the batch request.
         """
-        speech_client = self._get_speech_client(
-            location_code or DEFAULT_SPEECH_LOCATION
-        )
+        speech_client = self._get_speech_client(self.speech_location)
         operation = speech_client.transport.operations_client.get_operation(
             operation_name
         )
diff --git a/kobo/apps/subsequences/integrations/google/google_translate.py b/kobo/apps/subsequences/integrations/google/google_translate.py
@@ -9,6 +9,7 @@
 from django.apps import apps
 from django.conf import settings
 from django.core.cache import cache
+from google.api_core import client_options
 from google.api_core.exceptions import GoogleAPIError, InvalidArgument
 from google.cloud import translate_v3 as translate
 from google.cloud.exceptions import GoogleCloudError
@@ -22,6 +23,7 @@
 from ...exceptions import SubsequenceTimeoutError, TranslationResultNotFound
 from ..utils.google import google_credentials_from_constance_config
 from .base import GoogleService
+from .locations import get_translate_endpoint, get_translate_location
 
 
 class GoogleTranslationService(GoogleService):
@@ -41,16 +43,17 @@ def __init__(self, submission: dict, asset: 'kpi.models.Asset', *args, **kwargs)
         super().__init__(submission, asset, *args, **kwargs)
 
         self.translate_client = translate.TranslationServiceClient(
-            credentials=google_credentials_from_constance_config()
+            credentials=google_credentials_from_constance_config(),
+            client_options=client_options.ClientOptions(
+                api_endpoint=get_translate_endpoint()
+            ),
         )
+        translate_location = get_translate_location()
         self.translate_parent = (
-            f'projects/{constance.config.ASR_MT_GOOGLE_PROJECT_ID}'
-        )
-        # Google batch translation requires a concrete regional location
-        self.translate_async_parent = (
             f'projects/{constance.config.ASR_MT_GOOGLE_PROJECT_ID}/'
-            f'locations/{constance.config.ASR_MT_GOOGLE_TRANSLATION_LOCATION}'
+            f'locations/{translate_location}'
         )
+        self.translate_async_parent = self.translate_parent
         self.bucket_prefix = (
             constance.config.ASR_MT_GOOGLE_STORAGE_BUCKET_PREFIX
         )
@@ -71,6 +74,9 @@ def adapt_response(self, response: Any) -> str:
     def counter_name(self):
         return 'google_mt_characters'
 
+    def get_client_options(self):
+        return client_options.ClientOptions(api_endpoint=get_translate_endpoint())
+
     def begin_google_operation(
         self,
         xpath: str,
diff --git a/kobo/apps/subsequences/integrations/google/locations.py b/kobo/apps/subsequences/integrations/google/locations.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+import constance
+
+from kpi.utils.log import logging
+
+
+# The server-wide Google region is stored in Constance as either 'US' or 'EU'.
+# 'EU' must be used when EU data residency is required (all data must remain
+# within Europe). 'US' is the default for all other deployments
+GOOGLE_REGION_EU = 'EU'
+GOOGLE_REGION_US = 'US'
+GOOGLE_REGION_CHOICES = (GOOGLE_REGION_US, GOOGLE_REGION_EU)
+
+DEFAULT_GOOGLE_REGION = GOOGLE_REGION_US
+
+# 'us' and 'eu' are STT v2 multi-region endpoints with identical language
+# support and model availability
+SPEECH_LOCATION_BY_REGION = {
+    GOOGLE_REGION_EU: 'eu',
+    GOOGLE_REGION_US: 'us',
+}
+
+# Translation requests are routed through multi-region endpoints:
+# `translate-eu.googleapis.com` keeps TLS termination and processing within the EU
+# for EU data residency requirements, while `translate-us.googleapis.com` routes
+# requests through the US multi-region
+TRANSLATE_ENDPOINT_BY_REGION = {
+    GOOGLE_REGION_EU: 'translate-eu.googleapis.com',
+    GOOGLE_REGION_US: 'translate-us.googleapis.com',
+}
+
+TRANSLATE_LOCATION_BY_REGION = {
+    GOOGLE_REGION_EU: 'europe-west1',
+    GOOGLE_REGION_US: 'us-west1',
+}
+
+
+def get_google_region() -> str:
+    """
+    Return the configured ASR/MT Google processing region ('US' or 'EU')
+
+    Reads ASR_MT_GOOGLE_REGION from constance at call time, so an admin can
+    change the region without restarting the server. Tolerates lower-case input
+    and falls back to 'US' with a warning if an unrecognised value is set
+    """
+    region = str(constance.config.ASR_MT_GOOGLE_REGION).upper()
+    if region in GOOGLE_REGION_CHOICES:
+        return region
+
+    logging.warning(
+        'Invalid ASR_MT_GOOGLE_REGION=%s; defaulting to %s',
+        region,
+        DEFAULT_GOOGLE_REGION,
+    )
+    return DEFAULT_GOOGLE_REGION
+
+
+def get_speech_location() -> str:
+    return SPEECH_LOCATION_BY_REGION[get_google_region()]
+
+
+def get_translate_endpoint() -> str:
+    return TRANSLATE_ENDPOINT_BY_REGION[get_google_region()]
+
+
+def get_translate_location() -> str:
+    return TRANSLATE_LOCATION_BY_REGION[get_google_region()]
diff --git a/kobo/apps/subsequences/integrations/tests/test_google_translate.py b/kobo/apps/subsequences/integrations/tests/test_google_translate.py
diff --git a/kobo/apps/subsequences/migrations/0012_migrate_google_region_constance_key.py b/kobo/apps/subsequences/migrations/0012_migrate_google_region_constance_key.py
diff --git a/kobo/settings/base.py b/kobo/settings/base.py
diff --git a/kpi/models/import_export_task.py b/kpi/models/import_export_task.py