Skip to content

Commit 2283128

Browse files
author
kobo-bot[bot]
committed
Merge branch 'release/2.026.21' into release/2.026.23
2 parents 072086b + 59b6390 commit 2283128

12 files changed

Lines changed: 292 additions & 67 deletions

File tree

dependencies/pip/dev_requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -264,7 +264,7 @@ flake8-quotes==3.4.0
264264
# via -r dependencies/pip/dev_requirements.in
265265
flower==2.0.1
266266
# via -r dependencies/pip/requirements.in
267-
formpack @ git+https://github.com/kobotoolbox/formpack.git@0b20c04f957638df2caca3777419a933e53d9f4f#egg=formpack
267+
formpack @ git+https://github.com/kobotoolbox/formpack.git@7e92685993f76a14b7772363edd9882dc50da18f#egg=formpack
268268
# via -r dependencies/pip/requirements.in
269269
freezegun==1.5.5
270270
# via -r dependencies/pip/dev_requirements.in

dependencies/pip/requirements.in

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
# https://github.com/bndr/pipreqs is a handy utility, too.
33

44
# formpack
5-
git+https://github.com/kobotoolbox/formpack.git@0b20c04f957638df2caca3777419a933e53d9f4f#egg=formpack
5+
git+https://github.com/kobotoolbox/formpack.git@7e92685993f76a14b7772363edd9882dc50da18f#egg=formpack
66

77
# More up-to-date version of django-digest than PyPI seems to have.
88
# Also, python-digest is an unlisted dependency thereof.

dependencies/pip/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -210,7 +210,7 @@ fido2==2.2.0
210210
# via django-allauth
211211
flower==2.0.1
212212
# via -r dependencies/pip/requirements.in
213-
formpack @ git+https://github.com/kobotoolbox/formpack.git@0b20c04f957638df2caca3777419a933e53d9f4f#egg=formpack
213+
formpack @ git+https://github.com/kobotoolbox/formpack.git@7e92685993f76a14b7772363edd9882dc50da18f#egg=formpack
214214
# via -r dependencies/pip/requirements.in
215215
frozenlist==1.8.0
216216
# via

kobo/apps/languages/models/transcription.py

Lines changed: 26 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -28,11 +28,22 @@ def get_language_code(self, value: str) -> str:
2828
service__code=self.code, region__code=value
2929
)
3030
except TranscriptionServiceLanguageM2M.DoesNotExist as err:
31-
# Fall back on language itself and let the service detect the region.
31+
# Check if value is itself a language code (e.g. 'en')
3232
if self.language_set.filter(code=value).exists():
3333
return value
34-
else:
35-
raise LanguageNotSupported from err
34+
35+
# `value` is a region code (e.g. 'fr-BE') not explicitly in
36+
# the DB. Strip the region suffix to get the parent language code
37+
# (e.g. 'fr') and check whether that base language is supported.
38+
# If it is, return the original region code unchanged so that
39+
# Google STT receives the correct regional hint
40+
parent_code = value.split('-')[0]
41+
if parent_code != value and self.language_set.filter(
42+
code=parent_code
43+
).exists():
44+
return value
45+
46+
raise LanguageNotSupported from err
3647
else:
3748
return (
3849
through_obj.mapping_code if through_obj.mapping_code else value
@@ -61,16 +72,25 @@ def get_configuration(self, value: str) -> TranscriptionServiceConfig:
6172
if through_obj:
6273
return self._build_config(through_obj)
6374

64-
if not self.language_set.filter(code=value).exists():
75+
# `value` is a language code (e.g. 'fr') or a region-specific code
76+
# (e.g. 'fr-BE'). If the exact code is not configured, fall back to the
77+
# parent language ('fr') and use its stored configuration (model, location,
78+
# etc.), so region-specific variants inherit their parent language settings
79+
parent_code = value.split('-')[0]
80+
lang_code = value if self.language_set.filter(code=value).exists() else (
81+
parent_code if self.language_set.filter(code=parent_code).exists()
82+
else None
83+
)
84+
if lang_code is None:
6585
raise LanguageNotSupported
6686

67-
candidates = list(queryset.filter(language__code=value))
87+
candidates = list(queryset.filter(language__code=lang_code))
6888
if len(candidates) == 1:
6989
return self._build_config(candidates[0])
7090

7191
if len(candidates) > 1:
7292
return self._build_config(
73-
self._get_default_candidate(value, candidates)
93+
self._get_default_candidate(lang_code, candidates)
7494
)
7595

7696
raise LanguageNotSupported

kobo/apps/subsequences/integrations/google/base.py

Lines changed: 18 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import constance
88
from django.conf import settings
99
from django.core.cache import cache
10+
from google.api_core import client_options
1011
from google.api_core.operation import Operation
1112
from google.cloud import storage
1213
from googleapiclient import discovery
@@ -49,6 +50,9 @@ def __init__(self, submission: dict, asset: 'kpi.models.Asset', *args, **kwargs)
4950
def adapt_response(self, results: Any) -> str:
5051
pass
5152

53+
def get_client_options(self) -> Any:
54+
return None
55+
5256
@abstractmethod
5357
def begin_google_operation(
5458
self,
@@ -75,13 +79,8 @@ def handle_google_operation(
7579
# Fetch the latest update from Google API, but do not resend the same operation.
7680
cache_key = self._get_cache_key(xpath, source_lang, target_lang)
7781
if operation_name := cache.get(cache_key):
78-
google_service = discovery.build(
79-
self.API_NAME, self.API_VERSION, credentials=self.credentials
80-
)
81-
resource_path = self.API_RESOURCE.split('.')
82-
for subresource in resource_path:
83-
google_service = getattr(google_service, subresource)()
84-
operation = google_service.get(name=operation_name).execute()
82+
resource = self._get_discovery_resource()
83+
operation = resource.get(name=operation_name).execute()
8584
if not (
8685
operation.get('done') or operation.get('state') == 'SUCCEEDED'
8786
):
@@ -117,15 +116,26 @@ def cancel_google_operation(self, operation_name: str) -> None:
117116
"""
118117
Cancel a previously started Google long-running operation
119118
"""
119+
resource = self._get_discovery_resource()
120+
resource.cancel(name=operation_name, body={}).execute()
121+
122+
def _get_discovery_resource(self):
123+
opts = self.get_client_options()
124+
if opts and opts.api_endpoint and not opts.api_endpoint.startswith('http'):
125+
opts = client_options.ClientOptions(
126+
api_endpoint=f'https://{opts.api_endpoint}'
127+
)
128+
120129
google_service = discovery.build(
121130
self.API_NAME,
122131
self.API_VERSION,
123132
credentials=self.credentials,
133+
client_options=opts,
124134
)
125135
resource = google_service
126136
for subresource in self.API_RESOURCE.split('.'):
127137
resource = getattr(resource, subresource)()
128-
resource.cancel(name=operation_name, body={}).execute()
138+
return resource
129139

130140
@abstractmethod
131141
def process_data(

kobo/apps/subsequences/integrations/google/google_transcribe.py

Lines changed: 29 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -40,11 +40,17 @@
4040
TranscriptionResultNotFound
4141
)
4242
from .base import GoogleService
43+
from .locations import get_speech_location
4344

4445
# https://cloud.google.com/speech-to-text/docs/quotas
4546
ASYNC_MAX_LENGTH = timedelta(minutes=479)
46-
DEFAULT_SPEECH_LOCATION = 'global'
47-
DEFAULT_SPEECH_MODEL = 'long'
47+
48+
# Fallback STT model used when a language has no `model_code` set in the
49+
# `TranscriptionServiceLanguageM2M` database table. 'chirp_3' is chosen over
50+
# 'long' because it is available for every language in the 'us' and 'eu'
51+
# multi-region endpoints, and it supports all recognition features
52+
# (e.g. enable_automatic_punctuation)
53+
DEFAULT_SPEECH_MODEL = 'chirp_3'
4854

4955

5056
class GoogleTranscriptionService(GoogleService):
@@ -58,6 +64,7 @@ def __init__(self, submission: dict, asset: 'kpi.models.Asset', *args, **kwargs)
5864
class. It uses Google Cloud Speech-to-Text v2 batch API.
5965
"""
6066
super().__init__(submission=submission, asset=asset, *args, **kwargs)
67+
self.speech_location = get_speech_location()
6168

6269
def adapt_response(self, response: Union[dict, list]) -> str:
6370
"""
@@ -102,7 +109,6 @@ def begin_google_operation(
102109
target_lang: str,
103110
content: Any,
104111
*,
105-
location_code: str | None = None,
106112
model_code: str | None = None,
107113
) -> tuple[object, int]:
108114
"""
@@ -115,27 +121,31 @@ def begin_google_operation(
115121
'Audio file of duration %s is too long.' % duration
116122
)
117123

118-
speech_location = location_code or DEFAULT_SPEECH_LOCATION
119124
speech_model = model_code or DEFAULT_SPEECH_MODEL
120-
speech_client = self._get_speech_client(speech_location)
125+
speech_client = self._get_speech_client(self.speech_location)
121126
input_path, output_prefix = self._get_batch_paths(xpath, source_lang)
122127

123128
logging.info(
124129
'Starting Google automatic transcription for '
125130
f'{self.submission_root_uuid=}, {xpath=}, {source_lang=}, '
126-
f'{speech_location=}, {speech_model=}'
131+
f'{self.speech_location=}, {speech_model=}'
127132
)
128133
self._cleanup_batch_files(xpath, source_lang)
129134
gcs_input_uri = self.store_file(flac_content, input_path)
130135

131136
request = speech.BatchRecognizeRequest(
132-
recognizer=self._get_recognizer_name(speech_location),
137+
recognizer=self._get_recognizer_name(self.speech_location),
133138
config=speech.RecognitionConfig(
134139
auto_decoding_config=speech.AutoDetectDecodingConfig(),
135140
language_codes=[source_lang],
136141
model=speech_model,
137142
features=speech.RecognitionFeatures(
138-
enable_automatic_punctuation=True
143+
# chirp_3, chirp_2, and chirp support automatic punctuation
144+
# for all languages. 'long' does not support it for several
145+
# languages, including the 6 legacy African languages
146+
# (Kinyarwanda, Swati, Southern Sotho, Tswana, Tsonga, Venda),
147+
# and will return a 400 error if enabled
148+
enable_automatic_punctuation=(speech_model != 'long'),
139149
),
140150
),
141151
files=[speech.BatchRecognizeFileMetadata(uri=gcs_input_uri)],
@@ -152,6 +162,11 @@ def begin_google_operation(
152162
def counter_name(self):
153163
return 'google_asr_seconds'
154164

165+
def get_client_options(self):
166+
return client_options.ClientOptions(
167+
api_endpoint=f'{self.speech_location}-speech.googleapis.com'
168+
)
169+
155170
def get_converted_audio(
156171
self, xpath: str, submission_uuid: int, user: object
157172
) -> Union[bytes, tuple[bytes, timedelta]]:
@@ -227,7 +242,6 @@ def process_data(
227242
source_lang=source_language,
228243
target_lang=None,
229244
content=converted_audio,
230-
location_code=language_config.location_code,
231245
model_code=language_config.model_code,
232246
)
233247
except AudioTooLongError as err:
@@ -306,7 +320,6 @@ def process_data(
306320
# read the batch result after Google reports completion
307321
operation_payload = self._get_operation_payload(
308322
operation_name,
309-
language_config.location_code,
310323
)
311324
if not operation_payload.get('done'):
312325
raise SubsequenceTimeoutError
@@ -449,12 +462,12 @@ def _get_speech_client(self, location: str):
449462
"""
450463
Create a Speech client bound to the configured regional endpoint
451464
"""
452-
client_kwargs = {'credentials': self.credentials}
453-
if location != DEFAULT_SPEECH_LOCATION:
454-
client_kwargs['client_options'] = client_options.ClientOptions(
465+
return speech.SpeechClient(
466+
credentials=self.credentials,
467+
client_options=client_options.ClientOptions(
455468
api_endpoint=f'{location}-speech.googleapis.com'
456-
)
457-
return speech.SpeechClient(**client_kwargs)
469+
),
470+
)
458471

459472
def _get_recognizer_name(self, location: str) -> str:
460473
"""
@@ -468,14 +481,11 @@ def _get_recognizer_name(self, location: str) -> str:
468481
def _get_operation_payload(
469482
self,
470483
operation_name: str,
471-
location_code: str | None = None,
472484
) -> dict:
473485
"""
474486
Poll the Google long-running operation backing the batch request.
475487
"""
476-
speech_client = self._get_speech_client(
477-
location_code or DEFAULT_SPEECH_LOCATION
478-
)
488+
speech_client = self._get_speech_client(self.speech_location)
479489
operation = speech_client.transport.operations_client.get_operation(
480490
operation_name
481491
)

kobo/apps/subsequences/integrations/google/google_translate.py

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
from django.apps import apps
1010
from django.conf import settings
1111
from django.core.cache import cache
12+
from google.api_core import client_options
1213
from google.api_core.exceptions import GoogleAPIError, InvalidArgument
1314
from google.cloud import translate_v3 as translate
1415
from google.cloud.exceptions import GoogleCloudError
@@ -22,6 +23,7 @@
2223
from ...exceptions import SubsequenceTimeoutError, TranslationResultNotFound
2324
from ..utils.google import google_credentials_from_constance_config
2425
from .base import GoogleService
26+
from .locations import get_translate_endpoint, get_translate_location
2527

2628

2729
class GoogleTranslationService(GoogleService):
@@ -41,16 +43,17 @@ def __init__(self, submission: dict, asset: 'kpi.models.Asset', *args, **kwargs)
4143
super().__init__(submission, asset, *args, **kwargs)
4244

4345
self.translate_client = translate.TranslationServiceClient(
44-
credentials=google_credentials_from_constance_config()
46+
credentials=google_credentials_from_constance_config(),
47+
client_options=client_options.ClientOptions(
48+
api_endpoint=get_translate_endpoint()
49+
),
4550
)
51+
translate_location = get_translate_location()
4652
self.translate_parent = (
47-
f'projects/{constance.config.ASR_MT_GOOGLE_PROJECT_ID}'
48-
)
49-
# Google batch translation requires a concrete regional location
50-
self.translate_async_parent = (
5153
f'projects/{constance.config.ASR_MT_GOOGLE_PROJECT_ID}/'
52-
f'locations/{constance.config.ASR_MT_GOOGLE_TRANSLATION_LOCATION}'
54+
f'locations/{translate_location}'
5355
)
56+
self.translate_async_parent = self.translate_parent
5457
self.bucket_prefix = (
5558
constance.config.ASR_MT_GOOGLE_STORAGE_BUCKET_PREFIX
5659
)
@@ -71,6 +74,9 @@ def adapt_response(self, response: Any) -> str:
7174
def counter_name(self):
7275
return 'google_mt_characters'
7376

77+
def get_client_options(self):
78+
return client_options.ClientOptions(api_endpoint=get_translate_endpoint())
79+
7480
def begin_google_operation(
7581
self,
7682
xpath: str,
kobo/apps/subsequences/integrations/google/locations.py

Lines changed: 68 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,68 @@
1+
from __future__ import annotations
2+
3+
import constance
4+
5+
from kpi.utils.log import logging
6+
7+
8+
# The server-wide Google region is stored in Constance as either 'US' or 'EU'.
9+
# 'EU' must be used when EU data residency is required (all data must remain
10+
# within Europe). 'US' is the default for all other deployments
11+
GOOGLE_REGION_EU = 'EU'
12+
GOOGLE_REGION_US = 'US'
13+
GOOGLE_REGION_CHOICES = (GOOGLE_REGION_US, GOOGLE_REGION_EU)
14+
15+
DEFAULT_GOOGLE_REGION = GOOGLE_REGION_US
16+
17+
# 'us' and 'eu' are STT v2 multi-region endpoints with identical language
18+
# support and model availability
19+
SPEECH_LOCATION_BY_REGION = {
20+
GOOGLE_REGION_EU: 'eu',
21+
GOOGLE_REGION_US: 'us',
22+
}
23+
24+
# Translation requests are routed through multi-region endpoints:
25+
# `translate-eu.googleapis.com` keeps TLS termination and processing within the EU
26+
# for EU data residency requirements, while `translate-us.googleapis.com` routes
27+
# requests through the US multi-region
28+
TRANSLATE_ENDPOINT_BY_REGION = {
29+
GOOGLE_REGION_EU: 'translate-eu.googleapis.com',
30+
GOOGLE_REGION_US: 'translate-us.googleapis.com',
31+
}
32+
33+
TRANSLATE_LOCATION_BY_REGION = {
34+
GOOGLE_REGION_EU: 'europe-west1',
35+
GOOGLE_REGION_US: 'us-west1',
36+
}
37+
38+
39+
def get_google_region() -> str:
40+
"""
41+
Return the configured ASR/MT Google processing region ('US' or 'EU')
42+
43+
Reads ASR_MT_GOOGLE_REGION from constance at call time, so an admin can
44+
change the region without restarting the server. Tolerates lower-case input
45+
and falls back to 'US' with a warning if an unrecognised value is set
46+
"""
47+
region = str(constance.config.ASR_MT_GOOGLE_REGION).upper()
48+
if region in GOOGLE_REGION_CHOICES:
49+
return region
50+
51+
logging.warning(
52+
'Invalid ASR_MT_GOOGLE_REGION=%s; defaulting to %s',
53+
region,
54+
DEFAULT_GOOGLE_REGION,
55+
)
56+
return DEFAULT_GOOGLE_REGION
57+
58+
59+
def get_speech_location() -> str:
60+
return SPEECH_LOCATION_BY_REGION[get_google_region()]
61+
62+
63+
def get_translate_endpoint() -> str:
64+
return TRANSLATE_ENDPOINT_BY_REGION[get_google_region()]
65+
66+
67+
def get_translate_location() -> str:
68+
return TRANSLATE_LOCATION_BY_REGION[get_google_region()]

0 commit comments

Comments
 (0)