Skip to content
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 184 additions & 8 deletions src/rev_ai/apiclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
"""Speech recognition tools for using Rev AI"""

import json
from .models import Account, CaptionType, Job, Transcript
from .baseclient import BaseClient

from . import utils
from .baseclient import BaseClient
from .models import Account, CaptionType, Job, Transcript
from .models.asynchronous.summarization_options import SummarizationOptions
from .models.asynchronous.summary import Summary
from .models.asynchronous.translation_options import TranslationOptions

try:
from urllib.parse import urljoin
Expand Down Expand Up @@ -66,7 +70,9 @@ def submit_job_url(
notification_config=None,
skip_postprocessing=False,
remove_atmospherics=False,
speakers_count=None):
speakers_count=None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like we are not passing these parameters to _create_job_options_payload:

remove_atmospherics=False,
speakers_count=None,

@amikofalvy do you know whether it is intentional?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dmtrrk We just haven't supported them in SDKs yet - there's an open PR for it -#104

summarization_config: SummarizationOptions = None,
translation_config: TranslationOptions = None):
"""Submit media given a URL for transcription.
The audio data is downloaded from the URL
:param media_url: web location of the media file
Expand Down Expand Up @@ -116,6 +122,8 @@ def submit_job_url(
:param remove_atmospherics: Atmospherics such as <laugh>, <affirmative>, etc. will not
appear in the transcript.
:param speakers_count: Use to specify the total number of unique speakers in the audio.
:param summarization_config: Use to request transcript summary.
:param translation_config: Use to request transcript translation.
:returns: raw response data
:raises: HTTPError
"""
Expand All @@ -128,7 +136,9 @@ def submit_job_url(
verbatim, rush, test_mode,
segments_to_transcribe, speaker_names,
source_config, notification_config,
skip_postprocessing)
skip_postprocessing,
summarization_config=summarization_config,
translation_config=translation_config)

response = self._make_http_request(
"POST",
Expand Down Expand Up @@ -161,7 +171,9 @@ def submit_job_local_file(
notification_config=None,
skip_postprocessing=False,
remove_atmospherics=False,
speakers_count=None):
speakers_count=None,
summarization_config: SummarizationOptions = None,
translation_config: TranslationOptions = None):
"""Submit a local file for transcription.
Note that the content type is inferred if not provided.

Expand Down Expand Up @@ -208,6 +220,8 @@ def submit_job_local_file(
:param remove_atmospherics: Atmospherics such as <laugh>, <affirmative>, etc. will not
appear in the transcript.
:param speakers_count: Use to specify the total number of unique speakers in the audio.
:param summarization_config: Use to request transcript summary.
:param translation_config: Use to request transcript translation.
:returns: raw response data
:raises: HTTPError, ValueError
"""
Expand All @@ -222,7 +236,9 @@ def submit_job_local_file(
language, custom_vocabulary_id, transcriber,
verbatim, rush, test_mode,
segments_to_transcribe, speaker_names, None,
notification_config, skip_postprocessing)
notification_config, skip_postprocessing,
summarization_config=summarization_config,
translation_config=translation_config)

with open(filename, 'rb') as f:
files = {
Expand Down Expand Up @@ -451,6 +467,160 @@ def get_account(self):

return Account.from_json(response.json())

def get_transcript_summary_text(self, id_):
"""Get the transcript summary of a specific job as plain text.

:param id_: id of job to be requested
:returns: transcript data as text
:raises: HTTPError
"""
if not id_:
raise ValueError('id_ must be provided')

response = self._make_http_request(
"GET",
urljoin(self.base_url, 'jobs/{}/transcript/summary'.format(id_)),
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any way to extract 'jobs/{}/transcript/summary' to be reusable?

headers={'Accept': 'text/plain'}
)
return response.text

def get_transcript_summary_json(self, id_):
    """Fetch the transcript summary of a job and wrap it in a model.

    The summary endpoint is queried with an ``application/json`` Accept
    header and the decoded body is handed to ``Summary.from_json``.

    :param id_: id of the job whose summary is requested
    :returns: the object built by ``Summary.from_json`` from the response body
    :raises: HTTPError, ValueError if id_ is empty
    """
    if not id_:
        raise ValueError('id_ must be provided')

    summary_url = urljoin(self.base_url, 'jobs/{}/transcript/summary'.format(id_))
    accept_json = {'Accept': 'application/json'}
    reply = self._make_http_request("GET", summary_url, headers=accept_json)

    return Summary.from_json(reply.json())

def get_transcript_summary_json_as_stream(self, id_):
    """Request the transcript summary of a job as json, without reading the body.

    The request is issued with ``stream=True`` and the response object is
    handed back untouched so the caller can consume the payload itself.

    :param id_: id of the job whose summary is requested
    :returns: requests.models.Response HTTP response which can be used to stream
              the payload of the response
    :raises: HTTPError, ValueError if id_ is empty
    """
    if not id_:
        raise ValueError('id_ must be provided')

    summary_url = urljoin(self.base_url, 'jobs/{}/transcript/summary'.format(id_))
    return self._make_http_request(
        "GET",
        summary_url,
        headers={'Accept': 'application/json'},
        stream=True
    )

def get_translated_transcript_text(self, id_, language):
    """Get the translated transcript of a specific job as plain text.

    :param id_: id of job to be requested
    :param language: language of the requested translation; it is inserted
                     into the request path as given
    :returns: translated transcript as text (the response body)
    :raises: HTTPError, ValueError if id_ or language is missing
    """
    if not id_:
        raise ValueError('id_ must be provided')
    # Without this guard a missing language would be formatted into the path
    # (e.g. as the literal "None") and the mistake would not surface until
    # the request is sent.
    if not language:
        raise ValueError('language must be provided')

    response = self._make_http_request(
        "GET",
        urljoin(self.base_url, 'jobs/{}/transcript/translation/{}'.format(id_, language)),
        headers={'Accept': 'text/plain'}
    )

    return response.text

def get_translated_transcript_text_as_stream(self, id_, language):
    """Get the translated transcript of a specific job as a plain text stream.

    :param id_: id of job to be requested
    :param language: language of the requested translation; it is inserted
                     into the request path as given
    :returns: requests.models.Response HTTP response which can be used to stream
              the payload of the response
    :raises: HTTPError, ValueError if id_ or language is missing
    """
    if not id_:
        raise ValueError('id_ must be provided')
    # Reject a missing language locally instead of sending a request whose
    # path ends in "None" or in a dangling slash.
    if not language:
        raise ValueError('language must be provided')

    response = self._make_http_request(
        "GET",
        urljoin(self.base_url, 'jobs/{}/transcript/translation/{}'.format(id_, language)),
        headers={'Accept': 'text/plain'},
        stream=True
    )

    return response

def get_translated_transcript_json(self, id_, language):
    """Get the translated transcript of a specific job as json.

    :param id_: id of job to be requested
    :param language: language of the requested translation; it is inserted
                     into the request path as given
    :returns: the decoded json body of the response
    :raises: HTTPError, ValueError if id_ or language is missing
    """
    if not id_:
        raise ValueError('id_ must be provided')
    # Reject a missing language locally instead of sending a request whose
    # path ends in "None" or in a dangling slash.
    if not language:
        raise ValueError('language must be provided')

    response = self._make_http_request(
        "GET",
        urljoin(self.base_url, 'jobs/{}/transcript/translation/{}'.format(id_, language)),
        headers={'Accept': self.rev_json_content_type}
    )

    return response.json()

def get_translated_transcript_json_as_stream(self, id_, language):
    """Get the translated transcript of a specific job as streamed json.

    :param id_: id of job to be requested
    :param language: language of the requested translation; it is inserted
                     into the request path as given
    :returns: requests.models.Response HTTP response which can be used to stream
              the payload of the response
    :raises: HTTPError, ValueError if id_ or language is missing
    """
    if not id_:
        raise ValueError('id_ must be provided')
    # Reject a missing language locally instead of sending a request whose
    # path ends in "None" or in a dangling slash.
    if not language:
        raise ValueError('language must be provided')

    response = self._make_http_request(
        "GET",
        urljoin(self.base_url, 'jobs/{}/transcript/translation/{}'.format(id_, language)),
        headers={'Accept': self.rev_json_content_type},
        stream=True
    )

    return response

def get_translated_transcript_object(self, id_, language):
    """Get the translated transcript of a specific job as a python object.

    :param id_: id of job to be requested
    :param language: language of the requested translation; it is inserted
                     into the request path as given
    :returns: the object built by ``Transcript.from_json`` from the
              decoded json body of the response
    :raises: HTTPError, ValueError if id_ or language is missing
    """
    if not id_:
        raise ValueError('id_ must be provided')
    # Reject a missing language locally instead of sending a request whose
    # path ends in "None" or in a dangling slash.
    if not language:
        raise ValueError('language must be provided')

    response = self._make_http_request(
        "GET",
        urljoin(self.base_url, 'jobs/{}/transcript/translation/{}'.format(id_, language)),
        headers={'Accept': self.rev_json_content_type}
    )

    return Transcript.from_json(response.json())

def _create_job_options_payload(
self,
media_url=None,
Expand All @@ -475,7 +645,9 @@ def _create_job_options_payload(
notification_config=None,
skip_postprocessing=False,
remove_atmospherics=None,
speakers_count=None):
speakers_count=None,
summarization_config: SummarizationOptions = None,
translation_config: TranslationOptions = None):
payload = {}
if media_url:
payload['media_url'] = media_url
Expand Down Expand Up @@ -512,7 +684,7 @@ def _create_job_options_payload(
if segments_to_transcribe:
payload['segments_to_transcribe'] = segments_to_transcribe
if speaker_names:
payload['speaker_names'] =\
payload['speaker_names'] = \
utils._process_speaker_names(speaker_names)
if source_config:
payload['source_config'] = source_config.to_dict()
Expand All @@ -524,6 +696,10 @@ def _create_job_options_payload(
payload['remove_atmospherics'] = remove_atmospherics
if speakers_count:
payload['speakers_count'] = speakers_count
if summarization_config:
payload['summarization_config'] = summarization_config.to_dict()
if translation_config:
payload['translation_config'] = translation_config.to_dict()
return payload

def _create_captions_query(self, speaker_channel):
Expand Down
13 changes: 10 additions & 3 deletions src/rev_ai/models/asynchronous/job.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-
"""Job model"""

from .summarization_options import Summarization
from .job_status import JobStatus
from .translation_options import Translation


class Job:
Expand All @@ -28,7 +29,9 @@ def __init__(
rush=None,
segments_to_transcribe=None,
remove_atmospherics=None,
speakers_count=None):
speakers_count=None,
summarization: Summarization = None,
translation: Translation = None):
Comment on lines +32 to +34
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
speakers_count=None,
summarization: Summarization = None,
translation: Translation = None):
speakers_count=None,
summarization: Summarization=None,
translation: Translation=None):

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as below:

I follow PEP 8, which is the commonly accepted standard for Python formatting; PyCharm supports PEP 8 and suggests this formatting here. I used autoformat wherever reasonable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it matters less what the autoformatter produces when all the remaining code is formatted differently.

at the same time, as per PEP8

Don’t use spaces around the = sign when used to indicate a keyword argument, or when used to indicate a default value for an unannotated function parameter:
# Correct:
def complex(real, imag=0.0):
    return magic(r=real, i=imag)
# Wrong:
def complex(real, imag = 0.0):
    return magic(r = real, i = imag)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed on Zoom. Agreed that formatting of arguments with type annotation is different according to the standard.

"""
:param id_: unique id of job
:param created_on: date and time at which this job was started
Expand Down Expand Up @@ -85,6 +88,8 @@ def __init__(
self.segments_to_transcribe = segments_to_transcribe
self.remove_atmospherics = remove_atmospherics
self.speakers_count = speakers_count
self.summarization = summarization
self.translation = translation

def __eq__(self, other):
"""Override default equality operator"""
Expand Down Expand Up @@ -120,5 +125,7 @@ def from_json(cls, json):
rush=json.get('rush'),
segments_to_transcribe=json.get('segments_to_transcribe'),
remove_atmospherics=json.get('remove_atmospherics'),
speakers_count=json.get('speakers_count')
speakers_count=json.get('speakers_count'),
summarization=Summarization.from_json(json.get('summarization')),
translation=Translation.from_json(json.get('translation'))
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from enum import Enum
"""Summarization formatting options."""


class SummarizationFormattingOptions(str, Enum):
    """Summarization formatting options.

    Members also subclass ``str``, so each one compares equal to its
    plain string value.
    """

    PARAGRAPH = "paragraph"
    BULLETS = "bullets"
10 changes: 10 additions & 0 deletions src/rev_ai/models/asynchronous/summarization_job_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
"""Enum for Summarization Job statuses"""

from enum import Enum


class SummarizationJobStatus(str, Enum):
    """Statuses of a summarization job.

    Members also subclass ``str``, so each one compares equal to its
    plain string value.
    """

    IN_PROGRESS = "in_progress"
    FAILED = "failed"
    COMPLETED = "completed"
63 changes: 63 additions & 0 deletions src/rev_ai/models/asynchronous/summarization_options.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from .summarization_formatting_options import SummarizationFormattingOptions
from .summarization_job_status import SummarizationJobStatus
from ..nlp_model import NlpModel

"""Summarization request options."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be inside the class



class SummarizationOptions:
def __init__(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

based on other files, it looks like the trailing ): should be on the same line with the last argument.

Nit, but this is public code. Also, there should be a comment.

self,
prompt: str = None,
model: NlpModel = None,
formattingType: SummarizationFormattingOptions = None
):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this formatting correct?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used auto formatter as suggested with linter, no complaints.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I follow PEP 8, which is the commonly accepted standard for Python formatting; PyCharm supports PEP 8 and suggests this formatting here. I used autoformat wherever reasonable.

self.prompt = prompt
self.model = model
self.type = formattingType

def to_dict(self):
    """Build the plain-dict form of these summarization options.

    Only attributes holding a truthy value are emitted; the keys are
    ``prompt``, ``model`` and ``type``, in that order.

    :returns: dict containing the options that are set
    """
    candidates = (
        ('prompt', self.prompt),
        ('model', self.model),
        ('type', self.type),
    )
    return {key: value for key, value in candidates if value}


"""Summarization options."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what kind of syntax this is. I believe this should be inside the class.



class Summarization(SummarizationOptions):
    """Summarization options together with the state of the summarization.

    Adds ``status``, ``completed_on`` and ``failure`` to the fields handled
    by ``SummarizationOptions``.
    """

    def __init__(
            self,
            prompt: str = None,
            model: NlpModel = None,
            formattingType: SummarizationFormattingOptions = None,
            status: SummarizationJobStatus = None,
            completed_on: str = None,
            failure: str = None):
        """
        :param prompt: forwarded to ``SummarizationOptions``
        :param model: forwarded to ``SummarizationOptions``
        :param formattingType: forwarded to ``SummarizationOptions``
        :param status: stored as ``self.status``
        :param completed_on: stored as ``self.completed_on``
        :param failure: stored as ``self.failure``
        """
        super().__init__(prompt, model, formattingType)
        self.status = status
        self.completed_on = completed_on
        self.failure = failure

    @classmethod
    def from_json(cls, json):
        """Alternate constructor from a decoded json mapping.

        :param json: mapping read with ``.get`` for the keys ``prompt``,
                     ``model``, ``type``, ``status``, ``completed_on`` and
                     ``failure``; may be None
        :returns: a new instance, or None when json is None
        """
        if json is None:
            return None

        # Order matches the positional parameters of __init__.
        field_names = ('prompt', 'model', 'type', 'status', 'completed_on', 'failure')
        return cls(*(json.get(name) for name in field_names))
Loading