diff --git a/apis/paios/openapi.yaml b/apis/paios/openapi.yaml index 5d56f520..fe6d095f 100644 --- a/apis/paios/openapi.yaml +++ b/apis/paios/openapi.yaml @@ -831,6 +831,67 @@ paths: headers: X-Total-Count: $ref: '#/components/headers/X-Total-Count' + post: + summary: Create new voice message. + description: Creates a new voice message + operationId: backend.api.VoicesFacesView.post + tags: + - Voice Management + responses: + '200': + description: OK + content: + application/json: + schema: + type: object + properties: + chat_response: + type: string + description: The chat response message. + headers: + Content-Disposition: + description: Indicates that the response includes an attachment. + schema: + type: string + '201': + description: Created + content: + application/json: + schema: + $ref: '#/components/schemas/VoiceMessage' + '206': + description: Partial Content (MP3 file streamed) + content: + audio/mpeg: + schema: + type: string + format: binary + headers: + Content-Disposition: + description: Indicates that the response includes an attachment. + schema: + type: string + '400': + description: Missing Required Information + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/VoiceMessageCreate' + '/voices/{id}': + delete: + summary: Delete voice message by id + description: Deletes the voice message with the given id. + operationId: backend.api.VoicesFacesView.delete + tags: + - Voice Management + parameters: + - $ref: '#/components/parameters/id' + responses: + '204': + description: No Content + '404': + description: Not Found tags: - name: Abilities Management description: Installation and configuration of abilities @@ -1386,7 +1447,12 @@ components: type: object title: VoiceCreate description: Voice without id which is server-generated. - properties: + properties: + xi_id: + $ref: '#/components/schemas/textShort' + name: + $ref: '#/components/schemas/textShort' + properties: xi_id: $ref: '#/components/schemas/textShort' name: @@ -1394,14 +1460,25 @@ components: text_to_speak: type: string nullable: true - image_url: - $ref: '#/components/schemas/voice_image_url' + VoiceMessage: + type: object + title: VoiceMessage + properties: + audio_msg_path: + type: string + format: uri nullable: true - sample_url: - $ref: '#/components/schemas/sample_mp3_url' - nullable: true required: - - id + - audio_msg_path + VoiceMessageCreate: + type: object + title: VoiceMessageCreate + description: Voice without id which is server-generated. + properties: + msg_id: + $ref: '#/components/schemas/uuid4' + required: + - msg_id Face: type: object title: Face @@ -1429,6 +1506,9 @@ components: $ref: '#/components/schemas/textLong' chat_response: type: string + voice_active: + $ref: '#/components/schemas/boolean_str' + nullable: true required: - assistant_id - conversation_id @@ -1447,6 +1527,8 @@ components: nullable: true prompt: $ref: '#/components/schemas/textLong' + voice_active: + $ref: '#/components/schemas/boolean_str' required: - assistant_id - prompt diff --git a/backend/api/MessagesView.py b/backend/api/MessagesView.py index e44e1315..debe1904 100644 --- a/backend/api/MessagesView.py +++ b/backend/api/MessagesView.py @@ -1,9 +1,9 @@ -from starlette.responses import JSONResponse, Response +from starlette.responses import JSONResponse from backend.managers.MessagesManager import MessagesManager from common.paths import api_base_url from backend.pagination import parse_pagination_params from backend.schemas import MessageCreateSchema - +from starlette.responses import JSONResponse class MessagesView: def __init__(self): @@ -12,10 +12,10 @@ def __init__(self): async def post(self, body: MessageCreateSchema): response, error_message = await self.mm.create_message(body) if error_message: - return JSONResponse({"error": error_message}, status_code=404) + return JSONResponse({"error": error_message}, status_code=404) - if body.get("conversation_id"): # If conversation_id was provided, retrieve the full message + if body.get("conversation_id"): message = await self.mm.retrieve_message(response) return JSONResponse(message.dict(), status_code=201, headers={'Location': f'{api_base_url}/messages/{response}'}) - else: # If conversation_id was not provided, return only the chat_response + else: return JSONResponse({"chat_response": response}, status_code=200) \ No newline at end of file diff --git a/backend/api/VoicesFacesView.py b/backend/api/VoicesFacesView.py index 228629f3..3cc0d0ca 100644 --- a/backend/api/VoicesFacesView.py +++ b/backend/api/VoicesFacesView.py @@ -1,22 +1,20 @@ -from starlette.responses import JSONResponse +from starlette.responses import JSONResponse, Response from backend.managers.VoicesFacesManager import VoicesFacesManager +from backend.managers.MessagesManager import MessagesManager from common.paths import api_base_url from backend.pagination import parse_pagination_params from backend.schemas import VoiceCreateSchema +from starlette.responses import JSONResponse, StreamingResponse +import os +from pathlib import Path +import shutil class VoicesFacesView: def __init__(self): - self.vfm = VoicesFacesManager() - # TODO: Finish text to speech - async def post(self, id: str, body: VoiceCreateSchema): - response, error_message = await self.vfm.text_to_speech(id, body) - if error_message: - return JSONResponse({"error": error_message}, status_code=404) - else: - return JSONResponse(response, status_code=200) - + self.vfm = VoicesFacesManager() + self.mm = MessagesManager() async def search(self, filter: str = None, range: str = None, sort: str = None): result = parse_pagination_params(filter, range, sort) @@ -36,4 +34,24 @@ async def search(self, filter: str = None, range: str = None, sort: str = None): 'X-Total-Count': str(total_count), 'Content-Range': f'voices {offset}-{offset + len(voices) - 1}/{total_count}' } - return JSONResponse([voice.dict() for voice in voices], status_code=200, headers=headers) \ No newline at end of file + return JSONResponse([voice.dict() for voice in voices], status_code=200, headers=headers) + + + async def post(self, body: VoiceCreateSchema): + msg_id = body.get('msg_id') + message = await self.mm.retrieve_message(msg_id) + if message.voice_active == 'True': + audio_msg_path, error_message = await self.vfm.generate_voice_response(message.assistant_id, message.chat_response, msg_id) + if error_message: + return JSONResponse({"error": error_message}, status_code=404) + if audio_msg_path and os.path.exists(audio_msg_path): + streaming_response = StreamingResponse( + self.vfm.async_file_generator(audio_msg_path), + media_type='audio/mpeg', + headers={ + "Content-Disposition": f"attachment; filename={os.path.basename(audio_msg_path)}" + } + ) + return streaming_response + else: + return JSONResponse({"error": "File not found"}, status_code=404) \ No newline at end of file diff --git a/backend/managers/MessagesManager.py b/backend/managers/MessagesManager.py index 812f39c9..3efdbaaf 100644 --- a/backend/managers/MessagesManager.py +++ b/backend/managers/MessagesManager.py @@ -73,11 +73,11 @@ async def create_message(self, message_data: MessageCreateSchema) -> Tuple[Optio rm = RagManager() response = await rm.retrieve_and_generate(assistant_id, query, llm) chat_response = response["answer"] - + if conversation_id: - message_data["chat_response"] = chat_response + message_data['chat_response'] = chat_response message_data['timestamp'] = timestamp - + new_message = Message(id=str(uuid4()), **message_data) session.add(new_message) await session.commit() @@ -100,7 +100,8 @@ async def retrieve_message(self, id:str) -> Optional[MessageSchema]: conversation_id=message.conversation_id, timestamp=message.timestamp, prompt=message.prompt, - chat_response=message.chat_response, + chat_response=message.chat_response, + voice_active=message.voice_active ) return None diff --git a/backend/managers/VoicesFacesManager.py b/backend/managers/VoicesFacesManager.py index 5034284b..6858b664 100644 --- a/backend/managers/VoicesFacesManager.py +++ b/backend/managers/VoicesFacesManager.py @@ -8,6 +8,8 @@ from typing import List, Tuple, Optional, Dict, Any from sqlalchemy import select, func from backend.schemas import VoiceSchema +from pathlib import Path +from backend.models import Resource, Persona class VoicesFacesManager: @@ -112,13 +114,17 @@ async def retrieve_voices(self, offset: int = 0, limit: int = 100, sort_by: Opti return voices, total_count - - async def text_to_speech(self, voice_id: str, body) -> str: + async def text_to_speech(self, voice_id: str, body: str, assistant_id: str, msg_id: str) -> Tuple[Optional[dict], Optional[str]]: + voice = await self.retrieve_voice(voice_id) xi_api_key = os.environ.get('XI_API_KEY') - xi_id = body['xi_id'] - OUTPUT_PATH = f"{xi_id}.mp3" # Path to save the output audio file - - tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream" + xi_id = voice.xi_id + temp = os.path.dirname(os.path.realpath(__file__)) + directory = Path(os.path.join(os.path.dirname(temp), f'public/{assistant_id}')) + directory.mkdir(parents=True, exist_ok=True) + file_path = directory / f'{msg_id}.mp3' + audio_msg_path = str(file_path) + print('audio_msg_path: ', audio_msg_path) + tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{xi_id}/stream" headers = { "Accept": "application/json", @@ -126,7 +132,7 @@ async def text_to_speech(self, voice_id: str, body) -> str: } data = { - "text": "Hola" , + "text": body, "model_id": "eleven_multilingual_v2", "voice_settings": { "stability": 0.5, @@ -135,16 +141,42 @@ async def text_to_speech(self, voice_id: str, body) -> str: "use_speaker_boost": True } } - - response = requests.post(tts_url, headers=headers, json=data, stream=True) - - if response.ok: - with open(OUTPUT_PATH, "wb") as f: - for chunk in response.iter_content(chunk_size=os.environ.get('XI_CHUNK_SIZE')): - f.write(chunk) - print("Audio stream saved successfully.") - else: - print(response.text) - return response.text + try: + response = requests.post(tts_url, headers=headers, json=data, stream=True) + if response.ok: + with open(audio_msg_path, "wb") as f: + for chunk in response.iter_content(chunk_size=int(os.environ.get('XI_CHUNK_SIZE'))): + f.write(chunk) + print("Audio stream saved successfully.") + return {"message": "Audio stream saved successfully.", "audio_msg_path": audio_msg_path}, None + else: + error_message = response.text + print(error_message) + return None, error_message + except Exception as e: + error_message = str(e) + print(f"An error occurred: {error_message}") + return None, error_message - + async def generate_voice_response(self, assistant_id, chat_response, message_id) -> Tuple[Optional[dict], Optional[str]]: + try: + async with db_session_context() as session: + resource = await session.execute(select(Resource).filter(Resource.id == assistant_id)) + resource = resource.scalar_one_or_none() + persona_id = resource.persona_id + if persona_id: + persona = await session.execute(select(Persona).filter(Persona.id == persona_id)) + persona = persona.scalar_one_or_none() + voice_id = persona.voice_id + vfm = VoicesFacesManager() + response, error_message = await vfm.text_to_speech(voice_id, chat_response, assistant_id, message_id) + if error_message: + return None, error_message + return response.get("audio_msg_path"), None + except Exception as e: + return None, f"An unexpected error occurred while generating a voice response: {str(e)}" + + async def async_file_generator(self, file_path): + with open(file_path, 'rb') as audio_file: + while chunk := audio_file.read(1024): + yield chunk \ No newline at end of file diff --git a/backend/models.py b/backend/models.py index a918cb87..bf8a8b3d 100644 --- a/backend/models.py +++ b/backend/models.py @@ -73,6 +73,7 @@ class Message(Base): timestamp = Column(String, nullable=False) prompt = Column(String, nullable=False) chat_response = Column(String, nullable=False) + voice_active = Column(String, nullable=False) class Conversation(Base): __tablename__ = "conversation" diff --git a/backend/schemas.py b/backend/schemas.py index 118d0804..d5c2b336 100644 --- a/backend/schemas.py +++ b/backend/schemas.py @@ -81,7 +81,9 @@ class VoiceBaseSchema(BaseModel): name: str text_to_speak: Optional[str] = None image_url: Optional[str] = None - sample_url: Optional[str] = None + sample_url: Optional[str] = None + msg_id: Optional[str] = None + audio_msg_path: Optional[str] = None class Config: orm_mode = True from_attributes = True @@ -119,6 +121,7 @@ class MessageBaseSchema(BaseModel): timestamp: str prompt: str chat_response: str + voice_active: str class Config: orm_mode = True from_attributes = True @@ -136,7 +139,7 @@ class ConversationBaseSchema(BaseModel): last_updated_timestamp: str archive: str assistant_id: str - messages: Optional[List[MessageBaseSchema]] = None + messages: Optional[List[MessageSchema]] = None class Config: orm_mode = True @@ -162,4 +165,4 @@ class FileCreateSchema(FileBaseSchema): pass class FileSchema(FileBaseSchema): - id: str + id: str \ No newline at end of file diff --git a/migrations/versions/54bbca89da7b_added_message_table.py b/migrations/versions/54bbca89da7b_added_message_table.py index dcb9ca4f..afffb6e0 100644 --- a/migrations/versions/54bbca89da7b_added_message_table.py +++ b/migrations/versions/54bbca89da7b_added_message_table.py @@ -32,6 +32,7 @@ def upgrade() -> None: sa.Column('timestamp', sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column('prompt', sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column('chat_response', sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.Column('voice_active', sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.PrimaryKeyConstraint('id') )