Merge pull request #19 from radicalxdev/Decouple
Dynamo V2
mikhailocampo authored May 20, 2024
2 parents c1dfdfc + 65b7d3f commit 015bc6c
Showing 7 changed files with 138 additions and 75 deletions.
7 changes: 3 additions & 4 deletions app/api/router.py
@@ -18,9 +18,6 @@ async def submit_tool( data: ToolRequest, _ = Depends(key_check)):
# Unpack GenericRequest for tool data
request_data = data.tool_data

print(type(request_data))
print(request_data)

requested_tool = load_tool_metadata(request_data.tool_id)
request_inputs_dict = prepare_input_data(request_data)

@@ -29,10 +26,12 @@ async def submit_tool( data: ToolRequest, _ = Depends(key_check)):
logger.error(f"Inputs: {request_inputs_dict}")
logger.error(f"Firestore inputs: {requested_tool['inputs']}")
raise HTTPException(status_code=400, detail="Input validation failed")
else:
logger.info(f"Input validation passed")

result = execute_tool(request_data.tool_id, request_inputs_dict)

return ToolResponse(data=[result])
return ToolResponse(data=result)

@router.post("/chat", response_model=ChatResponse)
async def chat( request: ChatRequest, _ = Depends(key_check) ):
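As a usage sketch of the updated endpoint (route path, payload shape, and header names here are assumptions, not confirmed by this diff): suppose the route is mounted at /submit-tool and ToolRequest wraps tool_data carrying a tool_id plus a list of inputs.

import requests

# Hypothetical payload shape; the real ToolRequest/GenericRequest schemas
# live outside this diff.
payload = {
    "tool_data": {
        "tool_id": "dynamo",
        "inputs": [{"name": "youtube_url", "value": "https://www.youtube.com/watch?v=example"}],
    }
}

# Assumed local dev URL; the key_check dependency presumably expects an API key header.
response = requests.post(
    "http://localhost:8000/submit-tool",
    json=payload,
    headers={"api-key": "dev-key"},
)

# After this commit, data is the tool's raw result (e.g. a list of flashcards)
# rather than a single-element wrapper list.
print(response.json()["data"])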
24 changes: 18 additions & 6 deletions app/features/dynamo/core.py
@@ -1,9 +1,21 @@
from features.dynamo.tools import find_key_concepts, retrieve_youtube_documents

from features.dynamo.tools import summarize_transcript, generate_flashcards
from services.logger import setup_logger
# TODO: Propagate the executor's verbose param to downstream logic

logger = setup_logger(__name__)

def executor(youtube_url: str, verbose=False):
yt_documents = retrieve_youtube_documents(youtube_url)
concepts = find_key_concepts(yt_documents)

return concepts
summary = summarize_transcript(youtube_url, verbose=verbose)
flashcards = generate_flashcards(summary)

sanitized_flashcards = []
for flashcard in flashcards:
if 'concept' in flashcard and 'definition' in flashcard:
sanitized_flashcards.append({
"concept": flashcard['concept'],
"definition": flashcard['definition']
})
else:
logger.warning(f"Malformed flashcard skipped: {flashcard}")

return sanitized_flashcards
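The new core module thus reduces to a two-step pipeline (summarize, then generate flashcards) followed by a sanitation pass. A minimal usage sketch, assuming a reachable public video shorter than the summarizer's length cap:

from features.dynamo.core import executor

flashcards = executor("https://www.youtube.com/watch?v=example", verbose=True)

# Each sanitized entry is a plain dict with exactly these two keys;
# malformed items from the model are dropped with a warning.
for card in flashcards:
    print(card["concept"], "->", card["definition"])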
24 changes: 15 additions & 9 deletions app/features/dynamo/prompt/dynamo-prompt.txt
@@ -1,11 +1,17 @@
You are a student studying a text for your exam. Consider the following transcript from a video and find the core idea or concept along with a definition. This will be used to create a flashcard to help you study. You must provide a definition for the concept. Follow the format instructions provided.

Transcript:
-------------------------------
{text}

Instructions:
-------------------------------
You are a flashcard generation assistant designed to help students analyze a document and return a list of flashcards. Carefully consider the document and identify the key terms or concepts most relevant for students to better understand the topic. The documents span a wide range of subjects, as they are summarized transcripts of YouTube videos; all provided content is educational and is meant to teach students through the flashcards. Respond only in the response format provided. Do not apply any markdown or extra characters to your response.

Input:
-----------------------------
{summary}

Examples:
-----------------------------
{examples}

Formatting:
-----------------------------
{format_instructions}

Respond only with JSON with the concept and definition.
Respond only according to the format instructions. The examples included show best responses, given as an input and output pair.

Output:
60 changes: 60 additions & 0 deletions app/features/dynamo/prompt/examples.txt
@@ -0,0 +1,60 @@
Input:
## Concise Summary of the provided document:

**Large Language Models (LLMs)** are powerful AI tools trained on massive datasets to perform tasks like text generation, translation, and question answering. They can be specialized for specific domains through fine-tuning, making them versatile and adaptable. 

**Key points:**

* **Pre-trained and fine-tuned:** LLMs learn general knowledge from large datasets and specialize in specific tasks through additional training. 
* **Prompt design:** Effective prompts are crucial for eliciting desired responses from LLMs.
* **Domain knowledge:** Understanding the specific domain is essential for building and tuning LLMs.
* **Parameter-efficient tuning methods:** This method allows for efficient customization of LLMs without altering the entire model.
* **Vertex AI:** Provides tools for building, tuning, and deploying LLMs for specific tasks.
* **Generative AI App Builder and PaLM API:** Tools for developers to build AI apps and experiment with LLMs.
* **Model management tools:** Tools for training, deploying, and monitoring ML models.

**This document provides a comprehensive overview of LLMs and related tools, highlighting their capabilities and potential applications.** 

**Additional notes:**

* The text emphasizes the importance of prompt design and domain knowledge for effective LLM usage.
* It introduces cutting-edge technologies like PETM and Vertex AI, showcasing the rapid advancements in the field.
* The document also provides practical resources for developers to build and deploy LLM-powered applications. 

**Overall, this document is a valuable resource for anyone interested in understanding and utilizing LLMs and related technologies.**

Output:
[
{
"concept": "Large Language Models (LLMs)",
"definition": "Powerful AI tools trained on massive datasets to perform tasks like text generation, translation, and question answering."
},
{
"concept": "Pre-trained and fine-tuned",
"definition": "LLMs learn general knowledge from large datasets and specialize in specific tasks through additional training."
},
{
"concept": "Prompt design",
"definition": "Effective prompts are crucial for eliciting desired responses from LLMs."
},
{
"concept": "Domain knowledge",
"definition": "Understanding the specific domain is essential for building and tuning LLMs."
},
{
"concept": "Parameter-efficient tuning methods",
"definition": "This method allows for efficient customization of LLMs without altering the entire model."
},
{
"concept": "Vertex AI",
"definition": "Provides tools for building, tuning, and deploying LLMs for specific tasks."
},
{
"concept": "Generative AI App Builder and PaLM API",
"definition": "Tools for developers to build AI apps and experiment with LLMs."
},
{
"concept": "Model management tools",
"definition": "Tools for training, deploying, and monitoring ML models."
}
]
93 changes: 39 additions & 54 deletions app/features/dynamo/tools.py
@@ -3,10 +3,12 @@
from langchain.prompts import PromptTemplate
from langchain_google_vertexai import VertexAI
from langchain_core.output_parsers import JsonOutputParser
from langchain.chains.summarize import load_summarize_chain
from langchain_core.pydantic_v1 import BaseModel, Field
from services.logger import setup_logger
from pydantic import BaseModel, Field
import os


logger = setup_logger(__name__)

# AI Model
@@ -22,78 +24,61 @@ def read_text_file(file_path):
with open(absolute_file_path, 'r') as file:
return file.read()


# Youtube Loader # Chunk and Splitter
def retrieve_youtube_documents(youtube_url: str):
"""Retrieve youtbe transcript and create a list of documents"""
# Summarize chain
def summarize_transcript(youtube_url: str, max_video_length=600, verbose=False) -> str:
loader = YoutubeLoader.from_youtube_url(youtube_url, add_video_info=True)
splitter = RecursiveCharacterTextSplitter(
chunk_size = 1000,
chunk_overlap = 0
)

docs = loader.load()
split_docs = splitter.split_documents(docs)

length = docs[0].metadata["length"]
title = docs[0].metadata["title"]

logger.info(f"Found video with title: {title} and length: {length}")

# If docs empty, throw error
if not docs:
raise ValueError("No documents found")

# if docs too long, throw error
if length > 1200: # 20 minutes
raise ValueError("Video too long")

return splitter.split_documents(docs)


# Num sampler
def find_key_concepts(documents: list, sample_size: int = 6):
"""Iterate through all documents of group size N and find key concepts"""
if sample_size > len(documents):
sample_size = len(documents) // 5

num_docs_per_group = len(documents) // sample_size + (len(documents) % sample_size > 0)
length = docs[0].metadata["length"]
title = docs[0].metadata["title"]

if length > max_video_length:
raise ValueError(f"Video is too long, please provide a video less than {max_video_length} seconds long")

if verbose:
logger.info(f"Found video with title: {title} and length: {length}")
logger.info(f"Splitting documents into {len(split_docs)} chunks")

if num_docs_per_group > 5:
num_docs_per_group = 6 # Default to 6 if too many documents
logger.info(f"Number of documents per group is too large. Defaulting to {num_docs_per_group}")
chain = load_summarize_chain(model, chain_type='map_reduce')
response = chain(split_docs)

groups = [documents[i:i + num_docs_per_group] for i in range(0, len(documents), num_docs_per_group)]
if response and verbose: logger.info("Successfully completed generating summary")

return response

def generate_flashcards(summary: str, verbose=False) -> list:
# Receive the summary from the map reduce chain and generate flashcards
parser = JsonOutputParser(pydantic_object=Flashcard)

batch_concept = []

logger.info(f"Beginning to process {len(groups)} groups")
if verbose: logger.info(f"Beginning to process summary")

template = read_text_file("prompt/dynamo-prompt.txt")
examples = read_text_file("prompt/examples.txt")

prompt = PromptTemplate(
template = template,
input_variables=["text"],
partial_variables={"format_instructions": parser.get_format_instructions()}
)
# Create Chain
template=template,
input_variables=["summary", "examples"],
partial_variables={"format_instructions": parser.get_format_instructions()}
)

chain = prompt | model | parser

for group in groups:
group_content = ""

for doc in group:
group_content += doc.page_content

# Run Chain
output_concept = chain.invoke({"text": group_content})

logger.info(f"Output concept: {output_concept}\n")

batch_concept.append(output_concept)

return batch_concept
try:
response = chain.invoke({"summary": summary, "examples": examples})
except Exception as e:
logger.error(f"Failed to generate flashcards: {e}")
return []

return response

class Flashcard(BaseModel):
concept: str = Field(description="The concept or term")
definition: str = Field(description="The summarized definition of the concept or term")
concept: str = Field(description="The concept of the flashcard")
definition: str = Field(description="The definition of the flashcard")
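The refactor replaces per-chunk concept extraction with one map-reduce summary feeding a single flashcard chain. As an isolated sketch of the parser wiring above (using the same imports as the new file):

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

class Flashcard(BaseModel):
    concept: str = Field(description="The concept of the flashcard")
    definition: str = Field(description="The definition of the flashcard")

parser = JsonOutputParser(pydantic_object=Flashcard)

# get_format_instructions() renders the Flashcard JSON schema as text that
# the prompt injects through its format_instructions partial variable.
print(parser.get_format_instructions())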
3 changes: 2 additions & 1 deletion app/requirements.txt
@@ -3,7 +3,8 @@ uvicorn[standard]
langchain
langchain-core
langchain-google-vertexai
langchain_chroma
langchain-chroma
langchain-community
google-cloud-secret-manager
google-cloud-logging
google-auth
2 changes: 1 addition & 1 deletion app/services/schemas.py
@@ -47,7 +47,7 @@ class ChatResponse(BaseModel):
data: List[Message]

class ToolResponse(BaseModel):
data: List[Any]
data: Any

class ChatMessage(BaseModel):
role: str
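Loosening data from List[Any] to Any lets the router return execute_tool's result directly, such as a list of flashcard dicts, without the extra wrapping list. A small sketch of the difference, assuming the model as shown:

from typing import Any
from pydantic import BaseModel

class ToolResponse(BaseModel):
    data: Any

# Before this commit the list had to be wrapped: ToolResponse(data=[result]).
# Now a flashcard list validates as-is:
ToolResponse(data=[{"concept": "LLM", "definition": "A large language model."}])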
