Merge pull request #88 from BillFarber/task/extendExamples

BillFarber · web-flow · commit 94b2beb6be0f · 2024-09-11T15:20:55.000-04:00
Some refactoring first
diff --git a/examples/langchain/README.md b/examples/langchain/README.md
@@ -43,28 +43,36 @@ into two different collections in the `langchain-test-content` database:
 
     python load_data.py
 
-Create a ".env" file to hold your OpenAI API key:
-
-    echo "OPENAI_API_KEY=<your key here>" > .env
+Create a ".env" file to hold your Azure OpenAI environment values. It should look
+something like this:
+```
+OPENAI_API_VERSION=2023-12-01-preview
+AZURE_OPENAI_ENDPOINT=<Your Azure OpenAI Endpoint>
+AZURE_OPENAI_API_KEY=<Your Azure OpenAI API Key>
+AZURE_LLM_DEPLOYMENT_NAME=gpt-test1-gpt-35-turbo
+AZURE_LLM_DEPLOYMENT_MODEL=gpt-35-turbo
+```
 
 # Testing the retriever
 
+## Testing using a retriever with a basic query
+
 You are now ready to test the example retriever. Run the following to ask a question with the 
 results augmented via the `marklogic_retriever.py` module in this project; you will be 
-prompted for an OpenAI API key when you run this, which you can type or paste in:
+using the Azure OpenAI settings from your ".env" file, so you will not be prompted for an API key:
 
-    python ask.py "What is task decomposition?" posts
+    python ask_similar_query.py "What is task decomposition?" posts
 
 The retriever uses a [cts.similarQuery](https://docs.marklogic.com/cts.similarQuery) to select from the documents 
 loaded via `load_data.py`. It defaults to a page length of 10. You can change this by providing a command line
 argument - e.g.:
 
-    python ask.py "What is task decomposition?" posts 15
+    python ask_similar_query.py "What is task decomposition?" posts 15
 
 Example of a question for the "sotu" (State of the Union speech) collection:
 
-    python ask.py "What are economic sanctions?" sotu 20
+    python ask_similar_query.py "What are economic sanctions?" sotu 20
 
 To use a word query instead of a similar query, along with a set of drop words, specify "word" as the 4th argument:
 
-    python ask.py "What are economic sanctions?" sotu 20 word
+    python ask_similar_query.py "What are economic sanctions?" sotu 20 word
diff --git a/examples/langchain/ask_similar_query.py b/examples/langchain/ask_similar_query.py
@@ -1,14 +1,15 @@
 # Based on example at
-# https://python.langchain.com/docs/use_cases/question_answering/quickstart . 
+# https://python.langchain.com/docs/use_cases/question_answering/quickstart .
 
+import os
 import sys
 from dotenv import load_dotenv
 from langchain import hub
-from langchain_openai import ChatOpenAI
+from langchain_openai import AzureChatOpenAI
 from langchain.schema import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
 from marklogic import Client
-from marklogic_retriever import MarkLogicRetriever
+from marklogic_similar_query_retriever import MarkLogicSimilarQueryRetriever
 
 
 def format_docs(docs):
@@ -17,7 +18,7 @@ def format_docs(docs):
 
 question = sys.argv[1]
 
-retriever = MarkLogicRetriever.create(
+retriever = MarkLogicSimilarQueryRetriever.create(
     Client("http://localhost:8003", digest=("langchain-user", "password"))
 )
 retriever.collections = [sys.argv[2]]
@@ -28,10 +29,20 @@ def format_docs(docs):
 load_dotenv()
 
 prompt = hub.pull("rlm/rag-prompt")
-llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
+# Note that the Azure OpenAI API key, the Azure OpenAI endpoint, and the OpenAI API
+# version are all read from the environment automatically.
+llm = AzureChatOpenAI(
+    model_name=os.getenv("AZURE_LLM_DEPLOYMENT_NAME"),
+    azure_deployment=os.getenv("AZURE_LLM_DEPLOYMENT_NAME"),
+    temperature=0,
+    max_tokens=None,
+    timeout=None,
+)
 
 rag_chain = (
     {"context": retriever | format_docs, "question": RunnablePassthrough()}
-    | prompt | llm | StrOutputParser()
+    | prompt
+    | llm
+    | StrOutputParser()
 )
 print(rag_chain.invoke(question))
diff --git a/examples/langchain/marklogic_similar_query_retriever.py b/examples/langchain/marklogic_similar_query_retriever.py
@@ -10,20 +10,32 @@
 """
 
 
-class MarkLogicRetriever(BaseRetriever):
+class MarkLogicSimilarQueryRetriever(BaseRetriever):
 
     client: Client
     max_results: int = 10
     collections: List[str] = []
     query_type: str = "similar"
-    drop_words = ["did", "the", "about", "a", "an", "is", "are", "what", 
-                  "say", "do", "was", "that"]
+    drop_words = [
+        "did",
+        "the",
+        "about",
+        "a",
+        "an",
+        "is",
+        "are",
+        "what",
+        "say",
+        "do",
+        "was",
+        "that",
+    ]
 
     @classmethod
     def create(cls, client: Client):
         return cls(client=client)
 
-    def _get_relevant_documents(self, query: str) -> List[Document]:    
+    def _get_relevant_documents(self, query: str) -> List[Document]:
         words = []
         for word in query.split():
             if word.lower() not in self.drop_words:
@@ -43,7 +55,7 @@ def _get_relevant_documents(self, query: str) -> List[Document]:
         results = self.client.documents.search(
             query=cts_query,
             page_length=self.max_results,
-            collections=self.collections
+            collections=self.collections,
         )
         print(f"Count of matching MarkLogic documents: {len(results)}")
         return map(lambda doc: Document(page_content=doc.content), results)