Commit 65a8c47

Merge pull request #90 from BillFarber/task/extendExamples
Adding setup for MarkLogic 12 examples
2 parents c1306e6 + fd87194 commit 65a8c47

File tree: 6 files changed, +195 −1 lines changed

examples/langchain/README.md (+58 −1)

@@ -26,23 +26,31 @@ is available):

    docker-compose up -d --build

## Deploy With Gradle

Then deploy a small REST API application to MarkLogic, which includes a basic non-admin MarkLogic user
named `langchain-user`:

    ./gradlew -i mlDeploy

## Install Python Libraries

Next, create a new Python virtual environment - [pyenv](https://github.com/pyenv/pyenv) is recommended for this -
and install the
[langchain example dependencies](https://python.langchain.com/docs/use_cases/question_answering/quickstart#dependencies),
along with the MarkLogic Python Client:

    pip install -U langchain langchain_openai langchain-community langchainhub openai chromadb bs4 marklogic_python_client

## Load Sample Data

Then run the following Python program to load text data from the langchain quickstart guide
into two different collections in the `langchain-test-content` database:

    python load_data.py

## Create Python Environment File

Create a ".env" file to hold your AzureOpenAI environment values. It should look
something like this.
@@ -89,4 +97,53 @@ query using the `marklogic_contextual_query_retriever.py` module in this project

This retriever builds a term-query using words from the question. Then the term-query is
added to the structured query and the merged query is used to select from the documents
loaded via `load_data.py`.
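The merge step described above can be sketched in plain Python. Note that the helper name `merge_term_query` and the exact structured-query JSON shape below are illustrative assumptions, not the actual code in `marklogic_contextual_query_retriever.py`:

```python
# Illustrative sketch only: the helper name and the structured-query JSON
# shape are assumptions, not the retriever module's real implementation.

def merge_term_query(structured_query: dict, question: str) -> dict:
    """Append a term-query built from the question's words to a structured query."""
    term_query = {"term-query": {"text": question.split()}}
    queries = list(structured_query.get("query", {}).get("queries", []))
    queries.append(term_query)
    return {"query": {"queries": queries}}

# A base structured query constraining results to a collection.
base = {"query": {"queries": [{"collection-query": {"uri": ["posts"]}}]}}
merged = merge_term_query(base, "What is task decomposition")
```

The merged query keeps the original constraints and adds the term-query, so the term-query narrows the original selection rather than replacing it.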
## Testing using MarkLogic 12EA Vector Search

### MarkLogic 12EA Setup

To try out this functionality, you will need access to an instance of MarkLogic 12
(currently internal or Early Access only). You may use
[docker-compose](https://docs.docker.com/compose/) to instantiate a new MarkLogic
instance with port 8003 available (you can use your own MarkLogic instance too, just be
sure that port 8003 is available):

    docker-compose -f docker-compose-12.yml up -d --build

### Deploy With Gradle

You will also need to deploy the application. However, for this example, you will need
to include an additional switch on the command line to deploy a TDE schema that takes
advantage of the vector capabilities in MarkLogic 12.

    ./gradlew -i mlDeploy -PmlSchemasPath=src/main/ml-schemas-12

### Install Python Libraries

As above, if you have not yet installed the Python libraries, install them with pip:
```
pip install -U langchain langchain_openai langchain-community langchainhub openai chromadb bs4 marklogic_python_client
```

### Create Python Environment File

The Python script for this example also generates LLM embeddings and includes them in
the documents stored in MarkLogic. In order to generate the embeddings, you'll need to
add the following environment variables (with your values) to the .env file created
above.

```
AZURE_EMBEDDING_DEPLOYMENT_NAME=text-test-embedding-ada-002
AZURE_EMBEDDING_DEPLOYMENT_MODEL=text-embedding-ada-002
```

### Load Sample Data

Then run the following Python program to load text data from the langchain quickstart
guide into two different collections in the `langchain-test-content` database. Note that
this script is different from the one in the earlier setup section and loads the data
into different collections.

```
python load_data_with_embeddings.py
```
docker-compose-12.yml (new file, +17)

```yaml
version: '3.8'
name: marklogic_python_example_langchain-12

services:

  marklogic:
    image: "ml-docker-db-dev-tierpoint.bed-artifactory.bedford.progress.com/marklogic/marklogic-server-ubi:12.0.nightly-ubi-2.0.1"
    platform: linux/amd64
    environment:
      - MARKLOGIC_INIT=true
      - MARKLOGIC_ADMIN_USERNAME=admin
      - MARKLOGIC_ADMIN_PASSWORD=admin
    volumes:
      - ./docker/marklogic/logs:/var/opt/MarkLogic/Logs
    ports:
      - "8000-8003:8000-8003"
```
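MarkLogic can take a little while to initialize after `docker-compose ... up -d`. As a convenience, you could wait until port 8003 accepts connections before running the load scripts; this stdlib helper is a hypothetical sketch, not part of this commit:

```python
import socket
import time


def wait_for_port(host: str, port: int, timeout: float = 120.0) -> bool:
    """Poll until a TCP port accepts connections, or give up after `timeout` seconds."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=2):
                return True
        except OSError:
            time.sleep(1)
    return False


# Example usage: wait_for_port("localhost", 8003) before running load_data.py.
```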
load_data_with_embeddings.py (new file, +78)

```python
# Based on example at
# https://python.langchain.com/docs/use_cases/question_answering/quickstart .

import os

import bs4
from dotenv import load_dotenv
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from marklogic import Client
from marklogic.documents import DefaultMetadata, Document

load_dotenv()
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_EMBEDDING_DEPLOYMENT_NAME"]
)

loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100
)
splits = text_splitter.split_documents(docs)

client = Client("http://localhost:8003", digest=("langchain-user", "password"))

marklogic_docs = [DefaultMetadata(collections="posts_with_embeddings")]
for split in splits:
    text = split.page_content
    embedding = embeddings.embed_query(text)
    doc = Document(
        None,
        {"text": text, "embedding": embedding},
        extension=".json",
        directory="/post/",
    )
    marklogic_docs.append(doc)

client.documents.write(marklogic_docs)
print(
    f"Number of documents written to collection 'posts': {len(marklogic_docs)-1}"
)

loader = WebBaseLoader(
    web_paths=(
        "https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt",
    )
)
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100
)
splits = text_splitter.split_documents(docs)

marklogic_docs = [DefaultMetadata(collections="sotu_with_embeddings")]
for split in splits:
    text = split.page_content
    embedding = embeddings.embed_query(text)
    doc = Document(
        None,
        {"text": text, "embedding": embedding},
        extension=".json",
        directory="/sotu/",
    )
    marklogic_docs.append(doc)

client.documents.write(marklogic_docs)
print(
    f"Number of documents written to collection 'sotu': {len(marklogic_docs)-1}"
)
```
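As a rough intuition for the splitter settings above (`chunk_size=1000`, `chunk_overlap=100`): the real `RecursiveCharacterTextSplitter` prefers paragraph and sentence boundaries before falling back to fixed sizes, but the size/overlap arithmetic reduces to something like this simplified sketch:

```python
def chunk_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
    """Naive fixed-size chunking: each chunk shares `chunk_overlap`
    characters with the previous one (simplified, boundary-unaware)."""
    step = chunk_size - chunk_overlap
    return [
        text[i:i + chunk_size]
        for i in range(0, max(len(text) - chunk_overlap, 1), step)
    ]


sample = "".join(str(i % 10) for i in range(2500))
chunks = chunk_text(sample, chunk_size=1000, chunk_overlap=100)
# Produces 3 chunks; consecutive chunks share a 100-character overlap.
```

The overlap means a sentence falling on a chunk boundary still appears intact in at least one chunk, which helps retrieval quality.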
Content database configuration (new file, +4)

```json
{
  "database-name": "%%DATABASE%%",
  "schema-database": "%%SCHEMAS_DATABASE%%"
}
```
Schemas database configuration (new file, +3)

```json
{
  "database-name": "%%SCHEMAS_DATABASE%%"
}
```
TDE template (new file, +35)

```json
{
  "template": {
    "context": "/",
    "collections": [
      "posts_with_embeddings"
    ],
    "rows": [
      {
        "schemaName": "demo",
        "viewName": "posts",
        "columns": [
          {
            "name": "uri",
            "scalarType": "string",
            "val": "xdmp:node-uri(.)"
          },
          {
            "name": "embedding",
            "scalarType": "vector",
            "val": "vec:vector(embedding)",
            "dimension": "1536",
            "invalidValues": "reject",
            "nullable": true
          },
          {
            "name": "text",
            "scalarType": "string",
            "val": "text",
            "nullable": true
          }
        ]
      }
    ]
  }
}
```
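The `embedding` column above is declared as a 1536-dimension vector, which is what MarkLogic 12's vector functions (such as the `vec:vector` constructor used in the template) operate on. For intuition, similarity between two embeddings is typically measured with cosine similarity; this pure-Python sketch shows the arithmetic only, since MarkLogic evaluates it natively server-side:

```python
import math


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine of the angle between two vectors:
    ~1.0 = same direction, ~0.0 = orthogonal (unrelated)."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)


print(cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]))  # ≈ 1.0 (identical)
print(cosine_similarity([1.0, 0.0], [0.0, 1.0]))            # ≈ 0.0 (orthogonal)
```

A vector-search query ranks rows of the `demo.posts` view by this kind of similarity between a question's embedding and each stored `embedding` value.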
