V10.3 #249

Merged: 10 commits, Mar 6, 2025
13 changes: 0 additions & 13 deletions .github/workflows/check-autobuilder.yml

This file was deleted.

13 changes: 8 additions & 5 deletions .github/workflows/vale-tdbx.yml
@@ -12,23 +12,26 @@ jobs:
- name: checkout
uses: actions/checkout@master

- name: Install docutils
run: sudo apt-get install -y docutils

- id: files
uses: masesgroup/retrieve-changed-files@v2
with:
format: 'csv'
format: "csv"

- name: checkout-latest-rules
uses: actions/checkout@master
with:
repository: mongodb/mongodb-vale-action
path: './tdbx-vale-rules'
path: "./tdbx-vale-rules"
token: ${{secrets.GITHUB_TOKEN}}

- name: move-files-for-vale-action
run: |
cp tdbx-vale-rules/.vale.ini .vale.ini
mkdir -p .github/styles/
cp -rf tdbx-vale-rules/.github/styles/ .github/
cp tdbx-vale-rules/.vale.ini .vale.ini
mkdir -p .github/styles/
cp -rf tdbx-vale-rules/.github/styles/ .github/

- name: run-vale
uses: errata-ai/vale-action@reviewdog
7 changes: 7 additions & 0 deletions build.sh
@@ -0,0 +1,7 @@
# ensures that we always use the latest version of the script
if [ -f build-site.sh ]; then
rm build-site.sh
fi

curl https://raw.githubusercontent.com/mongodb/docs-worker-pool/netlify-poc/scripts/build-site.sh -o build-site.sh
sh build-site.sh
6 changes: 6 additions & 0 deletions netlify.toml
@@ -0,0 +1,6 @@
[[integrations]]
name = "snooty-cache-plugin"

[build]
publish = "snooty/public"
command = ". ./build.sh"
10 changes: 10 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

7 changes: 7 additions & 0 deletions package.json
@@ -0,0 +1,7 @@
{
"name": "docs-spark-connector",
"lockfileVersion": 3,
"requires": true,
"packages": {}
}

4 changes: 2 additions & 2 deletions source/batch-mode.txt
@@ -10,8 +10,8 @@ Batch Mode

.. toctree::

/batch-mode/batch-read
/batch-mode/batch-write
Read </batch-mode/batch-read>
Write </batch-mode/batch-write>

Overview
--------
27 changes: 18 additions & 9 deletions source/batch-mode/batch-read-config.txt
@@ -10,6 +10,13 @@ Batch Read Configuration Options
:depth: 1
:class: singlecol

.. facet::
:name: genre
:values: reference

.. meta::
:keywords: partitioner, customize, settings

.. _spark-batch-input-conf:

Overview
@@ -107,12 +114,11 @@ You can configure the following properties when reading data from MongoDB in batch mode

[{"$match": {"closed": false}}, {"$project": {"status": 1, "name": 1, "description": 1}}]

.. important::

Custom aggregation pipelines must be compatible with the
partitioner strategy. For example, aggregation stages such as
``$group`` do not work with any partitioner that creates more than
one partition.
:gold:`IMPORTANT:` Custom aggregation pipelines must be
compatible with the partitioner strategy. For example,
aggregation stages such as
``$group`` do not work with any partitioner that creates more
than one partition.

* - ``aggregation.allowDiskUse``
- | Specifies whether to allow storage to disk when running the
@@ -212,9 +218,12 @@ based on your shard configuration.
To use this configuration, set the ``partitioner`` configuration option to
``com.mongodb.spark.sql.connector.read.partitioner.ShardedPartitioner``.

.. warning::

This partitioner is not compatible with hashed shard keys.
.. important:: ShardedPartitioner Restrictions

1. In MongoDB Server v6.0 and later, the sharding operation creates one large initial
chunk to cover all shard key values, making the sharded partitioner inefficient.
We do not recommend using the sharded partitioner when connected to MongoDB v6.0 and later.
2. The sharded partitioner is not compatible with hashed shard keys.

.. _conf-mongopaginatebysizepartitioner:
.. _conf-paginatebysizepartitioner:
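
For context, the following is a minimal PySpark sketch of how the batch read options discussed in this file might be passed. It is illustrative only and not part of this diff; the connection URI, database, collection, and pipeline values are assumptions.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("batch-read-example").getOrCreate()

    df = (
        spark.read.format("mongodb")
        # Placeholder connection details.
        .option("connection.uri", "mongodb://localhost:27017")
        .option("database", "sales")
        .option("collection", "orders")
        # Filter and project documents on the server before Spark reads them.
        .option(
            "aggregation.pipeline",
            '[{"$match": {"closed": false}}, '
            '{"$project": {"status": 1, "name": 1, "description": 1}}]',
        )
        # The pipeline must stay compatible with the chosen partitioner;
        # stages such as $group do not work with multi-partition partitioners.
        .option(
            "partitioner",
            "com.mongodb.spark.sql.connector.read.partitioner.ShardedPartitioner",
        )
        .load()
    )
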
2 changes: 1 addition & 1 deletion source/batch-mode/batch-read.txt
@@ -7,7 +7,7 @@ Read from MongoDB in Batch Mode
.. toctree::
:caption: Batch Read Configuration Options

/batch-mode/batch-read-config
Configuration </batch-mode/batch-read-config>

.. contents:: On this page
:local:
4 changes: 2 additions & 2 deletions source/batch-mode/batch-write.txt
@@ -7,7 +7,7 @@ Write to MongoDB in Batch Mode
.. toctree::
:caption: Batch Write Configuration Options

/batch-mode/batch-write-config
Configuration </batch-mode/batch-write-config>

Overview
--------
@@ -48,7 +48,7 @@ Overview
- Time-series collections

To learn more about save modes, see the
`Spark SQL Guide <https://spark.apache.org/docs/3.2.0/sql-data-sources-load-save-functions.html#save-modes>`__.
`Spark SQL Guide <https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html#save-modes>`__.

.. important::

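
For context, the following is a minimal PySpark sketch of a batch write that uses one of the Spark SQL save modes mentioned above. It is illustrative only and not part of this diff; the DataFrame contents and connection values are assumptions.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("batch-write-example").getOrCreate()

    # Hypothetical DataFrame to persist.
    people = spark.createDataFrame([("Ada", 36), ("Grace", 45)], ["name", "age"])

    (
        people.write.format("mongodb")
        .mode("append")  # Spark SQL save mode
        # Placeholder connection details.
        .option("connection.uri", "mongodb://localhost:27017")
        .option("database", "people_db")
        .option("collection", "people")
        .save()
    )
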
32 changes: 32 additions & 0 deletions source/getting-started.txt
@@ -45,6 +45,38 @@ Getting Started

.. include:: /scala/api.rst

Integrations
------------

The following sections describe some popular third-party platforms with which you can
integrate Spark and the {+connector-long+}.

Amazon EMR
~~~~~~~~~~

Amazon EMR is a managed cluster platform that you can use to run big data frameworks like Spark. To install Spark on an EMR cluster, see
`Getting Started with Amazon EMR <https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-gs.html>`__ in the AWS documentation.

Databricks
~~~~~~~~~~

Databricks is an analytics platform for building, deploying, and sharing enterprise-level data and analytics solutions. To integrate the {+connector-long+} with Databricks,
see `MongoDB <https://docs.databricks.com/aws/en/connect/external-systems/mongodb>`__ in the Databricks documentation.

Docker
~~~~~~

Docker is an open-source platform that helps developers build, share, and run applications in containers.

- To start Spark in a Docker container, see `Apache Spark <https://hub.docker.com/r/apache/spark#!>`__ in the Docker documentation and follow the steps provided.
- To learn how to deploy Atlas on Docker, see `Create a Local Atlas Deployment with Docker <https://www.mongodb.com/docs/atlas/cli/current/atlas-cli-deploy-docker/>`__.

Kubernetes
~~~~~~~~~~

Kubernetes is an open-source platform for automating the deployment and management of containerized applications. To run Spark on Kubernetes,
see `Running Spark on Kubernetes <https://spark.apache.org/docs/3.5.4/running-on-kubernetes.html>`__ in the Spark documentation.
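
Whichever platform hosts Spark, the connector is typically attached to the Spark session itself. The following is a minimal PySpark sketch; it is illustrative only and not part of this diff, and the package coordinate and connection strings are assumptions.

    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder.appName("getting-started")
        # Assumed connector coordinate; match your Scala and connector versions.
        .config(
            "spark.jars.packages",
            "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0",
        )
        # Placeholder connection strings.
        .config("spark.mongodb.read.connection.uri", "mongodb://localhost:27017/test.coll")
        .config("spark.mongodb.write.connection.uri", "mongodb://localhost:27017/test.coll")
        .getOrCreate()
    )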

Tutorials
---------

5 changes: 0 additions & 5 deletions source/includes/data-source.rst

This file was deleted.

4 changes: 0 additions & 4 deletions source/includes/note-trigger-method.rst

This file was deleted.

13 changes: 0 additions & 13 deletions source/includes/scala-java-explicit-schema.rst

This file was deleted.

24 changes: 12 additions & 12 deletions source/index.txt
@@ -2,6 +2,18 @@
MongoDB Connector for Spark
===========================

.. toctree::
:titlesonly:

Get Started <getting-started>
Configure Spark <configuration>
Configure TLS/SSL <tls>
Batch Mode </batch-mode>
Streaming Mode </streaming-mode>
FAQ <faq>
Release Notes <release-notes>
API Documentation <api-docs>

The `MongoDB Connector for Spark
<https://www.mongodb.com/products/spark-connector>`_ provides
integration between MongoDB and Apache Spark.
@@ -41,15 +53,3 @@ versions of Apache Spark and MongoDB:
* - **{+current-version+}**
- **3.1 through 3.5**
- **4.0 or later**

.. toctree::
:titlesonly:

Getting Started <getting-started>
configuration
tls
/batch-mode
/streaming-mode
faq
release-notes
api-docs
4 changes: 2 additions & 2 deletions source/streaming-mode.txt
@@ -12,8 +12,8 @@ Streaming Mode

.. toctree::

/streaming-mode/streaming-read
/streaming-mode/streaming-write
Read </streaming-mode/streaming-read>
Write </streaming-mode/streaming-write>

Overview
--------
36 changes: 14 additions & 22 deletions source/streaming-mode/streaming-read-config.txt
@@ -82,12 +82,10 @@ You can configure the following properties when reading data from MongoDB in streaming mode

[{"$match": {"closed": false}}, {"$project": {"status": 1, "name": 1, "description": 1}}]

.. important::

Custom aggregation pipelines must be compatible with the
partitioner strategy. For example, aggregation stages such as
``$group`` do not work with any partitioner that creates more than
one partition.
Custom aggregation pipelines must be compatible with the
partitioner strategy. For example, aggregation stages such as
``$group`` do not work with any partitioner that creates more than
one partition.

* - ``aggregation.allowDiskUse``
- | Specifies whether to allow storage to disk when running the
@@ -135,14 +133,12 @@ You can configure the following properties when reading a change stream from MongoDB
original document and updated document, but it also includes a copy of the
entire updated document.

For more information on how this change stream option works,
see the MongoDB server manual guide
:manual:`Lookup Full Document for Update Operation </changeStreams/#lookup-full-document-for-update-operations>`.

**Default:** "default"

.. tip::

For more information on how this change stream option works,
see the MongoDB server manual guide
:manual:`Lookup Full Document for Update Operation </changeStreams/#lookup-full-document-for-update-operations>`.

* - ``change.stream.micro.batch.max.partition.count``
- | The maximum number of partitions the {+connector-short+} divides each
micro-batch into. Spark workers can process these partitions in parallel.
@@ -151,11 +147,9 @@ You can configure the following properties when reading a change stream from MongoDB
|
| **Default**: ``1``

.. warning:: Event Order

Specifying a value larger than ``1`` can alter the order in which
the {+connector-short+} processes change events. Avoid this setting
if out-of-order processing could create data inconsistencies downstream.
:red:`WARNING:` Specifying a value larger than ``1`` can alter the order in which
the {+connector-short+} processes change events. Avoid this setting
if out-of-order processing could create data inconsistencies downstream.

* - ``change.stream.publish.full.document.only``
- | Specifies whether to publish the changed document or the full
@@ -174,12 +168,10 @@ You can configure the following properties when reading a change stream from MongoDB
- If you don't specify a schema, the connector infers the schema
from the change stream document.

**Default**: ``false``
This setting overrides the ``change.stream.lookup.full.document``
setting.

.. note::

This setting overrides the ``change.stream.lookup.full.document``
setting.
**Default**: ``false``

* - ``change.stream.startup.mode``
- | Specifies how the connector starts up when no offset is available.
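
For context, the following is a minimal PySpark sketch of a streaming read that uses one of the change stream options described in this file. It is illustrative only and not part of this diff; the connection URI, namespace, and checkpoint path are assumptions.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("streaming-read-example").getOrCreate()

    stream_df = (
        spark.readStream.format("mongodb")
        # Placeholder connection details.
        .option("connection.uri", "mongodb://localhost:27017")
        .option("database", "sales")
        .option("collection", "orders")
        # Publish only the changed document instead of the full change event,
        # letting the connector infer the schema from the changed documents.
        .option("change.stream.publish.full.document.only", "true")
        .load()
    )

    query = (
        stream_df.writeStream.format("console")
        .option("checkpointLocation", "/tmp/checkpoint-dir")  # assumed path
        .outputMode("append")
        .start()
    )
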
2 changes: 1 addition & 1 deletion source/streaming-mode/streaming-read.txt
@@ -7,7 +7,7 @@ Read from MongoDB in Streaming Mode
.. toctree::
:caption: Streaming Read Configuration Options

/streaming-mode/streaming-read-config
Configuration </streaming-mode/streaming-read-config>

.. contents:: On this page
:local: