diff --git a/.github/workflows/check-autobuilder.yml b/.github/workflows/check-autobuilder.yml deleted file mode 100644 index 8495db96..00000000 --- a/.github/workflows/check-autobuilder.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: Check Autobuilder for Errors - -on: - pull_request: - paths: - - "source/**" - -jobs: - check: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: cbush/snooty-autobuilder-check@main diff --git a/.github/workflows/vale-tdbx.yml b/.github/workflows/vale-tdbx.yml index 284033ab..d748e941 100644 --- a/.github/workflows/vale-tdbx.yml +++ b/.github/workflows/vale-tdbx.yml @@ -12,23 +12,26 @@ jobs: - name: checkout uses: actions/checkout@master + - name: Install docutils + run: sudo apt-get install -y docutils + - id: files uses: masesgroup/retrieve-changed-files@v2 with: - format: 'csv' + format: "csv" - name: checkout-latest-rules uses: actions/checkout@master with: repository: mongodb/mongodb-vale-action - path: './tdbx-vale-rules' + path: "./tdbx-vale-rules" token: ${{secrets.GITHUB_TOKEN}} - name: move-files-for-vale-action run: | - cp tdbx-vale-rules/.vale.ini .vale.ini - mkdir -p .github/styles/ - cp -rf tdbx-vale-rules/.github/styles/ .github/ + cp tdbx-vale-rules/.vale.ini .vale.ini + mkdir -p .github/styles/ + cp -rf tdbx-vale-rules/.github/styles/ .github/ - name: run-vale uses: errata-ai/vale-action@reviewdog diff --git a/build.sh b/build.sh new file mode 100644 index 00000000..a5e15032 --- /dev/null +++ b/build.sh @@ -0,0 +1,7 @@ +# ensures that we always use the latest version of the script +if [ -f build-site.sh ]; then + rm build-site.sh +fi + +curl https://raw.githubusercontent.com/mongodb/docs-worker-pool/netlify-poc/scripts/build-site.sh -o build-site.sh +sh build-site.sh diff --git a/netlify.toml b/netlify.toml new file mode 100644 index 00000000..d0c89040 --- /dev/null +++ b/netlify.toml @@ -0,0 +1,6 @@ +[[integrations]] +name = "snooty-cache-plugin" + +[build] +publish = "snooty/public" +command = ". ./build.sh" diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 00000000..691bb46c --- /dev/null +++ b/package-lock.json @@ -0,0 +1,10 @@ +{ + "name": "docs-spark-connector", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "docs-spark-connector" + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 00000000..9f6b7cdb --- /dev/null +++ b/package.json @@ -0,0 +1,7 @@ +{ + "name": "docs-spark-connector", + "lockfileVersion": 3, + "requires": true, + "packages": {} + } + \ No newline at end of file diff --git a/source/batch-mode.txt b/source/batch-mode.txt index 5f5119a2..a48a84d0 100644 --- a/source/batch-mode.txt +++ b/source/batch-mode.txt @@ -10,8 +10,8 @@ Batch Mode .. toctree:: - /batch-mode/batch-read - /batch-mode/batch-write + Read + Write Overview -------- diff --git a/source/batch-mode/batch-read-config.txt b/source/batch-mode/batch-read-config.txt index 7233fb2f..18bd940f 100644 --- a/source/batch-mode/batch-read-config.txt +++ b/source/batch-mode/batch-read-config.txt @@ -10,6 +10,13 @@ Batch Read Configuration Options :depth: 1 :class: singlecol +.. facet:: + :name: genre + :values: reference + +.. meta:: + :keywords: partitioner, customize, settings + .. _spark-batch-input-conf: Overview @@ -107,12 +114,11 @@ You can configure the following properties when reading data from MongoDB in bat [{"$match": {"closed": false}}, {"$project": {"status": 1, "name": 1, "description": 1}}] - .. 
important:: - - Custom aggregation pipelines must be compatible with the - partitioner strategy. For example, aggregation stages such as - ``$group`` do not work with any partitioner that creates more than - one partition. + :gold:`IMPORTANT:` Custom aggregation pipelines must be + compatible with the partitioner strategy. For example, + aggregation stages such as + ``$group`` do not work with any partitioner that creates more + than one partition. * - ``aggregation.allowDiskUse`` - | Specifies whether to allow storage to disk when running the @@ -212,9 +218,12 @@ based on your shard configuration. To use this configuration, set the ``partitioner`` configuration option to ``com.mongodb.spark.sql.connector.read.partitioner.ShardedPartitioner``. -.. warning:: - - This partitioner is not compatible with hashed shard keys. +.. important:: ShardedPartitioner Restrictions + + 1. In MongoDB Server v6.0 and later, the sharding operation creates one large initial + chunk to cover all shard key values, making the sharded partitioner inefficient. + We do not recommend using the sharded partitioner when connected to MongoDB v6.0 and later. + 2. The sharded partitioner is not compatible with hashed shard keys. .. _conf-mongopaginatebysizepartitioner: .. _conf-paginatebysizepartitioner: diff --git a/source/batch-mode/batch-read.txt b/source/batch-mode/batch-read.txt index bc59ba90..ab636063 100644 --- a/source/batch-mode/batch-read.txt +++ b/source/batch-mode/batch-read.txt @@ -7,7 +7,7 @@ Read from MongoDB in Batch Mode .. toctree:: :caption: Batch Read Configuration Options - /batch-mode/batch-read-config + Configuration .. contents:: On this page :local: diff --git a/source/batch-mode/batch-write.txt b/source/batch-mode/batch-write.txt index c1fc1e03..e4dce8ad 100644 --- a/source/batch-mode/batch-write.txt +++ b/source/batch-mode/batch-write.txt @@ -7,7 +7,7 @@ Write to MongoDB in Batch Mode .. toctree:: :caption: Batch Write Configuration Options - /batch-mode/batch-write-config + Configuration Overview -------- @@ -48,7 +48,7 @@ Overview - Time-series collections To learn more about save modes, see the - `Spark SQL Guide `__. + `Spark SQL Guide `__. .. important:: diff --git a/source/getting-started.txt b/source/getting-started.txt index 0a16beea..e157df84 100644 --- a/source/getting-started.txt +++ b/source/getting-started.txt @@ -45,6 +45,38 @@ Getting Started .. include:: /scala/api.rst +Integrations +------------ + +The following sections describe some popular third-party platforms that you can +integrate Spark and the {+connector-long+} with. + +Amazon EMR +~~~~~~~~~~ + +Amazon EMR is a managed cluster platform that you can use to run big data frameworks like Spark. To install Spark on an EMR cluster, see +`Getting Started with Amazon EMR `__ in the AWS documentation. + +Databricks +~~~~~~~~~~ + +Databricks is an analytics platform for building, deploying, and sharing enterprise-level data. To integrate the {+connector-long+} with Databricks, +see `MongoDB `__ in the Databricks documentation. + +Docker +~~~~~~ + +Docker is an open-source platform that helps developers build, share, and run applications in containers. + +- To start Spark in a Docker container, see `Apache Spark `__ in the Docker documentation and follow the steps provided. +- To learn how to deploy Atlas on Docker, see `Create a Local Atlas Deployment with Docker `__. + +Kubernetes +~~~~~~~~~~ + +Kubernetes is an open-source platform for automating containerization management. 
To run Spark on Kubernetes, +see `Running Spark on Kubernetes `__ in the Spark documentation. + Tutorials --------- diff --git a/source/includes/data-source.rst b/source/includes/data-source.rst deleted file mode 100644 index 2f18028e..00000000 --- a/source/includes/data-source.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. note:: - - The empty argument ("") refers to a file to use as a data source. - In this case our data source is a MongoDB collection, so the data - source argument is empty. \ No newline at end of file diff --git a/source/includes/note-trigger-method.rst b/source/includes/note-trigger-method.rst deleted file mode 100644 index f9ad2d1d..00000000 --- a/source/includes/note-trigger-method.rst +++ /dev/null @@ -1,4 +0,0 @@ -.. note:: - - Call the ``trigger()`` method on the ``DataStreamWriter`` you create - from the ``DataStreamReader`` you configure. diff --git a/source/includes/scala-java-explicit-schema.rst b/source/includes/scala-java-explicit-schema.rst deleted file mode 100644 index 3b682cb1..00000000 --- a/source/includes/scala-java-explicit-schema.rst +++ /dev/null @@ -1,13 +0,0 @@ -By default, reading from MongoDB in a ``SparkSession`` infers the -schema by sampling documents from the collection. You can also use a -|class| to define the schema explicitly, thus removing the extra -queries needed for sampling. - -.. note:: - - If you provide a case class for the schema, MongoDB returns **only - the declared fields**. This helps minimize the data sent across the - wire. - -The following statement creates a ``Character`` |class| and then -uses it to define the schema for the DataFrame: diff --git a/source/index.txt b/source/index.txt index 1bad808c..9fed2ac5 100644 --- a/source/index.txt +++ b/source/index.txt @@ -2,6 +2,18 @@ MongoDB Connector for Spark =========================== +.. toctree:: + :titlesonly: + + Get Started + Configure Spark + Configure TLS/SSL + Batch Mode + Streaming Mode + FAQ + Release Notes + API Documentation + The `MongoDB Connector for Spark `_ provides integration between MongoDB and Apache Spark. @@ -41,15 +53,3 @@ versions of Apache Spark and MongoDB: * - **{+current-version+}** - **3.1 through 3.5** - **4.0 or later** - -.. toctree:: - :titlesonly: - - Getting Started - configuration - tls - /batch-mode - /streaming-mode - faq - release-notes - api-docs diff --git a/source/streaming-mode.txt b/source/streaming-mode.txt index 456695f6..9128ef92 100644 --- a/source/streaming-mode.txt +++ b/source/streaming-mode.txt @@ -12,8 +12,8 @@ Streaming Mode .. toctree:: - /streaming-mode/streaming-read - /streaming-mode/streaming-write + Read + Write Overview -------- diff --git a/source/streaming-mode/streaming-read-config.txt b/source/streaming-mode/streaming-read-config.txt index 997d175d..dd185fe1 100644 --- a/source/streaming-mode/streaming-read-config.txt +++ b/source/streaming-mode/streaming-read-config.txt @@ -82,12 +82,10 @@ You can configure the following properties when reading data from MongoDB in str [{"$match": {"closed": false}}, {"$project": {"status": 1, "name": 1, "description": 1}}] - .. important:: - - Custom aggregation pipelines must be compatible with the - partitioner strategy. For example, aggregation stages such as - ``$group`` do not work with any partitioner that creates more than - one partition. + Custom aggregation pipelines must be compatible with the + partitioner strategy. For example, aggregation stages such as + ``$group`` do not work with any partitioner that creates more than + one partition. 
* - ``aggregation.allowDiskUse`` - | Specifies whether to allow storage to disk when running the @@ -135,14 +133,12 @@ You can configure the following properties when reading a change stream from Mon original document and updated document, but it also includes a copy of the entire updated document. + For more information on how this change stream option works, + see the MongoDB server manual guide + :manual:`Lookup Full Document for Update Operation `. + **Default:** "default" - .. tip:: - - For more information on how this change stream option works, - see the MongoDB server manual guide - :manual:`Lookup Full Document for Update Operation `. - * - ``change.stream.micro.batch.max.partition.count`` - | The maximum number of partitions the {+connector-short+} divides each micro-batch into. Spark workers can process these partitions in parallel. @@ -151,11 +147,9 @@ You can configure the following properties when reading a change stream from Mon | | **Default**: ``1`` - .. warning:: Event Order - - Specifying a value larger than ``1`` can alter the order in which - the {+connector-short+} processes change events. Avoid this setting - if out-of-order processing could create data inconsistencies downstream. + :red:`WARNING:` Specifying a value larger than ``1`` can alter the order in which + the {+connector-short+} processes change events. Avoid this setting + if out-of-order processing could create data inconsistencies downstream. * - ``change.stream.publish.full.document.only`` - | Specifies whether to publish the changed document or the full @@ -174,12 +168,10 @@ You can configure the following properties when reading a change stream from Mon - If you don't specify a schema, the connector infers the schema from the change stream document. - **Default**: ``false`` + This setting overrides the ``change.stream.lookup.full.document`` + setting. - .. note:: - - This setting overrides the ``change.stream.lookup.full.document`` - setting. + **Default**: ``false`` * - ``change.stream.startup.mode`` - | Specifies how the connector starts up when no offset is available. diff --git a/source/streaming-mode/streaming-read.txt b/source/streaming-mode/streaming-read.txt index ac8fb7ba..4c50febe 100644 --- a/source/streaming-mode/streaming-read.txt +++ b/source/streaming-mode/streaming-read.txt @@ -7,7 +7,7 @@ Read from MongoDB in Streaming Mode .. toctree:: :caption: Streaming Read Configuration Options - /streaming-mode/streaming-read-config + Configuration .. contents:: On this page :local: diff --git a/source/streaming-mode/streaming-write.txt b/source/streaming-mode/streaming-write.txt index 60a6aa3f..854ca917 100644 --- a/source/streaming-mode/streaming-write.txt +++ b/source/streaming-mode/streaming-write.txt @@ -7,7 +7,7 @@ Write to MongoDB in Streaming Mode .. toctree:: :caption: Streaming Write Configuration Options - /streaming-mode/streaming-write-config + Configuration .. tabs-drivers:: @@ -51,7 +51,8 @@ Write to MongoDB in Streaming Mode * - ``writeStream.trigger()`` - Specifies how often the {+connector-short+} writes results - to the streaming sink. + to the streaming sink. Call this method on the ``DataStreamWriter`` object + you create from the ``DataStreamReader`` you configure. To use continuous processing, pass ``Trigger.Continuous(
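
As a point of reference for the streaming options touched in this diff, the following is a minimal PySpark sketch that reads a change stream with ``change.stream.publish.full.document.only`` enabled and writes it to another collection with a trigger. It is a sketch, not the connector documentation's own example: it assumes the MongoDB Spark Connector (10.x) is already on the classpath, the connection string, database, collection, and checkpoint path are placeholders rather than values from this PR, and ``trigger(continuous=...)`` is used as the Python counterpart of the ``Trigger.Continuous()`` call mentioned above.

.. code-block:: python

   from pyspark.sql import SparkSession

   # Assumes the MongoDB Spark Connector (10.x) is already available to Spark.
   spark = SparkSession.builder.appName("mongodb-streaming-sketch").getOrCreate()

   # Read a change stream, publishing only the full changed documents.
   # If no schema is supplied, the connector infers it from the change stream document.
   stream_df = (
       spark.readStream.format("mongodb")
       .option("spark.mongodb.connection.uri", "<connection-string>")   # placeholder
       .option("spark.mongodb.database", "<source-db>")                 # placeholder
       .option("spark.mongodb.collection", "<source-collection>")       # placeholder
       .option("change.stream.publish.full.document.only", "true")
       .load()
   )

   # Write the stream to a sink collection. trigger(continuous="1 second") plays the
   # role of writeStream.trigger(Trigger.Continuous(...)) in the Scala/Java examples.
   query = (
       stream_df.writeStream.format("mongodb")
       .option("checkpointLocation", "/tmp/mongo-stream-checkpoint")    # placeholder
       .option("spark.mongodb.connection.uri", "<connection-string>")   # placeholder
       .option("spark.mongodb.database", "<sink-db>")                   # placeholder
       .option("spark.mongodb.collection", "<sink-collection>")         # placeholder
       .outputMode("append")
       .trigger(continuous="1 second")
       .start()
   )

   query.awaitTermination()

Because the write runs with a continuous trigger, results are committed to the sink collection at the stated one-second checkpoint interval; swapping in ``trigger(processingTime="1 second")`` would instead use micro-batch processing, which is subject to the ``change.stream.micro.batch.max.partition.count`` ordering caveat described earlier in this diff.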