Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove lingering DASK_DATAFRAME__QUERY_PLANNING environment variables #346

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions tutorials/image-curation/image-curation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@
"source": [
"!pip install cython ipywidgets aiofiles\n",
"# Install from source by default\n",
"!pip install --extra-index-url https://pypi.nvidia.com ../../[image]\n",
"%env DASK_DATAFRAME__QUERY_PLANNING False"
"!pip install --extra-index-url https://pypi.nvidia.com ../../[image]"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ export CUDF_SPILL="1"
export RMM_SCHEDULER_POOL_SIZE="1GB"
export RMM_WORKER_POOL_SIZE="72GiB"
export LIBCUDF_CUFILE_POLICY=OFF
export DASK_DATAFRAME__QUERY_PLANNING=False


# =================================================================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,10 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nemo_curator\n",
"from dask.distributed import Client, LocalCluster\n",
"\n",
"# Start a Dask cluster with 12 workers, each limited at 64GB of memory. \n",
Expand Down Expand Up @@ -708,17 +707,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"DASK_DATAFRAME__QUERY_PLANNING\"] = \"False\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -734,7 +723,6 @@
],
"source": [
"from nemo_curator.utils.distributed_utils import get_client\n",
"import dask.dataframe\n",
"\n",
"def pre_imports():\n",
" import cudf \n",
Expand Down Expand Up @@ -1030,13 +1018,12 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from datasets import load_dataset as load_hf_dataset\n",
"from datasets import DownloadConfig "
"from datasets import load_dataset as load_hf_dataset"
]
},
{
Expand Down Expand Up @@ -1133,14 +1120,12 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nemo_curator import Modify\n",
"from nemo_curator.modifiers import UnicodeReformatter\n",
"from nemo_curator.utils.distributed_utils import read_data, write_to_disk\n",
"from nemo_curator.utils.file_utils import get_all_files_paths_under\n",
"from nemo_curator.utils.distributed_utils import write_to_disk\n",
"from nemo_curator.datasets import DocumentDataset"
]
},
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/0_processing/process_dclm.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

from dask.distributed import Client, LocalCluster
from helper import process_data
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/0_processing/process_dolma_cc.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

from dask.distributed import Client, LocalCluster
from helper import process_data
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/0_processing/process_fwe2.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import ctypes
import gc
import logging
import os
from pathlib import Path

from dask.distributed import Client, LocalCluster
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/0_processing/process_zyda.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

from dask.distributed import Client, LocalCluster
from helper import process_data
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os
import time

import dask_cudf
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/1_fuzzy_dedup/1_lsh.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os
import time

import cudf
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/1_fuzzy_dedup/2_buckets_to_edges.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os
import time

import dask_cudf
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os
import time

from nemo_curator.modules.fuzzy_dedup import ConnectedComponents
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/0_id_mapping.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import json
import logging
import os

import cudf
import dask_cudf
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/1_id_conversion.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import json
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/2_compute_counts.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
import json
import logging
import os
import time

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging

import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client, LocalCluster
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/3_prep_dupes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import json
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

import dask.dataframe as dd
import pandas as pd
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dclm.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import json
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import json
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/5_get_dupes_zyda.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import json
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/remove_dupes.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import argparse
import json
import logging
import os
import time

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
import argparse
import logging
import os
import time

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging

from nemo_curator.classifiers import QualityClassifier
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import get_client, get_num_workers
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/4_filtering/filter_fwe.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import logging
import os
import time

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging

import dask.dataframe as dd
import pyarrow as pa
from dask.distributed import Client, LocalCluster
Expand Down
6 changes: 1 addition & 5 deletions tutorials/zyda2-tutorial/4_filtering/filter_quality.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import argparse
import os
import time

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import time

import dask.dataframe as dd
import pyarrow as pa
Expand Down