
Commit 5ef68a8

Merge branch 'main' into dev/camintrinsics_and_video_undistort_ops

2 parents fa45d2d + 36e6389

30 files changed: +2778 −33 lines

.github/workflows/deploy_sphinx_docs.yml (33 additions, 2 deletions)
@@ -23,23 +23,48 @@ jobs:
       REPO_OWNER: ${{ github.repository_owner }}
       MIN_TAG: v1.4.0
     steps:
+      - name: Mount /mnt into workspace (writable)
+        run: |
+          set -euxo pipefail
+          sudo mkdir -p /mnt/repo
+          sudo chown -R "$USER:$USER" /mnt/repo
+          mkdir -p "$GITHUB_WORKSPACE/repo"
+          sudo mount --bind /mnt/repo "$GITHUB_WORKSPACE/repo"
+          sudo chown -R "$USER:$USER" "$GITHUB_WORKSPACE/repo"
+          ls -ld /mnt/repo "$GITHUB_WORKSPACE/repo"
       - name: Checkout
         uses: actions/checkout@v4
         with:
           fetch-depth: 0
+          path: repo
       - name: Setup Python ${{ matrix.python-version }}
         uses: actions/setup-python@master
         with:
           python-version: ${{ matrix.python-version }}
+      - name: Free disk space
+        run: |
+          sudo swapoff -a
+          sudo rm -f /swapfile
+          sudo apt-get autoremove -y >/dev/null 2>&1
+          sudo apt-get autoclean -y >/dev/null 2>&1
+          sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
+          sudo apt clean
+          df -h
       - name: Install uv
         uses: astral-sh/setup-uv@v7
         with:
           enable-cache: true
       - name: Install dependencies with uv
+        working-directory: repo
         run: |
           uv pip install --system --upgrade pip
           uv pip install --system -e .[all]
+      - name: Check disk
+        run: |
+          set -euxo pipefail
+          df -h
       - name: Fetch Data-Juicer Sphinx Template
+        working-directory: repo
         run: |
           set -e
           echo "Cloning sphinx template..."
@@ -57,31 +82,37 @@
           echo "Restoring custom files..."
           cp -rf /tmp/custom_files/source/* docs/sphinx_doc/source
           echo "Done!"
+          df -h
       - name: Get git tags
+        working-directory: repo
         run: |
           git fetch --all --tags
           git branch -a
           git tag
       - id: build
         name: Build Documentation
+        working-directory: repo
         run: |
           cd docs/sphinx_doc
           python build_versions.py --tags
+          df -h
       - name: Redirect index.html
+        working-directory: repo
         run: |
           REPOSITORY_OWNER="${GITHUB_REPOSITORY_OWNER}"
           cd docs/sphinx_doc
           cp ./redirect.html build/index.html
           sed -i "s/\[REPOSITORY_OWNER\]/${REPOSITORY_OWNER}/g" build/index.html
           sed -i "s/\[PROJECT\]/${PROJECT}/g" build/index.html
           cp build/index.html build/404.html
+          df -h
       - name: Upload Documentation
         uses: actions/upload-artifact@v4
         with:
           name: SphinxDoc
-          path: "docs/sphinx_doc/build"
+          path: "repo/docs/sphinx_doc/build"
       - uses: peaceiris/actions-gh-pages@v3
         if: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) }}
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          publish_dir: "docs/sphinx_doc/build"
+          publish_dir: "repo/docs/sphinx_doc/build"
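The new mount step works around the small root volume on GitHub-hosted runners: /mnt typically sits on a separate, larger temporary disk, so bind-mounting a /mnt directory over the checkout location gives the repository and the multi-version docs build more headroom. A quick way to see the difference on a runner (an illustrative snippet, not part of the workflow):

import shutil

# Compare free space on the root volume vs the runner's temporary disk.
for mount in ("/", "/mnt"):
    usage = shutil.disk_usage(mount)
    print(f"{mount}: {usage.free / 2**30:.1f} GiB free of {usage.total / 2**30:.1f} GiB")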

.pre-commit-hooks/build_op_doc.py (4 additions, 0 deletions)
@@ -316,6 +316,8 @@ def get_op_list_from_code_for_formatter():
         test_path = os.path.join(FORMATTER_TEST_PREFIX, f"test_{formatter}")
         if os.path.isdir(code_path):
             continue
+        if "_cpp" in code_path:
+            continue
         docstrings = get_class_and_docstring(code_path)
         _, doc = docstrings[0]
         op_record_list.append(
@@ -351,6 +353,8 @@ def get_op_list_from_code():
             test_path = os.path.join(OP_TEST_PREFIX, type, f"test_{op}")
             if os.path.isdir(code_path):
                 continue
+            if not code_path.endswith(".py") or "_cpp" in code_path:
+                continue
             docstrings = get_class_and_docstring(code_path)
             _, doc = docstrings[0]
             info = info_link(op.replace(".py", ""))
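Both guards keep the doc generator from treating C++ sources and their `_cpp` wrapper modules as documentable operators before `get_class_and_docstring` runs. A minimal sketch of the second, stricter guard (the helper name and sample paths are illustrative, not from the repository):

# Hypothetical helper mirroring the stricter of the two new guards above.
def should_skip_for_docs(code_path: str) -> bool:
    """True for any path the op-doc generator should ignore."""
    return not code_path.endswith(".py") or "_cpp" in code_path

assert should_skip_for_docs("deduplicator/minhash.cpp")                          # not a .py file
assert should_skip_for_docs("deduplicator/ray_bts_minhash_cpp_deduplicator.py")  # C++-backed wrapper
assert not should_skip_for_docs("deduplicator/ray_bts_minhash_deduplicator.py")  # regular Python op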

data_juicer/core/data/load_strategy.py (26 additions, 10 deletions)
@@ -558,6 +558,7 @@ class RayS3DataLoadStrategy(RayDataLoadStrategy):
             "aws_session_token",
             "aws_region",
             "endpoint_url",
+            "format",
         ],
         "field_types": {"path": str},
         "custom_validators": {
@@ -590,23 +591,38 @@ def load_data(self, **kwargs):
         }

         auto_detect = False
-        data_source = self.ds_config.get("source", None)
-        if data_source is None:
+        data_format = self.ds_config.get("format", None)
+        if data_format is None:
             auto_detect = True
         else:
-            suffix = os.path.splitext(data_source)[1]
-            if suffix in file_extension_map:
-                data_format = file_extension_map[suffix]
-            elif "." + data_source in file_extension_map:
-                data_format = file_extension_map["." + data_source]
+            # First check if it's already a valid format name
+            valid_formats = set(file_extension_map.values())
+            if data_format in valid_formats:
+                pass  # It's a valid format name, use it as is
             else:
-                auto_detect = True
+                # Try to interpret as an extension or filename
+                suffix = os.path.splitext(data_format)[1]
+                if suffix in file_extension_map:
+                    data_format = file_extension_map[suffix]
+                elif "." + data_format in file_extension_map:
+                    data_format = file_extension_map["." + data_format]
+                else:
+                    auto_detect = True

         if auto_detect:
             # Extract extension from path
             file_extension = os.path.splitext(path)[1]
-            data_format = file_extension_map.get(file_extension, "parquet")  # Default to parquet for S3
-            logger.info(f"Auto-detected data format: {data_format}")
+            if file_extension in file_extension_map:
+                data_format = file_extension_map[file_extension]
+                logger.info(f"Auto-detected data format: {data_format} from extension: {file_extension}")
+            else:
+                data_format = "parquet"
+                logger.warning(
+                    f"Could not determine data format from path '{path}' "
+                    f"(extension: '{file_extension or '(none)'}'), "
+                    f"defaulting to 'parquet'. "
+                    f"Consider explicitly specifying 'format' field in dataset config."
+                )
         else:
             logger.info(f"Using specified data format: {data_format}")

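Taken together, the rewritten block resolves the format in a fixed order: an explicit format name from the config wins; otherwise the `format` value is interpreted as a filename or bare suffix; failing that, the extension of `path` is tried; only then does the loader fall back to parquet with a warning. A standalone sketch of that order (the map below is a hypothetical abbreviation of the module's `file_extension_map`, and `resolve_format` is not a repository function):

import os

# Hypothetical, abbreviated stand-in for load_strategy.py's file_extension_map.
file_extension_map = {".json": "json", ".jsonl": "json", ".parquet": "parquet", ".csv": "csv"}

def resolve_format(path: str, fmt=None) -> str:
    """Mirror the resolution order of the new load_data logic (sketch only)."""
    if fmt in set(file_extension_map.values()):
        return fmt                                     # 1. already a valid format name
    if fmt is not None:
        suffix = os.path.splitext(fmt)[1]
        if suffix in file_extension_map:
            return file_extension_map[suffix]          # 2. looks like a filename, e.g. "data.jsonl"
        if "." + fmt in file_extension_map:
            return file_extension_map["." + fmt]       # 3. bare suffix, e.g. "jsonl"
    ext = os.path.splitext(path)[1]
    return file_extension_map.get(ext, "parquet")      # 4. auto-detect, else default

assert resolve_format("s3://bucket/data.csv") == "csv"              # auto-detected from path
assert resolve_format("s3://bucket/data", fmt="data.jsonl") == "json"
assert resolve_format("s3://bucket/data", fmt="jsonl") == "json"
assert resolve_format("s3://bucket/data") == "parquet"              # fallback (real code warns here)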
data_juicer/ops/deduplicator/__init__.py (2 additions, 0 deletions)
@@ -6,6 +6,7 @@
 from .document_simhash_deduplicator import DocumentSimhashDeduplicator
 from .image_deduplicator import ImageDeduplicator
 from .ray_basic_deduplicator import RayBasicDeduplicator
+from .ray_bts_minhash_cpp_deduplicator import RayBTSMinhashCppDeduplicator
 from .ray_bts_minhash_deduplicator import (
     RayBTSMinhashDeduplicator,
     RayBTSMinhashDeduplicatorWithUid,
@@ -27,5 +28,6 @@
     "RayVideoDeduplicator",
     "RayBTSMinhashDeduplicator",
     "RayBTSMinhashDeduplicatorWithUid",
+    "RayBTSMinhashCppDeduplicator",
     "VideoDeduplicator",
 ]
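With this export in place, the C++-accelerated variant is importable next to the pure-Python one (constructor arguments are not shown in this diff, so none are assumed here):

# Both names now resolve from the package's public API.
from data_juicer.ops.deduplicator import (
    RayBTSMinhashCppDeduplicator,  # new: backed by the pybind11 minhash module below
    RayBTSMinhashDeduplicator,
)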
[new file] (195 additions, 0 deletions)
@@ -0,0 +1,195 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+#include <vector>
+#include <cstdint>
+#include <algorithm>
+#include <omp.h>
+
+namespace py = pybind11;
+
+// Constants
+const uint32_t MERSENNE_PRIME = 2147483647; // 2^31 - 1
+const uint32_t MAX_HASH = 4294967295; // 2^32 - 1
+
+uint32_t simple_hash(const std::string& token) {
+    uint32_t hash = 5381;
+    for (const uint8_t c : token) {
+        hash = ((hash << 5) + hash) + c;
+    }
+    return hash;
+}
+
+std::vector<std::tuple<uint32_t, py::bytes, uint64_t>> calc_minhash_c(
+    const std::vector<std::string>& tokens,
+    const py::array_t<uint32_t>& perm_a,
+    const py::array_t<uint32_t>& perm_b,
+    const py::bytes& empty_hash_value,
+    const std::vector<std::pair<size_t, size_t>>& hash_ranges,
+    uint32_t union_find_parallel_num,
+    uint64_t uid)
+{
+    std::vector<std::tuple<uint32_t, py::bytes, uint64_t>> pairs;
+
+    if (tokens.empty()) {
+        pairs.emplace_back(MAX_HASH % union_find_parallel_num, empty_hash_value, uid);
+        return pairs;
+    }
+
+    std::vector<uint32_t> hv;
+    hv.reserve(tokens.size());
+    for (const std::string& token : tokens) {
+        hv.push_back(simple_hash(token));
+    }
+
+    auto perm_a_data = perm_a.unchecked<1>();
+    auto perm_b_data = perm_b.unchecked<1>();
+    size_t num_permutation = perm_a.shape(0);
+
+    std::vector<uint32_t> hash_values(num_permutation, MAX_HASH);
+    for (size_t i = 0; i < num_permutation; ++i) {
+        for (uint32_t h : hv) {
+            uint32_t phv = ((static_cast<uint64_t>(h) * perm_a_data(i) + perm_b_data(i)) % MERSENNE_PRIME) & MAX_HASH;
+            hash_values[i] = std::min(hash_values[i], phv);
+        }
+    }
+
+
+    for (size_t i = 0; i < hash_ranges.size(); ++i) {
+        const auto& [start, end] = hash_ranges[i];
+        std::vector<uint32_t> band_hash_values(hash_values.begin() + start, hash_values.begin() + end);
+
+        py::bytes hash_value = py::bytes(
+            std::string(reinterpret_cast<char*>(&i), sizeof(uint32_t)) +
+            std::string(reinterpret_cast<char*>(band_hash_values.data()), band_hash_values.size() * sizeof(uint32_t))
+        );
+
+        uint32_t hash_table_id = hash_values[start] % union_find_parallel_num;
+        pairs.emplace_back(hash_table_id, hash_value, uid);
+    }
+
+    return pairs;
+}
+
+py::list calc_minhash_batch_c(
+    const std::vector<std::vector<std::string>>& tokens_list,
+    const uint64_t uid_begin,
+    const std::vector<uint64_t>& perm_a,
+    const std::vector<uint64_t>& perm_b,
+    const std::string& empty_hash_value,
+    const std::vector<std::pair<size_t, size_t>>& hash_ranges,
+    uint32_t union_find_parallel_num,
+    uint32_t num_threads)
+{
+    omp_set_num_threads(num_threads);
+    size_t total_docs = tokens_list.size();
+    std::vector<std::tuple<uint32_t, std::string, uint64_t>> intermediate_pairs;
+    intermediate_pairs.reserve(total_docs * hash_ranges.size());
+
+    size_t num_permutation = perm_a.size();
+
+    #pragma omp parallel
+    {
+        std::vector<std::tuple<uint32_t, std::string, uint64_t>> local_pairs;
+        local_pairs.reserve(total_docs * hash_ranges.size() / num_threads);
+        std::vector<uint32_t> hash_values(num_permutation);
+
+        #pragma omp for nowait
+        for (size_t doc_idx = 0; doc_idx < total_docs; ++doc_idx) {
+            const auto& tokens = tokens_list[doc_idx];
+            uint64_t uid = uid_begin + doc_idx;
+
+            if (tokens.empty()) {
+                local_pairs.emplace_back(MAX_HASH % union_find_parallel_num, empty_hash_value, uid);
+                continue;
+            }
+
+            std::fill(hash_values.begin(), hash_values.end(), MAX_HASH);
+            for (const auto& token : tokens) {
+                uint32_t h = simple_hash(token);
+                for (size_t i = 0; i < num_permutation; ++i) {
+                    uint32_t phv = (static_cast<uint64_t>(h) * perm_a[i] + perm_b[i]) >> 32;
+                    hash_values[i] = std::min(hash_values[i], phv);
+                }
+            }
+
+            for (size_t i = 0; i < hash_ranges.size(); ++i) {
+                const auto& [start, end] = hash_ranges[i];
+                std::string hash_value(reinterpret_cast<char*>(&i), sizeof(uint32_t));
+                hash_value.append(reinterpret_cast<char*>(&hash_values[start]), (end - start) * sizeof(uint32_t));
+
+                uint32_t hash_table_id = hash_values[start] % union_find_parallel_num;
+                local_pairs.emplace_back(hash_table_id, std::move(hash_value), uid);
+            }
+        }
+
+        #pragma omp critical
+        {
+            intermediate_pairs.insert(intermediate_pairs.end(), local_pairs.begin(), local_pairs.end());
+        }
+    }
+    py::list result;
+    for (const auto& item : intermediate_pairs) {
+        uint32_t first = std::get<0>(item);
+        py::bytes second = py::bytes(std::get<1>(item));
+        uint64_t third = std::get<2>(item);
+        result.append(py::make_tuple(first, second, third));
+    }
+    return result;
+}
+
+std::vector<std::tuple<uint32_t, py::bytes>> calc_simple_minhash_c(
+    const std::vector<std::string>& tokens,
+    const py::array_t<uint32_t>& perm_a,
+    const py::array_t<uint32_t>& perm_b,
+    const std::vector<std::pair<size_t, size_t>>& hash_ranges,
+    uint32_t bucket_per_band,
+    uint64_t uid)
+{
+    std::vector<std::tuple<uint32_t, py::bytes>> pairs;
+
+    if (tokens.empty()) {
+        pairs.emplace_back(0, py::bytes(""));
+        return pairs;
+    }
+
+    std::vector<uint32_t> hv;
+    hv.reserve(tokens.size());
+    for (const std::string& token : tokens) {
+        hv.push_back(simple_hash(token));
+    }
+
+    auto perm_a_data = perm_a.unchecked<1>();
+    auto perm_b_data = perm_b.unchecked<1>();
+    size_t num_permutation = perm_a.shape(0);
+
+    std::vector<uint32_t> hash_values(num_permutation, MAX_HASH);
+    for (size_t i = 0; i < num_permutation; ++i) {
+        for (uint32_t h : hv) {
+            uint32_t phv = ((static_cast<uint64_t>(h) * perm_a_data(i) + perm_b_data(i)) % MERSENNE_PRIME) & MAX_HASH;
+            hash_values[i] = std::min(hash_values[i], phv);
+        }
+    }
+
+
+    for (size_t i = 0; i < hash_ranges.size(); ++i) {
+        const auto& [start, end] = hash_ranges[i];
+        std::vector<uint32_t> band_hash_values(hash_values.begin() + start, hash_values.begin() + end);
+
+        py::bytes hash_value = py::bytes(
+            std::string(reinterpret_cast<char*>(band_hash_values.data()), band_hash_values.size() * sizeof(uint32_t))
+        );
+
+        uint32_t hash_table_id = bucket_per_band * i + (hash_values[start] % bucket_per_band);
+        pairs.emplace_back(hash_table_id, hash_value);
+    }
+
+    return pairs;
+}
+
+
+PYBIND11_MODULE(minhash, m) {
+    m.def("calc_minhash_c", &calc_minhash_c, "C++ implementation of calc_minhash");
+    m.def("calc_simple_minhash_c", &calc_simple_minhash_c, "C++ implementation of calc_simple_minhash");
+    m.def("calc_minhash_batch_c", &calc_minhash_batch_c, "C++ implementation of calc_minhash (batch version)");
+}
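For readers following the C++ in Python terms: per document, the batch function computes a djb2 hash for every token, one multiply-shift "permutation" per MinHash slot (the high 32 bits of h * a + b in 64-bit arithmetic), takes the minimum over all tokens, and emits one (hash_table_id, band_bytes, uid) tuple per LSH band. A rough, unoptimized Python equivalent of calc_minhash_batch_c is sketched below; it is for understanding only and is not part of the commit (it assumes a little-endian layout, matching the raw byte copies in the C++):

import struct

def simple_hash(token: str) -> int:
    """djb2 over UTF-8 bytes, matching simple_hash() above (32-bit wraparound)."""
    h = 5381
    for c in token.encode("utf-8"):
        h = ((h << 5) + h + c) & 0xFFFFFFFF
    return h

def calc_minhash_batch(tokens_list, uid_begin, perm_a, perm_b,
                       empty_hash_value, hash_ranges, union_find_parallel_num):
    MAX_HASH = 0xFFFFFFFF
    pairs = []
    for doc_idx, tokens in enumerate(tokens_list):
        uid = uid_begin + doc_idx
        if not tokens:  # empty docs all share one sentinel signature
            pairs.append((MAX_HASH % union_find_parallel_num, empty_hash_value, uid))
            continue
        # MinHash: minimum permuted hash per slot, over all tokens.
        hash_values = [MAX_HASH] * len(perm_a)
        for token in tokens:
            h = simple_hash(token)
            for i, (a, b) in enumerate(zip(perm_a, perm_b)):
                # High 32 bits of the 64-bit product, as in the C++ batch path.
                phv = ((h * a + b) & 0xFFFFFFFFFFFFFFFF) >> 32
                if phv < hash_values[i]:
                    hash_values[i] = phv
        # One record per LSH band: the band index plus that band's slot minima as
        # raw little-endian uint32s, routed to a union-find shard by the first slot.
        for i, (start, end) in enumerate(hash_ranges):
            band = struct.pack(f"<{1 + end - start}I", i, *hash_values[start:end])
            pairs.append((hash_values[start] % union_find_parallel_num, band, uid))
    return pairs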
