4 changes: 4 additions & 0 deletions README.md
@@ -9,6 +9,10 @@ All included datasets are listed below. Details and files can be found in the re

### Supervised

* [binding](supervised/binding)
* [conservation](supervised/conservation)
* [disorder](supervised/disorder)
* [membrane](supervised/membrane)
* [subcellular location](supervised/scl)
* [secondary structure](supervised/secondary_structure)

53 changes: 53 additions & 0 deletions scripts/sanity_check_datasets.py
@@ -0,0 +1,53 @@
from pathlib import Path

_test_sets = ["test", "newPISCES364", "casp12", "casp13", "casp14"]


def read_fasta(file_path: Path):
    # Parse a biotrainer-style FASTA file into {seq_id: {"target": ..., "set": ..., "seq": ...}}.
    # Assumes each sequence occupies a single line directly below its header.
    seq_records = {}
    seq_ids = []
    with open(file_path, "r") as f:
        lines = f.readlines()
    for line in lines:
        if line.startswith(">"):
            seq_id = line.split(">")[1].split(" ")[0].strip()
            seq_ids.append(seq_id)
            target = line.split("TARGET=")[1].split(" ")[0].strip()
            split = line.split("SET=")[1].split(" ")[0].strip()
            seq_records[seq_id] = {"target": target, "set": split}
        else:
            seq_records[seq_id]["seq"] = line.strip()
    assert len(seq_records) == len(seq_ids)  # No duplicate ids in the file
    return seq_records


def _check(seq_records: dict[str, dict[str, str]]):
    assert len(seq_records) > 0
    assert len(seq_records) == len(set(seq_records.keys()))  # No duplicate ids
    seqs = [record["seq"] for record in seq_records.values()]
    assert len(seq_records) == len(set(seqs))  # No duplicate sequences

    train_seqs = set([record["seq"] for record in seq_records.values() if record["set"] == "train"])
    val_seqs = set([record["seq"] for record in seq_records.values() if record["set"] == "val"])
    test_seqs = set([record["seq"] for record in seq_records.values() if record["set"] in _test_sets])

    # Test sequences must not leak into train or val
    for seq in test_seqs:
        assert seq not in train_seqs
        assert seq not in val_seqs

    for seq_id, record in seq_records.items():
        assert record["set"] in ["train", "val", "test", *_test_sets]
        assert len(record["seq"]) > 0
        target = record["target"]
        if ";" in target:
            target = target.split(";")
        assert len(target) > 0
        # Per-residue targets (heuristically: raw target string longer than 25 characters)
        # must provide one label per residue
        if len(record["target"]) > 25:
            assert len(record["seq"]) == len(target)


def sanity_check(dataset_paths: list[Path]):
    for dataset_path in dataset_paths:
        print(f"Checking {dataset_path}...")
        seq_records = read_fasta(dataset_path)
        _check(seq_records)
        print(f"Checked {dataset_path}!")
15 changes: 15 additions & 0 deletions scripts/zip_datasets.py
@@ -7,6 +7,8 @@
import zipfile
from pathlib import Path

from sanity_check_datasets import sanity_check


def main() -> None:
    # Resolve repository root as the parent of this script's directory
@@ -18,11 +20,24 @@ def main() -> None:
    include_paths = [
        Path("LICENSE"),
        Path("README.md"),
        Path("supervised/binding/binding_combined.fasta"),
        Path("supervised/binding/binding_metal.fasta"),
        Path("supervised/binding/binding_nuclear.fasta"),
        Path("supervised/binding/binding_small.fasta"),
        Path("supervised/binding/README.md"),
        Path("supervised/conservation/conservation.fasta"),
        Path("supervised/conservation/README.md"),
        Path("supervised/disorder/disorder.fasta"),
        Path("supervised/disorder/README.md"),
        Path("supervised/membrane/membrane.fasta"),
        Path("supervised/membrane/README.md"),
        Path("supervised/scl/scl.fasta"),
        Path("supervised/scl/README.md"),
        Path("supervised/secondary_structure/secondary_structure.fasta"),
        Path("supervised/secondary_structure/README.md"),
    ]
    # Sanity-check every bundled FASTA file before creating the archive
    fasta_file_paths = [Path("..") / file for file in include_paths if file.name.endswith(".fasta")]
    sanity_check(fasta_file_paths)

    # Archive output path (in repo root)
    archive_path = repo_root / zip_file_name
53 changes: 53 additions & 0 deletions supervised/binding/README.md
@@ -0,0 +1,53 @@
# Binding

## Description

The datasets provided here aim at per-residue prediction of protein binding (2-state).
We provide the following four datasets:
* `binding_metal.fasta`: Binding to metal ions (0/1)
* `binding_nuclear.fasta`: Binding to nucleic acids (0/1)
* `binding_small.fasta`: Binding to small molecules (0/1)
* `binding_combined.fasta`: Binding to metal ions, nucleic acids, or small molecules (0/1)

## Dataset Compilation

The provided datasets were compiled from the
[data provided in the bindEmbed repository](https://github.com/Rostlab/bindPredict/tree/master/data).

* Training: Data from the [development set](https://github.com/Rostlab/bindPredict/tree/master/data/development_set)
* Validation: Stratified random 10% split of the training data (see the sketch below)
* Test: Data from the [independent set](https://github.com/Rostlab/bindPredict/tree/master/data/independent_set)
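
How exactly the stratified 10% split was drawn is not documented here; the following is a minimal, hypothetical sketch using scikit-learn's `train_test_split`, assuming a binary per-sequence label (e.g. whether a protein has any annotated binding residue) is used for stratification. The ids and labels are placeholders.

```python
# Hypothetical sketch of a stratified 10% validation split; NOT the exact
# procedure used to build these files.
from sklearn.model_selection import train_test_split

seq_ids = [f"SEQ_{i:04d}" for i in range(1000)]   # placeholder sequence ids
has_binding = [i % 2 for i in range(1000)]        # placeholder per-sequence labels

train_ids, val_ids = train_test_split(
    seq_ids,
    test_size=0.1,           # 10% of the training data becomes the validation set
    stratify=has_binding,    # keep the label distribution the same in both splits
    random_state=42,
)
```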

## Dataset Format

The datasets are provided in [biotrainer-ready](https://github.com/sacdallago/biotrainer) FASTA format.
Each entry consists of a header line, providing the sequence id, the set (train/val/test), and the target labels, followed by the sequence.
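
For illustration, here is a hypothetical two-entry excerpt (ids, labels, and sequences are made up; the `TARGET=`/`SET=` header fields follow the convention parsed by `scripts/sanity_check_datasets.py`):

```fasta
>SEQ_0001 TARGET=0010000100 SET=train
MKTAYIAKQR
>SEQ_0002 TARGET=0000100000 SET=val
GASDLLKQWE
```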

## Dataset Benchmarks

The [bindEmbed paper](https://doi.org/10.1038/s41598-021-03431-4) contains benchmarks
for the binding prediction tasks. The [TestSetNew46](https://www.nature.com/articles/s41598-021-03431-4/figures/1)
is the independent set used for these datasets.

## Citations

```bibtex
@Article{Littmann2021b,
author = {Littmann, Maria and Heinzinger, Michael and Dallago, Christian and Weissenow, Konstantin and Rost, Burkhard},
journal = {Scientific Reports},
title = {Protein embeddings and deep learning predict binding residues for various ligand classes},
year = {2021},
issn = {2045-2322},
month = dec,
number = {1},
volume = {11},
doi = {10.1038/s41598-021-03431-4},
publisher = {Springer Science and Business Media LLC},
}
```

## Data licensing

The raw data downloaded from the aforementioned publication is subject
to the [MIT license](https://opensource.org/license/MIT).
The modified data available in this repository falls under the [AFL-3.0 license](https://opensource.org/licenses/AFL-3.0).