Skip to content

Commit d0781de

Browse files
committed
Add DB5 dataset support to pipeline
1 parent bfefd17 commit d0781de

13 files changed

+664
-79
lines changed

.gitignore

+6-3
Original file line numberDiff line numberDiff line change
@@ -133,12 +133,15 @@ dmypy.json
133133
.vscode
134134
.run
135135

136-
# DIPS-Plus
137-
project/datasets/DIPS/final
138-
139136
# CASP-CAPRI
140137
project/datasets/CASP_CAPRI/final
141138

139+
# DB5-Plus
140+
project/datasets/DB5/final
141+
142+
# DIPS-Plus
143+
project/datasets/DIPS/final
144+
142145
# Input
143146
project/datasets/Input/raw
144147
project/datasets/Input/interim

README.md

+17-7
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,12 @@ A geometric deep learning pipeline for predicting protein interface contacts.
2121
If you use the code or data associated with this package, please cite:
2222

2323
```bibtex
24-
@article{morehead2021deepinteract,
25-
title = {Geometric Transformers for Protein Interface Contact Prediction},
26-
author = {Alex Morehead, Chen Chen, and Jianlin Cheng},
27-
year = {2021},
28-
eprint = {N/A},
29-
archivePrefix = {arXiv},
30-
primaryClass = {cs.LG}
24+
@inproceedings{morehead2022geometric,
25+
title={Geometric Transformers for Protein Interface Contact Prediction},
26+
author={Alex Morehead and Chen Chen and Jianlin Cheng},
27+
booktitle={International Conference on Learning Representations},
28+
year={2022},
29+
url={https://openreview.net/forum?id=CS4463zx6Hi}
3130
}
3231
```
3332

@@ -126,6 +125,17 @@ DeepInteract
126125
│ │
127126
│ └───builder
128127
│ │
128+
│ └───DB5
129+
│ │ │
130+
│ │ └───final
131+
│ │ │ │
132+
│ │ │ └───processed
133+
│ │ │ │
134+
│ │ │ └───raw
135+
│ │ │
136+
│ │ db5_dgl_data_module.py
137+
│ │ db5_dgl_dataset.py
138+
│ │
129139
│ └───CASP_CAPRI
130140
│ │ │
131141
│ │ └───final

project/datasets/CASP_CAPRI/casp_capri_dgl_data_module.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,15 @@ class CASPCAPRIDGLDataModule(LightningDataModule):
1818
# Dataset partition instantiations
1919
casp_capri_test = None
2020

21-
def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int, self_loops: bool,
22-
pn_ratio: float, percent_to_use: float, process_complexes: bool, input_indep: bool):
21+
def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int,
22+
self_loops: bool, percent_to_use: float, process_complexes: bool, input_indep: bool):
2323
super().__init__()
2424

2525
self.data_dir = data_dir
2626
self.batch_size = batch_size
2727
self.num_dataloader_workers = num_dataloader_workers
2828
self.knn = knn
2929
self.self_loops = self_loops
30-
self.pn_ratio = pn_ratio
3130
self.percent_to_use = percent_to_use # Fraction of CASP-CAPRI dataset splits to use
3231
self.process_complexes = process_complexes # Whether to process any unprocessed complexes before training
3332
self.input_indep = input_indep # Whether to use an input-independent pipeline to train the model
@@ -36,7 +35,7 @@ def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int,
3635
def setup(self, stage: Optional[str] = None):
3736
# Assign testing dataset for use in DataLoaders - called on every GPU
3837
self.casp_capri_test = CASPCAPRIDGLDataset(mode='test', raw_dir=self.data_dir, knn=self.knn,
39-
geo_nbrhd_size=2, self_loops=self.self_loops, pn_ratio=self.pn_ratio,
38+
geo_nbrhd_size=2, self_loops=self.self_loops,
4039
percent_to_use=self.percent_to_use,
4140
process_complexes=self.process_complexes,
4241
input_indep=self.input_indep)

project/datasets/CASP_CAPRI/casp_capri_dgl_dataset.py

-5
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ class CASPCAPRIDGLDataset(DGLDataset):
3434
Size of each edge's neighborhood when updating geometric edge features. Default: 2.
3535
self_loops: bool
3636
Whether to connect a given node to itself. Default: True.
37-
pn_ratio: float
38-
The positive-negative ratio to use when assembling training labels for node-node pairs. Default: 0.1.
3937
percent_to_use: float
4038
How much of the dataset to load. Default: 1.00.
4139
process_complexes: bool
@@ -68,21 +66,18 @@ def __init__(self,
6866
knn=20,
6967
geo_nbrhd_size=2,
7068
self_loops=True,
71-
pn_ratio=0.1,
7269
percent_to_use=1.00,
7370
process_complexes=True,
7471
input_indep=False,
7572
force_reload=False,
7673
verbose=False):
7774
assert mode in ['test']
78-
assert 0.0 < pn_ratio <= 1.0
7975
assert 0.0 < percent_to_use <= 1.0
8076
self.mode = mode
8177
self.root = raw_dir
8278
self.knn = knn
8379
self.geo_nbrhd_size = geo_nbrhd_size
8480
self.self_loops = self_loops
85-
self.pn_ratio = pn_ratio
8681
self.percent_to_use = percent_to_use # How much of the dataset (e.g. CASP-CAPRI training dataset) to use
8782
self.process_complexes = process_complexes # Whether to process any unprocessed complexes before training
8883
self.input_indep = input_indep # Whether to use an input-independent pipeline to train the model
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from typing import Optional
2+
3+
from pytorch_lightning import LightningDataModule
4+
from torch.utils.data import DataLoader
5+
6+
from project.datasets.DB5.db5_dgl_dataset import DB5DGLDataset
7+
from project.utils.deepinteract_utils import dgl_picp_collate
8+
9+
10+
class DB5DGLDataModule(LightningDataModule):
    """LightningDataModule serving the DB5-Plus unbound protein complex dataset as DGL graphs."""

    # Dataset partition instantiations (populated by setup())
    db5_train = None
    db5_val = None
    db5_test = None

    def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int,
                 self_loops: bool, percent_to_use: float, process_complexes: bool, input_indep: bool):
        super().__init__()

        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_dataloader_workers = num_dataloader_workers
        self.knn = knn
        self.self_loops = self_loops
        self.percent_to_use = percent_to_use  # Fraction of DB5 dataset splits to use
        self.process_complexes = process_complexes  # Whether to process any unprocessed complexes before training
        self.input_indep = input_indep  # Whether to use an input-independent pipeline to train the model
        self.collate_fn = dgl_picp_collate  # Which collation function to use

    def _build_split(self, mode: str):
        """Construct the DB5 dataset partition named by *mode* ('train'/'val'/'test')."""
        return DB5DGLDataset(mode=mode, raw_dir=self.data_dir, knn=self.knn,
                             geo_nbrhd_size=2, self_loops=self.self_loops,
                             percent_to_use=self.percent_to_use,
                             process_complexes=self.process_complexes,
                             input_indep=self.input_indep)

    def setup(self, stage: Optional[str] = None):
        # Assign training/validation/testing dataset for use in DataLoaders - called on every GPU
        self.db5_train = self._build_split('train')
        self.db5_val = self._build_split('val')
        self.db5_test = self._build_split('test')

    def _loader(self, dataset, shuffle: bool) -> DataLoader:
        """Wrap a dataset partition in a DataLoader configured from this module's settings."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle,
                          num_workers=self.num_dataloader_workers,
                          collate_fn=self.collate_fn, pin_memory=True)

    def train_dataloader(self) -> DataLoader:
        return self._loader(self.db5_train, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        return self._loader(self.db5_val, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        return self._loader(self.db5_test, shuffle=False)

0 commit comments

Comments
 (0)