Skip to content

Commit d0781de

Browse files
committed
Add DB5 dataset support to pipeline
1 parent bfefd17 commit d0781de

13 files changed

+664
-79
lines changed

.gitignore

+6-3
Original file line numberDiff line numberDiff line change
@@ -133,12 +133,15 @@ dmypy.json
133133
.vscode
134134
.run
135135

136-
# DIPS-Plus
137-
project/datasets/DIPS/final
138-
139136
# CASP-CAPRI
140137
project/datasets/CASP_CAPRI/final
141138

139+
# DB5-Plus
140+
project/datasets/DB5/final
141+
142+
# DIPS-Plus
143+
project/datasets/DIPS/final
144+
142145
# Input
143146
project/datasets/Input/raw
144147
project/datasets/Input/interim

README.md

+17-7
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,12 @@ A geometric deep learning pipeline for predicting protein interface contacts.
2121
If you use the code or data associated with this package, please cite:
2222

2323
```bibtex
24-
@article{morehead2021deepinteract,
25-
title = {Geometric Transformers for Protein Interface Contact Prediction},
26-
author = {Alex Morehead, Chen Chen, and Jianlin Cheng},
27-
year = {2021},
28-
eprint = {N/A},
29-
archivePrefix = {arXiv},
30-
primaryClass = {cs.LG}
24+
@inproceedings{morehead2022geometric,
25+
title={Geometric Transformers for Protein Interface Contact Prediction},
26+
author={Alex Morehead and Chen Chen and Jianlin Cheng},
27+
booktitle={International Conference on Learning Representations},
28+
year={2022},
29+
url={https://openreview.net/forum?id=CS4463zx6Hi}
3130
}
3231
```
3332

@@ -126,6 +125,17 @@ DeepInteract
126125
│ │
127126
│ └───builder
128127
│ │
128+
│ └───DB5
129+
│ │ │
130+
│ │ └───final
131+
│ │ │ │
132+
│ │ │ └───processed
133+
│ │ │ │
134+
│ │ │ └───raw
135+
│ │ │
136+
│ │ db5_dgl_data_module.py
137+
│ │ db5_dgl_dataset.py
138+
│ │
129139
│ └───CASP_CAPRI
130140
│ │ │
131141
│ │ └───final

project/datasets/CASP_CAPRI/casp_capri_dgl_data_module.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,15 @@ class CASPCAPRIDGLDataModule(LightningDataModule):
1818
# Dataset partition instantiations
1919
casp_capri_test = None
2020

21-
def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int, self_loops: bool,
22-
pn_ratio: float, percent_to_use: float, process_complexes: bool, input_indep: bool):
21+
def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int,
22+
self_loops: bool, percent_to_use: float, process_complexes: bool, input_indep: bool):
2323
super().__init__()
2424

2525
self.data_dir = data_dir
2626
self.batch_size = batch_size
2727
self.num_dataloader_workers = num_dataloader_workers
2828
self.knn = knn
2929
self.self_loops = self_loops
30-
self.pn_ratio = pn_ratio
3130
self.percent_to_use = percent_to_use # Fraction of CASP-CAPRI dataset splits to use
3231
self.process_complexes = process_complexes # Whether to process any unprocessed complexes before training
3332
self.input_indep = input_indep # Whether to use an input-independent pipeline to train the model
@@ -36,7 +35,7 @@ def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int,
3635
def setup(self, stage: Optional[str] = None):
3736
# Assign testing dataset for use in DataLoaders - called on every GPU
3837
self.casp_capri_test = CASPCAPRIDGLDataset(mode='test', raw_dir=self.data_dir, knn=self.knn,
39-
geo_nbrhd_size=2, self_loops=self.self_loops, pn_ratio=self.pn_ratio,
38+
geo_nbrhd_size=2, self_loops=self.self_loops,
4039
percent_to_use=self.percent_to_use,
4140
process_complexes=self.process_complexes,
4241
input_indep=self.input_indep)

project/datasets/CASP_CAPRI/casp_capri_dgl_dataset.py

-5
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ class CASPCAPRIDGLDataset(DGLDataset):
3434
Size of each edge's neighborhood when updating geometric edge features. Default: 2.
3535
self_loops: bool
3636
Whether to connect a given node to itself. Default: True.
37-
pn_ratio: float
38-
The positive-negative ratio to use when assembling training labels for node-node pairs. Default: 0.1.
3937
percent_to_use: float
4038
How much of the dataset to load. Default: 1.00.
4139
process_complexes: bool
@@ -68,21 +66,18 @@ def __init__(self,
6866
knn=20,
6967
geo_nbrhd_size=2,
7068
self_loops=True,
71-
pn_ratio=0.1,
7269
percent_to_use=1.00,
7370
process_complexes=True,
7471
input_indep=False,
7572
force_reload=False,
7673
verbose=False):
7774
assert mode in ['test']
78-
assert 0.0 < pn_ratio <= 1.0
7975
assert 0.0 < percent_to_use <= 1.0
8076
self.mode = mode
8177
self.root = raw_dir
8278
self.knn = knn
8379
self.geo_nbrhd_size = geo_nbrhd_size
8480
self.self_loops = self_loops
85-
self.pn_ratio = pn_ratio
8681
self.percent_to_use = percent_to_use # How much of the dataset (e.g. CASP-CAPRI training dataset) to use
8782
self.process_complexes = process_complexes # Whether to process any unprocessed complexes before training
8883
self.input_indep = input_indep # Whether to use an input-independent pipeline to train the model
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from typing import Optional
2+
3+
from pytorch_lightning import LightningDataModule
4+
from torch.utils.data import DataLoader
5+
6+
from project.datasets.DB5.db5_dgl_dataset import DB5DGLDataset
7+
from project.utils.deepinteract_utils import dgl_picp_collate
8+
9+
10+
class DB5DGLDataModule(LightningDataModule):
    """LightningDataModule serving the DB5-Plus unbound protein complex dataset as DGL graphs."""

    # Dataset partition instantiations (populated by setup())
    db5_train = None
    db5_val = None
    db5_test = None

    def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int,
                 self_loops: bool, percent_to_use: float, process_complexes: bool, input_indep: bool):
        super().__init__()

        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_dataloader_workers = num_dataloader_workers
        self.knn = knn
        self.self_loops = self_loops
        self.percent_to_use = percent_to_use  # Fraction of DB5 dataset splits to use
        self.process_complexes = process_complexes  # Whether to process any unprocessed complexes before training
        self.input_indep = input_indep  # Whether to use an input-independent pipeline to train the model
        self.collate_fn = dgl_picp_collate  # Which collation function to use

    def _build_split(self, mode: str):
        """Construct the DB5 dataset partition named by *mode* ('train'/'val'/'test')."""
        return DB5DGLDataset(mode=mode, raw_dir=self.data_dir, knn=self.knn,
                             geo_nbrhd_size=2, self_loops=self.self_loops,
                             percent_to_use=self.percent_to_use,
                             process_complexes=self.process_complexes,
                             input_indep=self.input_indep)

    def setup(self, stage: Optional[str] = None):
        # Assign training/validation/testing dataset for use in DataLoaders - called on every GPU
        self.db5_train = self._build_split('train')
        self.db5_val = self._build_split('val')
        self.db5_test = self._build_split('test')

    def _loader(self, dataset, shuffle: bool) -> DataLoader:
        """Wrap a dataset partition in a DataLoader configured from this module's settings."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle,
                          num_workers=self.num_dataloader_workers,
                          collate_fn=self.collate_fn, pin_memory=True)

    def train_dataloader(self) -> DataLoader:
        return self._loader(self.db5_train, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        return self._loader(self.db5_val, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        return self._loader(self.db5_test, shuffle=False)

0 commit comments

Comments
 (0)