|
| 1 | +from typing import Optional |
| 2 | + |
| 3 | +from pytorch_lightning import LightningDataModule |
| 4 | +from torch.utils.data import DataLoader |
| 5 | + |
| 6 | +from project.datasets.DB5.db5_dgl_dataset import DB5DGLDataset |
| 7 | +from project.utils.deepinteract_utils import dgl_picp_collate |
| 8 | + |
| 9 | + |
class DB5DGLDataModule(LightningDataModule):
    """Unbound protein complex data module for DGL with PyTorch.

    ``setup`` instantiates one ``DB5DGLDataset`` per partition ('train',
    'val', 'test'); the ``*_dataloader`` methods wrap those datasets in
    ``DataLoader`` objects that batch with ``dgl_picp_collate``.
    """

    # Dataset partition instantiations; they stay None until setup() has run
    db5_train: Optional[DB5DGLDataset] = None
    db5_val: Optional[DB5DGLDataset] = None
    db5_test: Optional[DB5DGLDataset] = None

    def __init__(self, data_dir: str, batch_size: int, num_dataloader_workers: int, knn: int,
                 self_loops: bool, percent_to_use: float, process_complexes: bool, input_indep: bool):
        """Store the configuration later used to build datasets and loaders.

        Args:
            data_dir: Passed to ``DB5DGLDataset`` as ``raw_dir``.
            batch_size: Batch size handed to every ``DataLoader``.
            num_dataloader_workers: ``num_workers`` handed to every ``DataLoader``.
            knn: Forwarded unchanged to ``DB5DGLDataset``
                (presumably a nearest-neighbour count - confirm against the dataset).
            self_loops: Forwarded unchanged to ``DB5DGLDataset``.
            percent_to_use: Fraction of DB5 dataset splits to use.
            process_complexes: Whether to process any unprocessed complexes
                before training.
            input_indep: Whether to use an input-independent pipeline to
                train the model.
        """
        super().__init__()

        # Loader-level settings
        self.batch_size = batch_size
        self.num_dataloader_workers = num_dataloader_workers
        self.collate_fn = dgl_picp_collate  # Which collation function to use

        # Dataset-level settings (all forwarded to DB5DGLDataset in setup())
        self.data_dir = data_dir
        self.knn = knn
        self.self_loops = self_loops
        self.percent_to_use = percent_to_use
        self.process_complexes = process_complexes
        self.input_indep = input_indep

    def setup(self, stage: Optional[str] = None):
        """Build the training, validation and testing datasets.

        Called on every GPU. ``stage`` is accepted for interface
        compatibility but is not consulted: all three partitions are always
        instantiated.
        """
        # Everything except ``mode`` is identical across the three partitions
        shared_kwargs = dict(
            raw_dir=self.data_dir,
            knn=self.knn,
            geo_nbrhd_size=2,
            self_loops=self.self_loops,
            percent_to_use=self.percent_to_use,
            process_complexes=self.process_complexes,
            input_indep=self.input_indep,
        )
        self.db5_train = DB5DGLDataset(mode='train', **shared_kwargs)
        self.db5_val = DB5DGLDataset(mode='val', **shared_kwargs)
        self.db5_test = DB5DGLDataset(mode='test', **shared_kwargs)

    def _make_loader(self, dataset, shuffle: bool) -> DataLoader:
        """Wrap ``dataset`` in a ``DataLoader`` using this module's shared settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_dataloader_workers,
            collate_fn=self.collate_fn,
            pin_memory=True,
        )

    def train_dataloader(self) -> DataLoader:
        """Return a shuffling loader over the training partition."""
        return self._make_loader(self.db5_train, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Return a non-shuffling loader over the validation partition."""
        return self._make_loader(self.db5_val, shuffle=False)

    def test_dataloader(self) -> DataLoader:
        """Return a non-shuffling loader over the testing partition."""
        return self._make_loader(self.db5_test, shuffle=False)
0 commit comments