-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathfasta.py
More file actions
36 lines (29 loc) · 1023 Bytes
/
fasta.py
File metadata and controls
36 lines (29 loc) · 1023 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from Bio import SeqIO
from torch.utils.data import Dataset
from utils import encode_seq, reverse_complement
class FastaDataset(Dataset):
    """Dataset over the records of a FASTA file.

    All records are parsed with ``SeqIO.parse`` when the dataset is
    constructed and are kept in memory as a list. Indexing the dataset
    returns the result of ``encode_seq`` applied to the record's sequence
    string.
    """

    def __init__(self,
                 file_path: str,
                 reverse: bool = False):
        """Load every record from *file_path*.

        Args:
            file_path: Path of the FASTA file to parse.
            reverse: When true, every sequence is passed through
                ``reverse_complement`` before it is returned or encoded.
        """
        super().__init__()
        self.file_path = file_path
        # Materialise the parser's iterator so records can be indexed and
        # counted.
        self.seqs = list(SeqIO.parse(file_path, format="fasta"))
        self.reverse = reverse

    def seq_names(self) -> list[str]:
        """Return the ``id`` of every loaded record, in parse order."""
        return [record.id for record in self.seqs]

    def raw_seqs(self) -> list[str]:
        """Return every sequence as an unencoded string, in parse order.

        The ``reverse`` setting is applied; ``encode_seq`` is not.
        """
        return [self._get_seq(i) for i in range(len(self))]

    def _get_seq(self, idx: int) -> str:
        """Return the sequence of record *idx* as a plain string.

        Raises:
            IndexError: If *idx* is outside the range of loaded records.
        """
        # Use the public str() conversion instead of the private ``_data``
        # attribute: the internal storage type of a Biopython ``Seq`` is an
        # implementation detail that differs between library versions.
        seq = str(self.seqs[idx].seq)
        if self.reverse:
            seq = reverse_complement(seq)
        return seq

    def __getitem__(self, idx: int):
        """Return record *idx* after ``encode_seq`` has been applied to it."""
        return encode_seq(self._get_seq(idx))

    def __len__(self) -> int:
        """Return the number of records loaded from the file."""
        return len(self.seqs)