Rostlab
diff --git a/‎README.md‎
Lines changed: 80 additions & 81 deletions b/‎README.md‎
Lines changed: 80 additions & 81 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 5 additions & 3 deletions b/‎pyproject.toml‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎vespa/__init__.py‎
Lines changed: 5 additions & 5 deletions b/‎vespa/__init__.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎vespa/predict/config.py‎
Lines changed: 8 additions & 4 deletions b/‎vespa/predict/config.py‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎vespa/predict/embedding.py‎
Lines changed: 95 additions & 0 deletions b/‎vespa/predict/embedding.py‎
Lines changed: 95 additions & 0 deletions
diff --git a/‎vespa/predict/logodds.py‎
Lines changed: 5 additions & 22 deletions b/‎vespa/predict/logodds.py‎
Lines changed: 5 additions & 22 deletions
diff --git a/‎vespa/predict/utils_t5.py‎
Lines changed: 38 additions & 0 deletions b/‎vespa/predict/utils_t5.py‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎vespa/scripts/conspred.py‎
Lines changed: 44 additions & 21 deletions b/‎vespa/scripts/conspred.py‎
Lines changed: 44 additions & 21 deletions
@@ -1,8 +1,8 @@
 [tool.poetry]
 name = "vespa"
-version = "0.3.0-beta"
+version = "0.9.0-beta"
 description = ""
-authors = ["Tobias O <tobias.olenyi@tum.de>"]
+authors = ["Tobias O <tobias.olenyi@tum.de>", "Duc Anh Le <ducanh.le@tum.de>"]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.11"
@@ -25,8 +25,10 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
 vespa_logodds = 'vespa.scripts.logodds:main'
-vespa = 'vespa.scripts.vespa:main'
+vespa = 'vespa.scripts.meta:main'
 vespa_conspred = 'vespa.scripts.conspred:main'
+vespa_run = 'vespa.scripts.vespa_run:main'
+vespa_emb = 'vespa.scripts.embedding:main'
 
 [tool.poetry2conda]
 name = "vespa-env"
 
@@ -1,7 +1,7 @@
-__author__ = "Tobias O, Michael H., Celine M."
-__copyright__ = "Copyright 2021, Rostlab"
+__author__ = "Tobias O, Michael H., Celine M., Duc Anh L."
+__copyright__ = "Copyright 2022, Rostlab"
 __license__ = "AGPL-3.0-or-later"
-__version__ = "1.0.0"
-__maintainer__ = "Tobias O."
+__version__ = "0.9.0b"
+__maintainer__ = ["Duc Anh L.", "Tobias O."]
 __email__ = ""
-__status__ = "Production"
+__status__ = "Production"
@@ -21,14 +21,15 @@
 from pathlib import Path
 import torch
 
+VESPA_LOCATION = Path(__file__).resolve().parent.parent.parent
 
 VESPA = "VESPA"
 VESPAL = "VESPAl"
 
 MODEL_PATH_DICT = {
-    VESPA: Path("models/VESPA-10LR_Cons_Blsm_Prob.pkl"),
-    VESPAL: Path("models/VESPAl-10LR_Cons_Blsm.pkl"),
-    "CONSCNN": Path("models/ProtT5cons_checkpoint.pt"),
+    VESPA: Path(VESPA_LOCATION.joinpath("models/VESPA-10LR_Cons_Blsm_Prob.pkl")),
+    VESPAL: Path(VESPA_LOCATION.joinpath("models/VESPAl-10LR_Cons_Blsm.pkl")),
+    "CONSCNN": Path(VESPA_LOCATION.joinpath("models/ProtT5cons_checkpoint.pt")),
 }
 
 OUTPUT_MAP_NAME = "map.json"
@@ -37,7 +38,7 @@
 # https://huggingface.co/transformers/v3.1.0/_modules/transformers/tokenization_t5.html
 SPIECE_UNDERLINE = "▁"
 
-CACHE_DIR = Path("./cache")
+CACHE_DIR = "./cache"
 
 TRANSFORMER_LINK = "Rostlab/prot_t5_xl_uniref50"
 
@@ -55,3 +56,6 @@
 
 DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 EMBEDDING_HALF_PREC = True
+
+# Selector values passed to ProtT5.get_model(): EMBED loads the encoder-only
+# model (T5EncoderModel), LOGODDS the conditional-generation model
+# (T5ForConditionalGeneration).
+EMBED, LOGODDS = 0, 1
+# Batching limits read by T5_Embed.get_embeddings():
+#   EMB_MAX_SEQ_LEN  - sequences at least this long are embedded on their own
+#   EMB_MAX_RESIDUES - a pending batch is flushed once it holds this many residues
+#   EMB_MAX_BATCH    - ... or once it holds this many sequences
+#   EMB_STORE_FREQ   - buffered embeddings are written to disk once more than
+#                      this many have accumulated
+EMB_MAX_SEQ_LEN, EMB_MAX_RESIDUES, EMB_MAX_BATCH, EMB_STORE_FREQ = 600, 8000, 5, 200
@@ -0,0 +1,95 @@
+import numpy
+import torch
+import h5py
+from tqdm import tqdm
+from pathlib import Path
+
+from vespa.predict.config import (
+    DEVICE, CACHE_DIR, VERBOSE,
+    EMBED, EMB_MAX_SEQ_LEN, EMB_MAX_RESIDUES, EMB_MAX_BATCH, EMB_STORE_FREQ
+)
+from vespa.predict.utils import parse_fasta_input
+from vespa.predict.utils_t5 import ProtT5
+
+
+class T5_Embed:
+    """Compute per-residue ProtT5 embeddings for a FASTA file and store them in HDF5.
+
+    The encoder model is only loaded when ``embed_from_fasta`` is called; the
+    constructor merely prepares the ``ProtT5`` loader.
+    """
+
+    def __init__(self, cache_dir):
+        """Prepare the ProtT5 loader.
+
+        Args:
+            cache_dir: Directory handed to ``ProtT5`` as its cache directory.
+        """
+        self.prott5 = ProtT5(cache_dir)
+        # 'w' truncates the output file on the first write; save_embeddings
+        # switches to 'a' afterwards so later chunks are appended.
+        self.saving_pattern = 'w'
+
+    def embed_from_fasta(self, fasta_path, output_path):
+        """Load the embedding model and embed every sequence in ``fasta_path``.
+
+        Args:
+            fasta_path: FASTA input, forwarded to ``parse_fasta_input``.
+            output_path: Destination HDF5 file; overwritten by this call.
+        """
+        # Start a fresh output file for every run.
+        self.saving_pattern = 'w'
+        if VERBOSE:
+            print('Load model: ProtT5')
+        self.model, self.tokenizer = self.prott5.get_model(EMBED)
+        if VERBOSE:
+            print('Compute embeddings!')
+        self.get_embeddings(fasta_path, output_path)
+
+    def embedding_init(self, fasta_path):
+        """Parse the FASTA file and return ``(id, sequence)`` pairs, longest first."""
+        seq_dict = parse_fasta_input(fasta_path)
+        return sorted(seq_dict.items(), key=lambda kv: len(kv[1]), reverse=True)
+
+    def process_batch(self, batch, emb_dict):
+        """Embed one batch and add the per-residue embeddings to ``emb_dict``.
+
+        Args:
+            batch: List of ``(identifier, space_separated_sequence, length)``
+                tuples.
+            emb_dict: Mapping of identifier to embedding array; updated in
+                place.
+
+        Returns:
+            ``emb_dict``. If the forward pass raises ``RuntimeError``
+            (presumably out of memory), a multi-sequence batch is retried one
+            sequence at a time so that a single problematic sequence does not
+            discard the whole batch; a sequence that still fails on its own is
+            reported and skipped.
+        """
+        if not batch:
+            return emb_dict
+
+        pdb_ids, seqs, seq_lens = zip(*batch)
+
+        token_encoding = self.tokenizer(seqs, add_special_tokens=True, padding='longest', return_tensors="pt")
+        input_ids = token_encoding['input_ids'].to(DEVICE)
+        attention_mask = token_encoding['attention_mask'].to(DEVICE)
+
+        try:
+            # batch-size x seq_len x embedding_dim
+            with torch.no_grad():
+                embedding_repr = self.model(input_ids, attention_mask=attention_mask)
+        except RuntimeError:
+            print("RuntimeError for {} (L={})".format(pdb_ids, seq_lens))
+            embedding_repr = None
+
+        if embedding_repr is None:
+            # Retry outside the except block so the failed forward pass is no
+            # longer referenced while the smaller batches run.
+            if len(batch) > 1:
+                for item in batch:
+                    self.process_batch([item], emb_dict)
+            return emb_dict
+
+        for batch_idx, identifier in enumerate(pdb_ids):
+            s_len = seq_lens[batch_idx]
+            # Cut off padding and special tokens beyond the real residues.
+            emb = embedding_repr.last_hidden_state[batch_idx, :s_len]
+            # No squeeze(): the slice is already (s_len x embedding_dim) and
+            # squeezing would drop the residue axis for length-1 sequences.
+            emb_dict[identifier] = emb.detach().cpu().numpy()
+
+        return emb_dict
+
+    def save_embeddings(self, output_path, emb_dict):
+        """Write the embeddings to the HDF5 file, one dataset per sequence id.
+
+        Args:
+            output_path: Destination file (``Path`` or ``str``); missing parent
+                directories are created.
+            emb_dict: Mapping of sequence id to embedding array.
+        """
+        output_path = Path(output_path).absolute()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with h5py.File(str(output_path), self.saving_pattern) as hf:
+            for sequence_id, embedding in emb_dict.items():
+                hf.create_dataset(sequence_id, data=embedding)
+        # Every later call in this run appends to the file created above.
+        self.saving_pattern = 'a'
+
+    def get_embeddings(self, fasta_path, output_path):
+        """Embed all sequences of ``fasta_path`` and store them in ``output_path``.
+
+        Sequences with at least ``EMB_MAX_SEQ_LEN`` residues are embedded on
+        their own; shorter ones are collected into batches bounded by
+        ``EMB_MAX_BATCH`` sequences and ``EMB_MAX_RESIDUES`` residues. Results
+        are flushed to disk whenever more than ``EMB_STORE_FREQ`` embeddings
+        are buffered.
+        """
+        seq_dict = self.embedding_init(fasta_path)
+
+        emb_dict = dict()
+        batch, n_res_batch = [], 0
+
+        for pdb_id, seq in tqdm(seq_dict, total=len(seq_dict)):
+            seq_len = len(seq)
+            # The tokenizer is fed residues separated by single spaces.
+            seq = ' '.join(list(seq))
+
+            if seq_len >= EMB_MAX_SEQ_LEN:
+                emb_dict = self.process_batch([(pdb_id, seq, seq_len)], emb_dict)
+            else:
+                # Flush the pending batch before it would exceed a limit.
+                if len(batch) >= EMB_MAX_BATCH or n_res_batch >= EMB_MAX_RESIDUES:
+                    emb_dict = self.process_batch(batch, emb_dict)
+                    batch = []
+                    n_res_batch = 0
+
+                batch.append((pdb_id, seq, seq_len))
+                n_res_batch += seq_len
+
+            if len(emb_dict) > EMB_STORE_FREQ:
+                self.save_embeddings(output_path, emb_dict)
+                emb_dict = dict()
+
+        # Embed whatever is left in the last, partially filled batch.
+        if batch:
+            emb_dict = self.process_batch(batch, emb_dict)
+
+        if emb_dict:
+            self.save_embeddings(output_path, emb_dict)
@@ -28,7 +28,6 @@
 import torch
 import h5py
 from tqdm import tqdm
-from transformers import T5ForConditionalGeneration, T5Tokenizer
 
 from vespa.predict.config import (
     CACHE_DIR,
@@ -40,8 +39,10 @@
     SPIECE_UNDERLINE,
     VERBOSE,
     DEVICE,
+    LOGODDS
 )
 from vespa.predict import utils
+from vespa.predict.utils_t5 import ProtT5
 
 
 if VERBOSE:
@@ -58,8 +59,8 @@ class _ProbaVector:
 
 class T5_condProbas:
     def __init__(self, cache_dir):
-        self.cache_dir = cache_dir
-        self.tokenizer = self.get_tokenizer()
+        self.prott5 = ProtT5(cache_dir)
+        self.tokenizer = self.prott5.get_tokenizer()
         self.AAs = MUTANT_ORDER + "X"
         self.AA2class = {AA: idx for idx, AA in enumerate(self.AAs)}
         self.class2AA = {idx: AA for idx, AA in enumerate(self.AAs)}
@@ -69,24 +70,6 @@ def __init__(self, cache_dir):
         ]
         self.softmax = torch.nn.Softmax(dim=0)
 
-    def get_model(self):
-        model = T5ForConditionalGeneration.from_pretrained(
-            TRANSFORMER_LINK, cache_dir=self.cache_dir
-        )
-        model = model.eval()
-        model = model.to(DEVICE)
-        vocab = T5Tokenizer.from_pretrained(
-            TRANSFORMER_LINK, do_lower_case=False, cache_dir=self.cache_dir
-        )
-
-        return model, vocab
-
-    def get_tokenizer(self):
-        vocab = T5Tokenizer.from_pretrained(
-            TRANSFORMER_LINK, do_lower_case=False, cache_dir=self.cache_dir
-        )
-        return vocab
-
     def reconstruct_sequence(self, probs):
         return [self.class2AA[yhat] for yhat in probs.argmax(axis=1)]
 
@@ -116,7 +99,7 @@ def get_proba_dict(self, seq_dict, mutation_gen: utils.MutationGenerator):
         Compute for all residues in a protein the conditional probabilities for reconstructing single, masked tokens.
         """
 
-        self.model, self.tokenizer = self.get_model()
+        self.model, self.tokenizer = self.prott5.get_model(LOGODDS)
 
         result_dict = dict()
 
 
@@ -0,0 +1,38 @@
+from transformers import T5ForConditionalGeneration, T5EncoderModel, T5Tokenizer
+from transformers import logging
+logging.set_verbosity_error()
+from vespa.predict.config import (
+    TRANSFORMER_LINK,
+    DEVICE,
+    EMBED, EMBEDDING_HALF_PREC, LOGODDS
+)
+
+
+class ProtT5:
+    """Loader for the ProtT5 checkpoint named by ``TRANSFORMER_LINK``."""
+
+    def __init__(self, cache_dir):
+        """Remember the cache directory used for all ``from_pretrained`` calls."""
+        self.cache_dir = cache_dir
+
+    def get_model(self, model_usage: int):
+        """Load the model variant for ``model_usage`` together with its tokenizer.
+
+        Args:
+            model_usage: ``EMBED`` for the encoder-only model or ``LOGODDS``
+                for the conditional-generation model.
+
+        Returns:
+            Tuple ``(model, tokenizer)``; the model is in eval mode and moved
+            to ``DEVICE``.
+
+        Raises:
+            NotImplementedError: If ``model_usage`` is neither ``EMBED`` nor
+                ``LOGODDS``.
+        """
+        if model_usage == EMBED:
+            model = T5EncoderModel.from_pretrained(
+                TRANSFORMER_LINK, cache_dir=self.cache_dir
+            )
+            # Half precision is only applied on CUDA; on CPU the model stays
+            # in full precision because half-precision inference there is
+            # commonly unsupported or very slow.
+            if EMBEDDING_HALF_PREC and DEVICE.type == "cuda":
+                model = model.half()
+        elif model_usage == LOGODDS:
+            model = T5ForConditionalGeneration.from_pretrained(
+                TRANSFORMER_LINK, cache_dir=self.cache_dir
+            )
+        else:
+            raise NotImplementedError(
+                "The intended use of ProtT5 is not implemented."
+            )
+        model = model.eval()
+        model = model.to(DEVICE)
+        return model, self.get_tokenizer()
+
+    def get_tokenizer(self):
+        """Load the tokenizer for ``TRANSFORMER_LINK`` (case is preserved)."""
+        tokenizer = T5Tokenizer.from_pretrained(
+            TRANSFORMER_LINK, do_lower_case=False, cache_dir=self.cache_dir
+        )
+        return tokenizer
@@ -31,7 +31,7 @@
 
 # Lib Imports
 import torch.utils.data
-from vespa.predict.config import MODEL_PATH_DICT
+from vespa.predict.config import MODEL_PATH_DICT, VERBOSE
 
 # Module Imports
 from vespa.predict.conspred import ProtT5Cons, get_dataloader
@@ -50,7 +50,7 @@ def create_arg_parser():
 
     # Required positional argument
     parser.add_argument(
-        "Input",
+        "input",
         type=Path,
         help="A path to a h5 embedding file, containing per-residue ProtT5 embeddings.",
     )
@@ -71,15 +71,15 @@ def create_arg_parser():
         "--checkpoint",
         required=False,
         type=Path,
-        default= None,
+        default=None,
         help="A path for the pre-trained checkpoint for the conservation CNN",
     )
 
     # Optional argument
     parser.add_argument(
         "--output_probs",
         type=bool,
-        default=True, 
+        default=True,
         action=argparse.BooleanOptionalAction,
         help="Output probabilities for all classes, not only class with highest probability. The probabilities are stored in an h5 file with a dataset per-protein of shape Lx20 (L being the protein length). This output is written to <output_prefix>_probs.h5)",
     )
@@ -88,37 +88,60 @@ def create_arg_parser():
     parser.add_argument(
         "--output_classes",
         type=bool,
-        default=True, 
+        default=False,
         action=argparse.BooleanOptionalAction,
         help="Output the conservation class prediction per residue in a fasta-like format with comma-separated per-residue classes. The output is written to <output_prefix>_class.fast)",
     )
 
     return parser
 
 
-def main():
-    parser = create_arg_parser()
-    args = parser.parse_args()
-
-    checkpoint_path = args.checkpoint if args.checkpoint else Path(MODEL_PATH_DICT["CONSCNN"])
-    out_prefix = args.output_prefix
-    out_class = Path(out_prefix + "_class.fasta")
-    out_probs = Path(out_prefix + "_probs.h5")
-
-    write_probs = args.output_probs
-    write_classes = args.output_classes
-
-    out_class.parent.mkdir(parents=True, exist_ok=True)
-
+def run_conspred(seq_path, checkpoint_path, write_probs, write_classes, out_prefix):
+    """Predict conservation from an h5 embedding file and write the results.
+
+    Args:
+        seq_path: Path to the h5 file with the embeddings to predict on.
+        checkpoint_path: Checkpoint passed to ``ProtT5Cons``.
+        write_probs: If true, write probabilities to ``<out_prefix>_probs.h5``.
+        write_classes: If true, write classes to ``<out_prefix>_class.fasta``.
+        out_prefix: Prefix of the output files; its parent directory is
+            created if missing.
+    """
+    if VERBOSE:
+        print(f" Start Conservation Prediction ".center(80, "#"))
+    # Open the file before entering the try block: if opening fails there is
+    # nothing to close, and the original error must not be masked by an
+    # UnboundLocalError raised from the finally clause.
+    embeddings = h5py.File(str(seq_path.resolve()), "r")
     try:
-        embeddings = h5py.File(str(args.Input.resolve()), 'r')
         data_loader = get_dataloader(embeddings, batch_size=128)
+        if VERBOSE:
+            print(f" Load model! ")
         conspred = ProtT5Cons(checkpoint_path)
-        predictions = conspred.conservation_prediction(data_loader, prob_return=write_probs, class_return=write_classes)
+        if VERBOSE:
+            print(f" Predict Conservation! ")
+        predictions = conspred.conservation_prediction(
+            data_loader, prob_return=write_probs, class_return=write_classes
+        )
+
+        out_class = Path(str(out_prefix) + "_class.fasta")
+        out_probs = Path(str(out_prefix) + "_probs.h5")
+        out_class.parent.mkdir(parents=True, exist_ok=True)
+
         if write_classes:
             conspred.write_cons_class_pred(predictions, out_class)
         if write_probs:
             conspred.write_probabilities(predictions, out_probs)
+
+        if VERBOSE:
+            print(f">> Finished Conservation Prediction!")
     finally:
         embeddings.close()
 
+
+def main():
+    """Command-line entry point: parse the arguments and run the prediction."""
+    cli = create_arg_parser().parse_args()
+
+    # Fall back to the default checkpoint path from MODEL_PATH_DICT when no
+    # --checkpoint was given.
+    checkpoint = cli.checkpoint or Path(MODEL_PATH_DICT["CONSCNN"])
+
+    run_conspred(
+        seq_path=cli.input,
+        checkpoint_path=checkpoint,
+        out_prefix=cli.output_prefix,
+        write_probs=cli.output_probs,
+        write_classes=cli.output_classes,
+    )
+
+
+if __name__ == "__main__":
+    main()