diff --git a/kempner_workflow/protein_fold_gpu/README.md b/kempner_workflow/protein_fold_gpu/README.md index cedf7677a..5b11072e8 100644 --- a/kempner_workflow/protein_fold_gpu/README.md +++ b/kempner_workflow/protein_fold_gpu/README.md @@ -1,8 +1,8 @@ -# Protein Folding on GPU +# Protein Folding on GPU -This directory provides an example workflow for running **Boltz-based protein folding using GPU resources**. +This directory provides an example workflow for running **Boltz-based protein folding using GPU resources**. It is designed for environments with GPU access, offering reproducible and accessible protein structure prediction. --- @@ -11,7 +11,7 @@ It is designed for environments with GPU access, offering reproducible and acces This workflow executes a protein structure prediction pipeline on GPU using the **Boltz** framework. It demonstrates: -- Running **ColabFold** search locally on the Kempner Cluster +- Running **ColabFold** search locally on the Kempner Cluster - Using the generated MSA file (`.a3m` extension) as input to **Boltz** for structure prediction --- @@ -20,19 +20,19 @@ This workflow executes a protein structure prediction pipeline on GPU using the - Python ≥ 3.10 - cuda and cudann libraries -- **Boltz** library -- **ColabFold** -- **Boltz database** -- **ColabFold database** +- **Boltz** library +- **ColabFold** +- **Boltz database** +- **ColabFold database** -> **Note:** All of these are pre-installed on the Kempner Cluster. +> **Note:** All of these are pre-installed on the Kempner Cluster. > Installation in your own space is optional. --- ## Input Format -Create an input FASTA file. +Create an input FASTA file. **Important:** Currently, the pipeline supports only FASTA format. 
**Example:** @@ -45,7 +45,7 @@ QLEDSEVEAVAKGLEEMYANGVTEDNFKNYVKNNFAQQEISSVEEELNVNISDSCVANKIKDEFFAMISISAIVKAAQKK ## Running the Workflow -Open the file file `boltz_single_pipeline_gpu.slrm` and define the variable with the correct input fasta filename, and the GPU specifications. +Open the file `boltz_single_pipeline_gpu.slrm` and define the variable with the correct input fasta filename, and the GPU specifications. ``` INPUT_FASTA="input.fa" export CUDA_VISIBLE_DEVICES=0 @@ -59,10 +59,33 @@ To submit the Slurm batch job: ``` sbatch boltz_single_pipeline_gpu.slrm ``` -Update the SLURM script to adjust job resources (e.g., GPU. CPU cores, memory) as needed. You need to add partition name and account name. +Update the SLURM script to adjust job resources (e.g., GPU, CPU cores, memory) as needed. You need to add the partition name and account name. --- + +### Generating Boltz Predictions on Multiple Fasta Files + +To generate the Boltz predictions on the Kempner cluster, run the following command from the login node: + +```{bash} +bash split_and_pred.sh INPUT_DIR N OUT_DIR +``` + +The script will: + +- Divide the input dir files into N sets, generating one .txt file containing the paths of each set's .fasta files (one per set) +- create an out_dir/chunks_timestamp/ directory where the predictions will be stored + +- submit a Slurm job array that runs the script single_prediction_array.slrm once per set (you can modify the resources of each job by modifying that script) + +- Predictions are saved as: + +out_dir/chunks_timestamp/ + job_id/ + boltz/ # prediction boltz + msa/ # msa generated + ## Output ### 1. ColabFold Search Output Output_colabfold/local_search_gpu/ ``` ### 2.
Boltz Workflow Output Includes: -- 3D structures (PDB/CIF) of predicted protein conformations -- Logs of runtime performance and errors -- Folding quality metrics (if implemented) +- 3D structures (PDB/CIF) of predicted protein conformations +- Logs of runtime performance and errors +- Folding quality metrics (if implemented) Example structure: ``` diff --git a/kempner_workflow/protein_fold_gpu/single_prediction_array.slrm b/kempner_workflow/protein_fold_gpu/single_prediction_array.slrm new file mode 100644 index 000000000..baef1c919 --- /dev/null +++ b/kempner_workflow/protein_fold_gpu/single_prediction_array.slrm @@ -0,0 +1,104 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=16 +#SBATCH --gpus-per-node=1 +#SBATCH --mem=256GB +#SBATCH --partition=kempner_requeue +#SBATCH --account=kempner_bsabatini_lab +#SBATCH --time=4:00:00 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=thomasbush52@gmail.com +# Use array-aware log names to avoid clobbering: +#SBATCH --output=/n/home06/tbush/job_logs/%x.%A_%a.out + + +set -euo pipefail + +# Select this task's chunk file from the manifest +: "${SLURM_ARRAY_TASK_ID:?Need SLURM_ARRAY_TASK_ID}" +: "${MANIFEST:?Need MANIFEST exported from sbatch}" +: "${BASE_OUTPUT_DIR:?Need BASE_OUTPUT_DIR exported from sbatch}" + +LIST_FILE="$(sed -n "${SLURM_ARRAY_TASK_ID}p" "$MANIFEST")" +if [[ -z "${LIST_FILE}" || ! -s "${LIST_FILE}" ]]; then + echo "Task ${SLURM_ARRAY_TASK_ID}: missing or empty LIST_FILE from manifest." 
+ exit 1 +fi + +RUN_TAG="${SLURM_ARRAY_JOB_ID:-${SLURM_JOB_ID:-manual}}_${SLURM_ARRAY_TASK_ID}" +OUTPUT_DIR="${BASE_OUTPUT_DIR}/${RUN_TAG}" +MMSEQ2_DB="/n/holylfs06/LABS/kempner_shared/Everyone/workflow/boltz/mmseq2_db" +COLABFOLD_OUTPUT_BASE="${OUTPUT_DIR}/msa" +BOLTZ_CACHE="/n/holylfs06/LABS/kempner_shared/Everyone/workflow/boltz/boltz_db" +BOLTZ_OUTPUT_BASE="${OUTPUT_DIR}/boltz" +THREADS=${SLURM_CPUS_PER_TASK:-1} + +mkdir -p "$COLABFOLD_OUTPUT_BASE" "$BOLTZ_OUTPUT_BASE" + +export CUDA_VISIBLE_DEVICES=0 +export NUM_GPU_DEVICES=1 + +module load python/3.12.8-fasrc01 gcc/14.2.0-fasrc01 cuda/12.9.1-fasrc01 cudnn/9.10.2.21_cuda12-fasrc01 +export PATH="/n/holylfs06/LABS/kempner_shared/Everyone/common_envs/miniconda3/envs/boltz/localcolabfold/colabfold-conda/bin:$PATH" +export COLABFOLD_DB=/n/holylfs06/LABS/kempner_shared/Everyone/workflow/boltz/colabfold_db + +echo "Task ${SLURM_ARRAY_TASK_ID}: LIST_FILE=${LIST_FILE}" +echo "Outputs -> ${OUTPUT_DIR}" +echo "Threads -> ${THREADS}" + +while IFS= read -r INPUT_FASTA; do + [[ -z "$INPUT_FASTA" ]] && continue + + BASENAME=$(basename "$INPUT_FASTA" .fa) # strip extension for naming + COLABFOLD_OUTPUT_DIR="${COLABFOLD_OUTPUT_BASE}/${BASENAME}" + BOLTZ_OUTPUT_DIR="${BOLTZ_OUTPUT_BASE}/${BASENAME}" + TEMP_FASTA="${BASENAME}_prot_pipeline.fasta" + + echo "===============================================" + echo "Processing: $INPUT_FASTA" + echo "Output dirs: $COLABFOLD_OUTPUT_DIR | $BOLTZ_OUTPUT_DIR" + echo "===============================================" + + mkdir -p "$COLABFOLD_OUTPUT_DIR" "$BOLTZ_OUTPUT_DIR" + + # STEP 1: ColabFold search + colabfold_search "$INPUT_FASTA" "$MMSEQ2_DB" "$COLABFOLD_OUTPUT_DIR" --thread "$THREADS" --gpu 1 + if [ $? -ne 0 ]; then + echo "ERROR: ColabFold search failed for $INPUT_FASTA" + continue + fi + + # STEP 2: Find a3m file + HEADER=$(head -n1 "$INPUT_FASTA") + PROTEIN_PREFIX=$(echo "$HEADER" | sed 's/^>//' | sed 's/|/_/g') + A3M_FILE="${COLABFOLD_OUTPUT_DIR}/${PROTEIN_PREFIX}.a3m" + if [ ! 
-f "$A3M_FILE" ]; then + A3M_FILE=$(find "$COLABFOLD_OUTPUT_DIR" -name "*.a3m" | head -n1) + if [ -z "$A3M_FILE" ]; then + echo "No a3m found for $INPUT_FASTA" + continue + fi + fi + + # STEP 3: Create Boltz FASTA + A3M_ABSOLUTE=$(realpath "$A3M_FILE") + ORIGINAL_HEADER=$(head -n1 "$INPUT_FASTA") + SEQUENCE=$(tail -n+2 "$INPUT_FASTA") + echo "${ORIGINAL_HEADER}${A3M_ABSOLUTE}" > "$TEMP_FASTA" + echo "$SEQUENCE" >> "$TEMP_FASTA" + + # STEP 4: Run Boltz + mamba activate /n/holylfs06/LABS/kempner_shared/Everyone/common_envs/miniconda3/envs/boltz + boltz predict "$TEMP_FASTA" --cache "$BOLTZ_CACHE" --out_dir "$BOLTZ_OUTPUT_DIR" --devices $NUM_GPU_DEVICES --accelerator gpu --no_kernels + if [ $? -ne 0 ]; then + echo "ERROR: Boltz failed for $INPUT_FASTA" + continue + fi + + echo "Completed successfully: $INPUT_FASTA" +done < "$LIST_FILE" + +echo "===============================================" +echo "All jobs from $LIST_FILE finished" +echo "===============================================" diff --git a/kempner_workflow/protein_fold_gpu/split_and_pred.sh b/kempner_workflow/protein_fold_gpu/split_and_pred.sh new file mode 100755 index 000000000..aeab11e93 --- /dev/null +++ b/kempner_workflow/protein_fold_gpu/split_and_pred.sh @@ -0,0 +1,90 @@ +#!/bin/bash +set -euo pipefail + +# Usage: ./split_and_submit.sh INPUT_DIR N OUTPUT_PARENT_DIR +# Example: ./split_and_submit.sh /data/images 5 /data/jobs +# Optional: set ARRAY_MAX_CONCURRENCY (default 10) + +INPUT_DIR="${1:-}" +N="${2:-}" +OUTPUT_PARENT_DIR="${3:-}" +ARRAY_MAX_CONCURRENCY="${ARRAY_MAX_CONCURRENCY:-10}" + +if [[ -z "${INPUT_DIR}" || -z "${N}" || -z "${OUTPUT_PARENT_DIR}" ]]; then + echo "Usage: $0 INPUT_DIR N OUTPUT_PARENT_DIR" + exit 1 +fi +if ! 
[[ "$N" =~ ^[0-9]+$ && "$N" -ge 1 ]]; then
+  echo "N must be a positive integer"
+  exit 1
+fi
+
+# Normalize to absolute paths
+INPUT_DIR="$(realpath -m "$INPUT_DIR")"
+OUTPUT_PARENT_DIR="$(realpath -m "$OUTPUT_PARENT_DIR")"
+
+# Make timestamped output directory that will also hold logs + manifest
+TS="$(date +%Y%m%d_%H%M%S)"
+OUTPUT_DIR="${OUTPUT_PARENT_DIR}/chunks_${TS}"
+mkdir -p "$OUTPUT_DIR"
+
+echo "Writing chunk files to: $OUTPUT_DIR"
+
+# Collect files (absolute paths), null-safe & sorted
+mapfile -d '' -t files < <(find "$INPUT_DIR" -maxdepth 1 -type f -print0 | sort -z)
+total=${#files[@]}
+if (( total == 0 )); then
+  echo "No files found in $INPUT_DIR"
+  exit 1
+fi
+
+# Compute chunk size (ceil division)
+chunk_size=$(( (total + N - 1) / N ))
+
+# Split into N chunks (skip empties if N > total)
+for ((i=0; i<N; i++)); do
+  start=$(( i * chunk_size ))
+  end=$(( start + chunk_size ))
+  (( end > total )) && end=$total
+  (( start >= end )) && continue   # skip empty chunk
+
+  out="${OUTPUT_DIR}/id_${i}.txt"
+  : > "$out"
+  for ((j=start; j<end; j++)); do
+    printf '%s\n' "${files[j]}" >> "$out"
+  done
+  echo "Wrote $(wc -l < "$out") paths -> $out"
+done
+
+# -------- Build manifest (stable order) & submit array --------
+echo "Building manifest and submitting array..."
+
+MANIFEST="${OUTPUT_DIR}/filelist.manifest"
+: > "$MANIFEST"
+
+# Deterministic: sort by name; include only non-empty chunk files
+while IFS= read -r -d '' f; do
+  [[ -s "$f" ]] && realpath -s "$f" >> "$MANIFEST"
+done < <(find "$OUTPUT_DIR" -maxdepth 1 -type f -name 'id_*.txt' -print0 | sort -z)
+
+NUM_TASKS=$(wc -l < "$MANIFEST")
+if (( NUM_TASKS == 0 )); then
+  echo "No non-empty id_*.txt chunk files found; nothing to submit."
+  exit 0
+fi
+
+echo "Submitting ${NUM_TASKS} array tasks (max concurrent: ${ARRAY_MAX_CONCURRENCY})..."
+
+# --parsable returns just the job ID so we can print it nicely.
+# Resolve the .slrm next to this script so the submission works from any cwd.
+ARRAY_JOB_ID="$(
+  sbatch --parsable \
+    --array=1-"$NUM_TASKS"%${ARRAY_MAX_CONCURRENCY} \
+    --export=ALL,MANIFEST="$MANIFEST",BASE_OUTPUT_DIR="$OUTPUT_DIR" \
+    "$(dirname -- "${BASH_SOURCE[0]}")/single_prediction_array.slrm"
+)"
+
+echo "Submitted array job ${ARRAY_JOB_ID} with ${NUM_TASKS} tasks."
+echo "Chunks dir: $OUTPUT_DIR"
+echo "Manifest: $MANIFEST"