diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..faca9fc --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +__pycache__/ +.hypothesis/ +.pytest_cache/ +.coverage +SpatialOneHD.log +ENACT.log +cache/* +.ipynb_checkpoints/ +src/.ipynb_checkpoints/ +templates/.ipynb_checkpoints/ +data/ +ENACT_supporting_files/ +ENACT_supporting_files.zip +.idea/* +.checkmarx/scan_results/* +src/cache/* +src/cache-pathologist/* +src/binned_outputs/* +src/binned_outputs-mouse/* \ No newline at end of file diff --git a/ENACT_demo.ipynb b/ENACT_demo.ipynb new file mode 100644 index 0000000..ced9c81 --- /dev/null +++ b/ENACT_demo.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "88dfe185-575f-484b-87a5-662d54a8aa14", + "metadata": {}, + "source": [ + "## ENACT Demo Notebook - Human Colorectal Cancer" + ] + }, + { + "cell_type": "markdown", + "id": "ef3a94f5-4189-4c46-b4fa-570989cb78e9", + "metadata": {}, + "source": [ + "This notebook provides a demo for running ENACT on the Human Colorectal Cancer sample provided on 10X Genomics' website." + ] + }, + { + "cell_type": "markdown", + "id": "31994db6-6997-4124-a4d5-bf09dbf64f69", + "metadata": {}, + "source": [ + "### Download VisiumHD data from the 10X Genomics website" + ] + }, + { + "cell_type": "markdown", + "id": "e56081b4-2eb0-45e4-9f46-7ed118b51551", + "metadata": {}, + "source": [ + "Whole slide image: full resolution tissue image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "712a9e76-d7e1-4cc1-b0ae-afad223a1713", + "metadata": {}, + "outputs": [], + "source": [ + "!curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_tissue_image.btf" + ] + }, + { + "cell_type": "markdown", + "id": "bfa4bd8e-4b4d-4593-b5f2-8cc881c1a2b1", + "metadata": {}, + "source": [ + "Visium HD output file. The transcript counts are provided in a .tar.gz file that needs to be extracted:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f7bc5a4-6f56-4ffa-8b1c-9c178d5c6022", + "metadata": {}, + "outputs": [], + "source": [ + "!curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz\n", + "!tar -xvzf Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz" + ] + }, + { + "cell_type": "markdown", + "id": "c838e47e-0e91-4462-a099-ff061cd4f94f", + "metadata": {}, + "source": [ + "Locate the following two files from the extracted outputs file. 
These are the files we will use later as input to ENACT.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08ea9a56-14c5-4ebc-bb09-e7535bbc1fee", + "metadata": {}, + "outputs": [], + "source": [ + ".\n", + "└── binned_outputs/\n", + " └── square_002um/\n", + " ├── filtered_feature_bc_matrix.h5 <---- Transcript counts file (2um resolution)\n", + " └── spatial/\n", + " └── tissue_positions.parquet <---- Bin locations relative to the full resolution image\n" + ] + }, + { + "cell_type": "markdown", + "id": "8d760ee8-f5a0-4a0f-ace6-c0b91176f4e1", + "metadata": {}, + "source": [ + "### Install ENACT\n", + "This will install the ENACT package and its dependencies.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d555ae41-0776-4047-bfe2-1ee3ebc475bb", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install enact" + ] + }, + { + "cell_type": "markdown", + "id": "39736162-ba27-435f-8dc6-876b2f507315", + "metadata": {}, + "source": [ + "### Access and update the `configs.yaml` file" + ] + }, + { + "cell_type": "markdown", + "id": "56f81b7a-e58e-498b-8589-0fd9cfc82c08", + "metadata": {}, + "source": [ + "To run the ENACT pipeline, you will need a configuration file that specifies all the required settings. You can download the template configuration file from the GitHub repository.\n", + "\n", + "Refer to [Defining ENACT Configurations](https://github.com/Sanofi-OneAI/oneai-dda-spatialtr-enact/tree/release/ospo-new?tab=readme-ov-file#defining-enact-configurations) for a full list of parameters to configure." + ] + }, + { + "cell_type": "markdown", + "id": "19207eb1-f22a-48d7-80ba-08b6fc118872", + "metadata": {}, + "source": [ + "#### Step 1\n", + "Download the `configs.yaml` template from the `config` folder of [this repository](https://github.com/Sanofi-OneAI/oneai-dda-spatialtr-enact), and save it in your working directory." + ] + }, + { + "cell_type": "markdown", + "id": "8996161d-5164-4931-bfdc-ca0065686d44", + "metadata": {}, + "source": [ + "#### Step 2\n", + "Edit the input file locations in `configs.yaml` to the downloaded Visium HD files' location." + ] + }, + { + "cell_type": "markdown", + "id": "ff15bcbd-681e-4947-8277-cffc30f69df4", + "metadata": {}, + "source": [ + "```yaml\n", + "analysis_name: \"demo-colon\" \n", + "cache_dir: \"enact_output\" \n", + "paths:\n", + " wsi_path: \"Visium_HD_Human_Colon_Cancer_tissue_image.btf\" \n", + " visiumhd_h5_path: \"binned_outputs/square_002um/filtered_feature_bc_matrix.h5\" \n", + " tissue_positions_path: \"binned_outputs/square_002um/spatial/tissue_positions.parquet\" \n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "81daa91f-e34e-4018-8a46-89ddb9b6cf99", + "metadata": {}, + "source": [ + "#### Step 3\n", + "Next, we set all the steps in the `configs.yaml` file to `True`, in order to run the whole ENACT pipeline later" + ] + }, + { + "cell_type": "markdown", + "id": "8a4eb5a6-2436-4cf3-8b52-e06d431fc3a0", + "metadata": {}, + "source": [ + "```yaml\n", + "steps:\n", + " segmentation: True \n", + " bin_to_geodataframes: True \n", + " bin_to_cell_assignment: True \n", + " cell_type_annotation: True \n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "4af04737-6ece-431f-b5d5-1eaefe63efca", + "metadata": {}, + "source": [ + "#### Step 4\n", + "Lastly, choose the `bin_to_cell_method` and `cell_annotation_method` we want to run with. 
In this demo, we will go with `\"weighted_by_area\"`, and `\"celltypist\"`.\n", + "\n", + "To run Celltypist as our cell annotation method, we also need to input the `cell_typist_model` parameter based on the type of sample we use." + ] + }, + { + "cell_type": "markdown", + "id": "9b495d82-cfce-4973-aed0-84aec7d2ac31", + "metadata": {}, + "source": [ + "```yaml\n", + " params:\n", + " bin_to_cell_method: \"weighted_by_area\" \n", + " cell_annotation_method: \"celltypist\" \n", + " cell_typist_model: \"Human_Colorectal_Cancer.pkl\" \n", + " seg_method: \"stardist\" \n", + " patch_size: 4000 \n", + " use_hvg: True \n", + " n_hvg: 1000 \n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "13a165e4-7f63-4cfd-80ed-52a0823692f9", + "metadata": {}, + "source": [ + "### Run ENACT" + ] + }, + { + "cell_type": "markdown", + "id": "2aadbd97-ddf2-4252-9bb0-c59ba4600c4c", + "metadata": {}, + "source": [ + "Running ENACT on the whole sample image will take around 40 minutes. Output of the pipeline will be stored in the `\"enact_output\"` directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "393087e7-9598-4ebe-a628-14cc0ac673a8", + "metadata": {}, + "outputs": [], + "source": [ + "from enact.pipeline import ENACT\n", + "import yaml\n", + "\n", + "configs_path = \"config/configs.yaml\" # Change this to the location of the configs.yaml file that you just edited\n", + "with open(configs_path, \"r\") as stream:\n", + " configs = yaml.safe_load(stream)\n", + "\n", + "so_hd = ENACT(configs)\n", + "so_hd.run_enact()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..1187623 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,7 @@ +**Copyright Sanofi 2024** + +Permission is hereby granted, free of charge, for academic research purposes only and for non-commercial uses only, to any person from academic research or non-profit organizations obtaining a copy of this software and associated documentation files (the "Software"), to use, copy, modify, or merge the Software, subject to the following conditions: this permission notice shall be included in all copies of the Software or of substantial portions of the Software. + +For purposes of this license, “non-commercial use” excludes uses foreseeably resulting in a commercial benefit. To use this software for other purposes (such as the development of a commercial product, including but not limited to software, service, or pharmaceuticals, or in a collaboration with a private company), please contact SANOFI at patent.gos@sanofi.com. + +All other rights are reserved. The Software is provided “as is”, without warranty of any kind, express or implied, including the warranties of noninfringement. 
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..454d350
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include README.md
+include LICENSE.md
+include config/configs.yaml
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1f41784
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,25 @@
+ENV_DIR := /home/oneai/envs/
+
+PY_ENV_NAME := enact_py_env
+
+PY_ENV_PATH := $(ENV_DIR)$(PY_ENV_NAME)
+
+CONFIG_PATH ?= config/configs.yaml
+
+create-env:
+	conda create --prefix $(PY_ENV_PATH) python=3.9
+
+run_enact:
+	bash setup_py_env.sh $(PY_ENV_PATH)
+	bash run_enact.sh $(PY_ENV_PATH) ${CONFIG_PATH}
+
+setup_py_env:
+	bash setup_py_env.sh $(PY_ENV_PATH)
+
+run_cell_ann_eval:
+	bash setup_py_env.sh $(PY_ENV_PATH)
+	bash run_cell_ann_eval.sh $(PY_ENV_PATH)
+
+reproduce_results:
+	bash setup_py_env.sh $(PY_ENV_PATH)
+	bash reproduce_paper_results.sh $(PY_ENV_PATH)
\ No newline at end of file
diff --git a/README.md b/README.md
index 92f8eaa..a10d49b 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,244 @@
-# enact-pipeline
\ No newline at end of file
+# ENACT: End-to-End Analysis and Cell Type Annotation for Visium High Definition (HD) Slides
+
+Spatial transcriptomics (ST) enables the study of gene expression within its spatial context in histopathology samples. To date, a limiting factor has been the resolution of sequencing-based ST products. The introduction of the Visium High Definition (HD) technology opens the door to cell-resolution ST studies. However, challenges remain in accurately mapping transcripts to cells and in assigning cell types based on spot data.
+
+ENACT is the first tissue-agnostic pipeline that integrates advanced cell segmentation with Visium HD transcriptomics data to infer cell types across whole tissue sections. Our pipeline incorporates novel bin-to-cell assignment methods, enhancing the accuracy of single-cell transcript estimates. Validated on diverse synthetic and real datasets, our approach demonstrates high effectiveness at predicting cell types and scalability, offering a robust solution for spatially resolved transcriptomics analysis.
+
+This repository contains the code for inferring cell types from the sub-cellular transcript counts provided by VisiumHD.
+
+This can be achieved through the following steps:
+
+1. **Cell segmentation**: Segment the high-resolution image using NN-based image segmentation networks such as Stardist.
+2. **Bin-to-cell assignment**: Obtain cell-wise transcript counts by aggregating the VisiumHD bins that are associated with each cell.
+3. **Cell type inference**: Use the cell-wise transcript counts to infer the cell labels/phenotypes using methods developed for single-cell RNA-seq analysis ([CellAssign](https://www.nature.com/articles/s41592-019-0529-1#:~:text=CellAssign%20uses%20a%20probabilistic%20model%20to%20assign%20single) or [CellTypist](https://pubmed.ncbi.nlm.nih.gov/35549406/#:~:text=To%20systematically%20resolve%20immune%20cell%20heterogeneity%20across%20tissues,) or [Sargent](https://www.sciencedirect.com/science/article/pii/S2215016123001966#:~:text=We%20present%20Sargent,%20a%20transformation-free,%20cluster-free,%20single-cell%20annotation) if installed) or novel approaches, and use comprehensive cell marker databases as a reference ([Panglao](https://panglaodb.se/index.html) or [CellMarker](http://xteam.xbio.top/CellMarker/)).
+
+>[!NOTE]
+>At this time, Sargent is not available on GitHub.
+>For information on how to access [Sargent](https://doi.org/10.1016/j.mex.2023.102196) (doi: https://doi.org/10.1016/j.mex.2023.102196), please contact the paper's corresponding author (nima.nouri@sanofi.com). We provide the results obtained by Sargent in [ENACT's Zenodo page](https://zenodo.org/records/13887921) under the following folders:
+>- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/naive/sargent_results/
+>- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/weighted_by_area/sargent_results/
+>- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/weighted_by_transcript/sargent_results/
+>- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/weighted_by_cluster/sargent_results/
+
+
+![plot](figs/pipelineflow.png)
+
+## Index of Instructions:
+- [System Requirements](#system-requirements)
+- [Input Files for ENACT](#input-files-for-enact)
+- [Defining ENACT Configurations](#defining-enact-configurations)
+- [Output Files for ENACT](#output-files-for-enact)
+- [Running ENACT on VisiumHD Data](#running-enact-on-visiumhd-data)
+- [Running ENACT on Public VisiumHD Datasets](#running-enact-on-public-visiumhd-datasets)
+- [Reproducing Paper Results](#reproducing-paper-results)
+- [Running ENACT from Notebook](#running-enact-from-notebook)
+- [Creating Synthetic VisiumHD Datasets](#creating-synthetic-visiumhd-datasets)
+
+## System Requirements
+ENACT was tested with the following specifications:
+* Hardware Requirements: 32 CPUs, 64 GB RAM, 100 GB disk (disk and memory requirements may vary depending on the whole slide image size; smaller whole slide images need significantly less memory)
+
+* Software: Python 3.9, (Optional) GPU (CUDA 11)
+
+## Input Files for ENACT
+ENACT requires only three files, which can be obtained from SpaceRanger's outputs for each experiment:
+
+1. **Full resolution tissue image**. This will be segmented to obtain the cell boundaries that will be used to aggregate the transcript counts.
+2. **tissue_positions.parquet**. This is the file that specifies the *2um* Visium HD bin locations relative to the full resolution image.
+3. **filtered_feature_bc_matrix.h5**. This is the .h5 file with the *2um* Visium HD bin counts.
+
+## Defining ENACT Configurations
+All of ENACT's configurations are specified in `config/configs.yaml`:
+```yaml
+analysis_name: <---- custom name for the analysis. A folder with that name will be created to store the results
+run_synthetic: False <---- True to run bin-to-cell assignment on a synthetic dataset, False otherwise
+cache_dir: <---- path to store pipeline outputs
+paths:
+  wsi_path: <---- path to the whole slide image
+  visiumhd_h5_path: <---- location of the 2um x 2um gene by bin file (filtered_feature_bc_matrix.h5) from 10X Genomics
+  tissue_positions_path: <---- location of the tissue_positions.parquet file from 10X Genomics
+steps:
+  segmentation: True <---- True to run cell segmentation
+  bin_to_geodataframes: True <---- True to convert bins to GeoDataFrames
+  bin_to_cell_assignment: True <---- True to run bin-to-cell assignment
+  cell_type_annotation: True <---- True to run cell type annotation
+params:
+  bin_to_cell_method: "weighted_by_cluster" <---- bin-to-cell assignment method. Pick one of ["naive", "weighted_by_area", "weighted_by_gene", "weighted_by_cluster"]
+  cell_annotation_method: "celltypist" <---- cell annotation method. Pick one of ["cellassign", "celltypist"]
+  cell_typist_model: "Human_Colorectal_Cancer.pkl" <---- CellTypist model weights to use. Update based on the organ of interest if cell_annotation_method is set to "celltypist"
+  seg_method: "stardist" <---- cell segmentation method. Stardist is the only option for now
+  patch_size: 4000 <---- defines the patch size. The full resolution image will be broken into patches of this size. Reduce if you run into memory issues
+  use_hvg: True <---- True to only run the analysis on the top n highly variable genes. Setting it to False runs ENACT on all genes in the counts file
+  n_hvg: 1000 <---- number of highly variable genes to use
+  n_clusters: 4 <---- number of cell clusters to use for the "weighted_by_cluster" method. Default is 4.
+cell_markers: <---- cell-gene markers to use for cell annotation. Only applicable if params/cell_annotation_method is "cellassign" or "sargent"
+  Epithelial: ["CDH1","EPCAM","CLDN1","CD2"]
+  Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"]
+  Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"]
+```
+
+## Output Files for ENACT
+ENACT outputs all its results under the `cache` directory, which gets created automatically at run time:
+```
+.
+└── cache/
+    └── <analysis_name>/
+        ├── chunks/
+        │   ├── bins_gdf/
+        │   │   └── patch_<patch_id>.csv
+        │   ├── cells_gdf/
+        │   │   └── patch_<patch_id>.csv
+        │   └── <bin_to_cell_method>/
+        │       ├── bin_to_cell_assign/
+        │       │   └── patch_<patch_id>.csv
+        │       ├── cell_ix_lookup/
+        │       │   └── patch_<patch_id>.csv
+        │       └── <cell_annotation_method>_results/
+        │           ├── cells_adata.csv
+        │           └── merged_results.csv
+        └── cells_df.csv
+```
+ENACT breaks down the full resolution image into "chunks" (or patches) of size `patch_size`. Results are provided per chunk under the `chunks` directory.
+* `bins_gdf`: Folder containing GeoPandas dataframes representing the 2um Visium HD bins within a given patch
+* `cells_gdf`: Folder containing GeoPandas dataframes representing cells segmented in the tissue
+* `<bin_to_cell_method>/bin_to_cell_assign`: Folder containing dataframes with the transcripts assigned to each cell
+* `<bin_to_cell_method>/cell_ix_lookup`: Folder containing dataframes defining the indices and coordinates of the cells
+* `<bin_to_cell_method>/<cell_annotation_method>_results/cells_adata.csv`: AnnData object containing the results from ENACT (cell coordinates, cell types, transcript counts)
+* `<bin_to_cell_method>/<cell_annotation_method>_results/merged_results.csv`: Dataframe (.csv) containing the results from ENACT (cell coordinates, cell types)
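+
+As a quick check after a run, the merged results can be inspected with pandas. The snippet below is a minimal sketch; the cache directory, analysis name, bin-to-cell method, and annotation method are placeholders that should match the values in your `configs.yaml`:
+```python
+import os
+import pandas as pd
+
+# Placeholder values - set these to match your configs.yaml
+cache_dir = "cache"
+analysis_name = "colon-demo"
+bin_to_cell_method = "weighted_by_cluster"
+cell_annotation_method = "celltypist"
+
+results_path = os.path.join(
+    cache_dir, analysis_name, "chunks", bin_to_cell_method,
+    f"{cell_annotation_method}_results", "merged_results.csv",
+)
+results_df = pd.read_csv(results_path)
+
+# One row per cell: coordinates (cell_x, cell_y) and the inferred cell_type
+print(results_df[["cell_x", "cell_y", "cell_type"]].head())
+print(results_df["cell_type"].value_counts())
+```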
+
+## Running ENACT on VisiumHD Data
+This section provides a guide on running ENACT on your own data.
+### Step 1: Clone ENACT repository
+```
+git clone https://github.com/Sanofi-OneAI/oneai-dda-spatialtr-enact.git
+cd oneai-dda-spatialtr-enact
+```
+### Step 2: Setup Python environment
+Start by defining the location and the name of the Conda environment in the `Makefile`:
+```
+ENV_DIR := /home/oneai/envs/ <---- Conda environment location
+PY_ENV_NAME := enact_py_env <---- Conda environment name
+```
+Next, run the following Make command to create a Conda environment with all of ENACT's dependencies:
+```
+make setup_py_env
+```
+
+### Step 3: Define the Location of ENACT's Required Files
+Define the locations of ENACT's required files in the `config/configs.yaml` file. Refer to [Input Files for ENACT](#input-files-for-enact).
+```yaml
+analysis_name: <---- custom name for the analysis. A folder with that name will be created to store the results
+cache_dir: <---- path to store pipeline outputs
+paths:
+  wsi_path: <---- path to the whole slide image
+  visiumhd_h5_path: <---- location of the 2um x 2um gene by bin file (filtered_feature_bc_matrix.h5) from 10X Genomics
+  tissue_positions_path: <---- location of the tissue_positions.parquet file from 10X Genomics
+```
+
+### Step 4: Define ENACT configurations
+Define the following core parameters in the `config/configs.yaml` file:
+```yaml
+params:
+  bin_to_cell_method: "weighted_by_cluster" <---- bin-to-cell assignment method. Pick one of ["naive", "weighted_by_area", "weighted_by_gene", "weighted_by_cluster"]
+  cell_annotation_method: "celltypist" <---- cell annotation method. Pick one of ["cellassign", "celltypist", "sargent" (if installed)]
+  cell_typist_model: "Human_Colorectal_Cancer.pkl" <---- CellTypist model weights to use. Update based on the organ of interest if cell_annotation_method is set to "celltypist"
+```
+Refer to [Defining ENACT Configurations](#defining-enact-configurations) for a full list of parameters to configure. If using CellTypist, set `cell_typist_model` to one of the following models based on the organ and species under study: [CellTypist models](https://www.celltypist.org/models#:~:text=CellTypist%20was%20first%20developed%20as%20a%20platform%20for).
+
+### Step 5: Define Cell Gene Markers (Only applies if cell_annotation_method is "cellassign" or "sargent")
+Define the cell gene markers in the `config/configs.yaml` file. These can be expert-annotated or obtained from open-source databases such as [Panglao](https://panglaodb.se/index.html) or [CellMarker](http://xteam.xbio.top/CellMarker/). Example cell markers for human colorectal cancer samples:
+```yaml
+cell_markers:
+  Epithelial: ["CDH1","EPCAM","CLDN1","CD2"]
+  Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"]
+  Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"]
+  Enteroendocrine cells: ["NUCB2", "FABP5", "CPE", "ALCAM", "GCG", "SST", "CHGB", "IAPP", "CHGA", "ENPP2"]
+  Crypt cells: ["HOPX", "SLC12A2", "MSI1", "SMOC2", "OLFM4", "ASCL2", "PROM1", "BMI1", "EPHB2", "LRIG1"]
+  Endothelial: ["PECAM1","CD34","KDR","CDH5","PROM1","PDPN","TEK","FLT1","VCAM1","PTPRC","VWF","ENG","MCAM","ICAM1","FLT4"]
+  Fibroblast: ["COL1A1","COL3A1","COL5A2","PDGFRA","ACTA2","TCF21","FN"]
+  Smooth muscle cell: ["BGN","MYL9","MYLK","FHL2","ITGA1","ACTA2","EHD2","OGN","SNCG","FABP4"]
+  B cells: ["CD74", "HMGA1", "CD52", "PTPRC", "HLA-DRA", "CD24", "CXCR4", "SPCS3", "LTB", "IGKC"]
+  T cells: ["JUNB", "S100A4", "CD52", "PFN1P1", "CD81", "EEF1B2P3", "CXCR4", "CREM", "IL32", "TGIF1"]
+  NK cells: ["S100A4", "IL32", "CXCR4", "FHL2", "IL2RG", "CD69", "CD7", "NKG7", "CD2", "HOPX"]
+```
+### Step 6: Run ENACT
+```
+make run_enact
+```
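+
+Alternatively, ENACT can be launched directly from Python (mirroring the demo notebook), assuming the `enact` package is installed and `config/configs.yaml` is the file edited above:
+```python
+from enact.pipeline import ENACT
+import yaml
+
+with open("config/configs.yaml", "r") as stream:
+    configs = yaml.safe_load(stream)
+
+so_hd = ENACT(configs)
+so_hd.run_enact()
+```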
+
+## Running ENACT on Public VisiumHD Datasets
+This section provides a guide for running ENACT on the [Human Colorectal Cancer sample](https://www.10xgenomics.com/datasets/visium-hd-cytassist-gene-expression-libraries-of-human-crc) provided on 10X Genomics' website.
+### Step 1: Download the necessary files from the 10X Genomics website:
+
+1. Whole slide image: full resolution tissue image
+```
+curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_tissue_image.btf
+```
+
+2. Visium HD output file. The transcript counts are provided in a .tar.gz file that needs to be extracted:
+```
+curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz
+tar -xvzf Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz
+```
+Locate the following two files in the extracted outputs folder.
+```
+.
+└── binned_outputs/
+    └── square_002um/
+        ├── filtered_feature_bc_matrix.h5 <---- Transcript counts file (2um resolution)
+        └── spatial/
+            └── tissue_positions.parquet <---- Bin locations relative to the full resolution image
+```
+
+### Step 2: Update input file locations under `config/configs.yaml`
+
+```yaml
+analysis_name: "colon-demo" <---- custom name for the analysis. A folder with that name will be created to store the results
+paths:
+  wsi_path: "/Visium_HD_Human_Colon_Cancer_tissue_image.btf" <---- whole slide image path
+  visiumhd_h5_path: "/binned_outputs/square_002um/filtered_feature_bc_matrix.h5" <---- location of the 2um x 2um gene by bin file (filtered_feature_bc_matrix.h5) from 10X Genomics
+  tissue_positions_path: "/binned_outputs/square_002um/spatial/tissue_positions.parquet" <---- location of the tissue_positions.parquet file from 10X Genomics
+```
+
+### Step 3: Follow the steps under [Running ENACT on VisiumHD Data](#running-enact-on-visiumhd-data)
+
+## Reproducing Paper Results
+This section provides a guide on how to reproduce the ENACT paper results on the [10X Genomics Human Colorectal Cancer VisiumHD sample](https://www.10xgenomics.com/datasets/visium-hd-cytassist-gene-expression-libraries-of-human-crc).
+### Step 1: Clone ENACT repository
+```
+git clone https://github.com/Sanofi-OneAI/oneai-dda-spatialtr-enact.git
+cd oneai-dda-spatialtr-enact
+```
+### Step 2: Setup Python environment
+Start by defining the location and the name of the Conda environment in the `Makefile`:
+```
+ENV_DIR := /home/oneai/envs/ <---- Conda environment location
+PY_ENV_NAME := enact_py_env <---- Conda environment name
+```
+Next, run the following Make command to create a Conda environment with all of ENACT's dependencies:
+```
+make setup_py_env
+```
+### Step 3: Run the following command, which will download all the supplementary files from [ENACT's Zenodo page](https://zenodo.org/records/13887921) and programmatically run ENACT with various combinations of bin-to-cell assignment methods and cell annotation algorithms:
+```
+make reproduce_results
+```
+## Running ENACT from Notebook
+The [demo notebook](ENACT_demo.ipynb) provides a step-by-step guide on how to run ENACT on VisiumHD public data from a notebook.
+
+## Creating Synthetic VisiumHD Datasets
+
+1. To create a synthetic VisiumHD dataset from Xenium or seqFISH+ data, run and follow the instructions of the notebooks in [src/synthetic_data](src/synthetic_data).
+
+2. To run the ENACT pipeline with the synthetic data, set the following parameter in the `config/configs.yaml` file:
+
+```yaml
+run_synthetic: True <---- True to run bin-to-cell assignment on a synthetic dataset, False otherwise.
+```
+
+3. Run ENACT:
+```
+make run_enact
+```
diff --git a/config/configs.yaml b/config/configs.yaml
new file mode 100644
index 0000000..d72f58d
--- /dev/null
+++ b/config/configs.yaml
@@ -0,0 +1,50 @@
+analysis_name: "colon-demo"
+run_synthetic: False # True if you want to run bin to cell assignment on synthetic dataset, False otherwise.
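+# NOTE: the cache_dir and paths below follow the ENACT_supporting_files layout from Zenodo;
+# update them to point to your own output directory and downloaded Visium HD input files.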
+cache_dir: "//enact-pipeline/ENACT_supporting_files/output_files" +paths: + wsi_path: "//enact-pipeline/ENACT_supporting_files/public_data/human_colorectal/input_files/Visium_HD_Human_Colon_Cancer_tissue_image.btf" + visiumhd_h5_path: "//enact-pipeline/ENACT_supporting_files/public_data/human_colorectal/input_files/filtered_feature_bc_matrix.h5" + tissue_positions_path: "//enact-pipeline/ENACT_supporting_files/public_data/human_colorectal/input_files/tissue_positions.parquet" +steps: + segmentation: True # True if you want to run segmentation + bin_to_geodataframes: True # True to convert bin to geodataframes + bin_to_cell_assignment: True # True to assign cells to bins + cell_type_annotation: True # True to run cell type annotation +params: + seg_method: "stardist" # Stardist is the only option for now + patch_size: 4000 # Defines the patch size. The whole resolution image will be broken into patches of this size + bin_representation: "polygon" # or point TODO: Remove support for anything else + bin_to_cell_method: "weighted_by_cluster" # or naive + cell_annotation_method: "celltypist" + cell_typist_model: "Human_Colorectal_Cancer.pkl" + use_hvg: True # Only run analysis on highly variable genes + cell markers specified + n_hvg: 1000 # Number of highly variable genes to use + n_clusters: 4 + chunks_to_run: [ + # "patch_2_0.csv", + # "patch_2_1.csv", + ] +cell_markers: + # # Human Colon + Epithelial: ["CDH1","EPCAM","CLDN1","CD2"] + Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"] + Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"] + Enteroendocrine cells: ["NUCB2", "FABP5", "CPE", "ALCAM", "GCG", "SST", "CHGB", "IAPP", "CHGA", "ENPP2"] + Crypt cells: ["HOPX", "SLC12A2", "MSI1", "SMOC2", "OLFM4", "ASCL2", "PROM1", "BMI1", "EPHB2", "LRIG1"] + Endothelial: ["PECAM1","CD34","KDR","CDH5","PROM1","PDPN","TEK","FLT1","VCAM1","PTPRC","VWF","ENG","MCAM","ICAM1","FLT4"] + Fibroblast: ["COL1A1","COL3A1","COL5A2","PDGFRA","ACTA2","TCF21","FN"] + Smooth muscle cell: ["BGN","MYL9","MYLK","FHL2","ITGA1","ACTA2","EHD2","OGN","SNCG","FABP4"] + B cells: ["CD74", "HMGA1", "CD52", "PTPRC", "HLA-DRA", "CD24", "CXCR4", "SPCS3", "LTB", "IGKC"] + T cells: ["JUNB", "S100A4", "CD52", "PFN1P1", "CD81", "EEF1B2P3", "CXCR4", "CREM", "IL32", "TGIF1"] + NK cells: ["S100A4", "IL32", "CXCR4", "FHL2", "IL2RG", "CD69", "CD7", "NKG7", "CD2", "HOPX"] + + # # Mouse intestine + # Enterocytes: ["Cbr1", "Plin2", "Gls", "Plin3", "Dab1", "Pmepa1", "Acsl5", "Hmox1", "Abcg2", "Cd36"] + # Goblet cells: ["Manf", "Krt7", "Ccl9", "Muc13", "Phgr1", "Cdx2", "Aqp3", "Creb3L1", "Guca2A", "Klk1"] + # Enteroendocrine cells: ["Fabp5", "Cpe", "Enpp2", "Chgb", "Alcam", "Chga", "Pax6", "Neurod1", "Cck", "Isl1"] + # Paneth cells: ["Gpx2", "Fabp4", "Lyz1", "Kcnn4", "Lgals2", "Guca2B", "Lgr4", "Defa24", "Il4Ra", "Guca2A"] + # Crypt cells: ["Prom1", "Hopx", "Msi1", "Olfm4", "Kcne3", "Bmi1", "Axin2", "Kcnq1", "Ascl2", "Lrig1"] + # Smooth muscle cells: ["Bgn", "Myl9", "Pcp4L1", "Itga1", "Nrp2", "Mylk", "Ehd2", "Fabp4", "Acta2", "Ogn"] + # B cells: ["Cd52", "Bcl11A", "Ebf1", "Cd74", "Ptprc", "Pold4", "Ighm", "Cd14", "Creld2", "Fli1"] + # T cells: ["Cd81", "Junb", "Cd52", "Ptprcap", "H2-Q7", "Ccl6", "Bcl2", "Maff", "Ccl4", "Ccl3"] + # NK cells: ["Ctla2A", "Ccl4", "Cd3G", "Ccl3", "Nkg7", "Lat", "Dusp2", "Itgam", "Fhl2", "Ccl5"] diff --git a/figs/pipelineflow.png b/figs/pipelineflow.png new file mode 100644 index 0000000..432e83d Binary files /dev/null and 
b/figs/pipelineflow.png differ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b0da4e0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,73 @@ +[build-system] +requires = ["hatchling", "setuptools", "wheel"] +build-backend = "hatchling.build" + +[project] +name = "enact" +version = "0.1" +description = "ENACT is the first tissue-agnostic pipeline that integrates advanced cell segmentation with Visium HD transcriptomics data to infer cell types across whole tissue sections. This pipeline incorporates novel bin-to-cell assignment methods, enhancing the accuracy of single-cell transcript estimates." +license ={ file = "LICENSE.md" } +readme = "README.md" +requires-python = ">=3.9" +keywords = ["spatial", "omics", "bioinformatics", "transcriptomics", "VisiumHD", ] +authors = [ + { name = "Mena Kamel", email = "mena.kamel@sanofi.com" }, + { name = "Yiwen Song", email = "yiwen.song@sanofi.com" }, +] +classifiers = [ + + "Programming Language :: Python", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +# Core dependencies required for running the ENACT pipeline +dependencies = [ + "anndata==0.10.8", + "fastparquet==2024.5.0", + "shapely==2.0.5", + "stardist==0.9.1", + "tifffile==2024.7.24", + "scvi-tools==1.1.6", + "scanpy==1.10.2", + "geopandas==1.0.1", + "tensorflow==2.17.0", + "plotly==5.24.0", + "imagecodecs==2024.9.22", + "pyyaml==6.0", + "pandas", + "numpy", + "tqdm", + "Pillow", + "scipy", + "celltypist-SO==1.6.5" +] + +# Documentation and other URLs related to the project +[project.urls] +Documentation = "https://github.com/Sanofi-OneAI/oneai-dda-spatialtr-enact#readme" +Issues = "" +Source = "https://github.com/Sanofi-OneAI/oneai-dda-spatialtr-enact" + +# Scripts and linting tools +[tool.hatch.scripts] +check = "mypy --install-types --non-interactive {args:src/enact tests}" + +[tool.hatch.build.targets.wheel] +packages = ["src/enact"] + +[tool.setuptools.packages.find] +where = ["src"] +include = ["enact"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if TYPE_CHECKING:", +] + +# Include important files like README and LICENSE +[tool.setuptools] +include-package-data = true \ No newline at end of file diff --git a/reproduce_paper_results.sh b/reproduce_paper_results.sh new file mode 100644 index 0000000..fa5939e --- /dev/null +++ b/reproduce_paper_results.sh @@ -0,0 +1,32 @@ +#!/bin/bash +eval "$(conda shell.bash hook)" + +set -e + +PY_ENV_PATH=$1 + +conda activate $PY_ENV_PATH + +FILE_URL="https://zenodo.org/records/13887921/files/ENACT_supporting_files.zip" +OUTPUT_FILE="ENACT_supporting_files.zip" + +# Download ENACT supporting files if they are not present +if [ -f "$OUTPUT_FILE" ]; then + echo "$OUTPUT_FILE already exists. Skipping download." +else + echo "$OUTPUT_FILE is downloading." + wget -O $OUTPUT_FILE $FILE_URL + unzip $OUTPUT_FILE +fi + + +# Need to add step to download files from Zenodo to ENACT_supporting_files (in repo home directory) +# Run ENACT pipeline to test all combinations of Bin-to-cell assignment and cell annotation methods - Order of experiments matters, don't change! 
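+# Each invocation below pairs one bin-to-cell assignment method (naive, weighted_by_area,
+# weighted_by_transcript, weighted_by_cluster) with one cell annotation method
+# (celltypist, cellassign), using the pre-built config files shipped in the Zenodo
+# supporting files.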
+python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/naive-celltypist.yaml +python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/naive-cellassign.yaml +python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_area-celltypist.yaml +python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_area-cellassign.yaml +python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_transcript-celltypist.yaml +python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_transcript-cellassign.yaml +python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_cluster-celltypist.yaml +python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_cluster-cellassign.yaml diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4f9ea76 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +anndata==0.10.8 +fastparquet==2024.5.0 +shapely==2.0.5 +stardist==0.9.1 +tifffile==2024.7.24 +scvi-tools==1.1.6 +celltypist-SO==1.6.5 +scanpy==1.10.2 +geopandas==1.0.1 +tensorflow==2.17.0 +plotly==5.24.0 +imagecodecs==2024.9.22 + +pytest==7.3.2 +pytest-cov==4.1.0 diff --git a/run_cell_ann_eval.sh b/run_cell_ann_eval.sh new file mode 100644 index 0000000..6dc35c4 --- /dev/null +++ b/run_cell_ann_eval.sh @@ -0,0 +1,11 @@ +#!/bin/bash +eval "$(conda shell.bash hook)" + +set -e + +PY_ENV_PATH=$1 + +# Run ENACT pipeline +conda activate $PY_ENV_PATH +python -m src.eval.cell_annotation_eval + diff --git a/run_enact.sh b/run_enact.sh new file mode 100644 index 0000000..12d2b1d --- /dev/null +++ b/run_enact.sh @@ -0,0 +1,11 @@ +#!/bin/bash +eval "$(conda shell.bash hook)" + +set -e + +PY_ENV_PATH=$1 +CONFIG_PATH=$2 + +# Run ENACT pipeline +conda activate $PY_ENV_PATH +python -m src.enact.pipeline --configs_path "$CONFIG_PATH" \ No newline at end of file diff --git a/setup_py_env.sh b/setup_py_env.sh new file mode 100644 index 0000000..5711e3e --- /dev/null +++ b/setup_py_env.sh @@ -0,0 +1,14 @@ +#!/bin/bash +eval "$(conda shell.bash hook)" + +set -e + +PY_ENV_PATH=$1 + +# Create Python environment +if ! conda info --envs | grep -q "$PY_ENV_PATH"; then + echo "Environment $PY_ENV_PATH does not exist. Creating..." + conda create --prefix $PY_ENV_PATH python=3.9 + conda activate $PY_ENV_PATH + pip install -r requirements.txt +fi diff --git a/src/enact/__init__.py b/src/enact/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/enact/assignment_methods/__init__.py b/src/enact/assignment_methods/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/enact/assignment_methods/naive.py b/src/enact/assignment_methods/naive.py new file mode 100644 index 0000000..855ccad --- /dev/null +++ b/src/enact/assignment_methods/naive.py @@ -0,0 +1,8 @@ +# Naive method. Only using the bins unique to each cell (overlapping bins omitted) + + +def naive_assignment(result_spatial_join): + # Naive method. 
Only using the bins unique to each cell (overlapping bins omitted) + result_spatial_join = result_spatial_join[result_spatial_join["unique_bin"]] + result_spatial_join["weight"] = 1 + return result_spatial_join diff --git a/src/enact/assignment_methods/weight_by_area.py b/src/enact/assignment_methods/weight_by_area.py new file mode 100644 index 0000000..a3fdbb1 --- /dev/null +++ b/src/enact/assignment_methods/weight_by_area.py @@ -0,0 +1,46 @@ +# Weighted by area method +import anndata +import numpy as np +from scipy import sparse + + +def apply_weights_to_adata_counts(adata): + """Applies the weights to the counts matrix + + Args: + adata (AnnData): Counts AnnData + + Returns: + AnnData: Weighted-adjusted AnnData + """ + weight = adata.obs["weight"] + # Reshape weights to (130000, 1) for broadcasting + weight = np.array(weight) + weight = weight[:, np.newaxis] + + # OPTIMIZATION + # Perform element-wise multiplication + weighted_counts = adata.X.multiply(weight) + + # convert back to sparse + adata.X = sparse.csr_matrix(weighted_counts) + return adata + + +def weight_by_area_assignment(result_spatial_join, expanded_adata, cell_gdf_chunk): + # Calculate overlapping area between cell and bin + result_spatial_join["area"] = result_spatial_join.apply( + lambda row: row["geometry"] + .intersection(cell_gdf_chunk.loc[row["index_right"], "geometry"]) + .area, + axis=1, + ) + bin_area = result_spatial_join.iloc[0]["geometry"].area + result_spatial_join["weight"] = result_spatial_join["area"] / bin_area + result_spatial_join.loc[ + result_spatial_join["unique_bin"], + "weight", + ] = 1 + expanded_adata.obs["weight"] = result_spatial_join["weight"].tolist() + expanded_adata = apply_weights_to_adata_counts(expanded_adata) + return result_spatial_join, expanded_adata diff --git a/src/enact/assignment_methods/weight_by_gene.py b/src/enact/assignment_methods/weight_by_gene.py new file mode 100644 index 0000000..a3d55cc --- /dev/null +++ b/src/enact/assignment_methods/weight_by_gene.py @@ -0,0 +1,151 @@ +# Weighted by area method +import anndata +import numpy as np +import pandas as pd +from scipy import sparse +from tqdm import tqdm +from sklearn.cluster import KMeans + + +def apply_weights_to_adata_counts(expanded_adata, weights_df): + """Applies the weights to the counts matrix + + Args: + adata (AnnData): Counts AnnData + + Returns: + AnnData: Weighted-adjusted AnnData + """ + if weights_df.empty: + return expanded_adata + # Applying the weighting + mask = (expanded_adata.obs_names.isin(weights_df.index)) & ( + expanded_adata.obs["id"].isin(weights_df["id"]) + ) + indices = np.where(mask)[0] + # Apply weights to the entries in the expression matrix + weights_matrix = np.ones(expanded_adata.shape) + for idx in tqdm(indices, total=len(indices)): + bin_id = expanded_adata.obs.iloc[idx]["index"] + cell_id = expanded_adata.obs.iloc[idx]["id"] + bin_rows = weights_df.loc[bin_id] + weights = bin_rows[bin_rows["id"] == cell_id][expanded_adata.var_names] + weights_matrix[idx] = weights.iloc[0].tolist() + weighted_counts = expanded_adata.X.multiply(weights_matrix) + # convert back to sparse + expanded_adata.X = sparse.csr_matrix(weighted_counts) + return expanded_adata + + +def weight_by_gene_assignment( + result_spatial_join, expanded_adata, unique_cell_by_gene_adata +): + # Getting the gene counts of the cells (unique signature for each cell) + gene_counts_non_overlap = ( + pd.DataFrame( + unique_cell_by_gene_adata.X.toarray(), + index=unique_cell_by_gene_adata.obs_names, + 
columns=unique_cell_by_gene_adata.var_names, + ) + .groupby(unique_cell_by_gene_adata.obs["id"]) + .sum() + .reset_index() + ) + + # Getting the bins that overlap with multiple cells + overlapping_bins = result_spatial_join[~result_spatial_join["unique_bin"]] + + # Getting a table of bins with the parent cell and the parent cell's gene content + overlap_merge = pd.merge( + overlapping_bins[["index", "id"]], gene_counts_non_overlap, on="id", how="left" + ) + overlap_merge.set_index("index", inplace=True) + + # Grouping the bins by the bin id + grouped_overlap = overlap_merge.groupby("index") + + # Initialize progress bar for processing overlapping bins + pbar = tqdm(grouped_overlap, desc="Processing overlapping bins", unit="bin") + gene_columns = overlap_merge.columns.drop(["id"]).tolist() + weights_list = [] + # Looping through the bins and splitting the counts + for bin_index, group_rows in pbar: + # getting total gene counts from the cells that share a bin + gene_total = group_rows[gene_columns].sum(axis=0) + # Dividing the cells gene counts by the total gene counts to get the weight + gene_weights = group_rows[gene_columns].div(gene_total, axis=1).fillna(0) + gene_weights["id"] = group_rows["id"] + weights_list.append(gene_weights) + # Getting a weights dataframe + if weights_list: + weights_df = pd.concat(weights_list, axis=0) + else: + weights_df = pd.DataFrame() + pbar.close() + expanded_adata = apply_weights_to_adata_counts(expanded_adata, weights_df) + return result_spatial_join, expanded_adata + + +def weight_by_cluster_assignment( + result_spatial_join, expanded_adata, unique_cell_by_gene_adata, n_clusters=4 +): + # Getting the gene counts of the cells (unique signature for each cell) + gene_counts_non_overlap = ( + pd.DataFrame( + unique_cell_by_gene_adata.X.toarray(), + index=unique_cell_by_gene_adata.obs_names, + columns=unique_cell_by_gene_adata.var_names, + ) + .groupby(unique_cell_by_gene_adata.obs["id"]) + .sum() + .reset_index() + ) + + # Getting the bins that overlap with multiple cells + overlapping_bins = result_spatial_join[~result_spatial_join["unique_bin"]] + + gene_columns = gene_counts_non_overlap.columns.drop(["id"]).tolist() + + # clustering on gene counts from non-overlapping bins + n_clusters = np.min([n_clusters, len(gene_counts_non_overlap)]) + kmeans = KMeans(n_clusters=n_clusters, random_state=0) + clusters = kmeans.fit_predict(gene_counts_non_overlap[gene_columns]) + gene_counts_non_overlap["cluster"] = clusters + cluster_means = gene_counts_non_overlap.groupby("cluster")[gene_columns].mean() + + # Getting a table of bins with the parent cell and the parent cell's gene content + overlap_merge = pd.merge( + overlapping_bins[["index", "id"]], gene_counts_non_overlap, on="id", how="left" + ) + # merge cluster mean gene counts with overlapping bins - + # using cluster gene counts instead of the bins's gene counts + overlap_merge = pd.merge( + overlap_merge[["index", "id", "cluster"]], + cluster_means, + left_on="cluster", + right_index=True, + how="left", + ) + overlap_merge.set_index("index", inplace=True) + + grouped_overlap = overlap_merge.groupby("index") + + # Initialize progress bar for processing overlapping bins + pbar = tqdm(grouped_overlap, desc="Processing overlapping bins", unit="bin") + weights_list = [] + # Looping through the bins and splitting the counts + for bin_index, group_rows in pbar: + # getting total gene counts from the cells that share a bin + gene_total = group_rows[gene_columns].sum(axis=0) + # Dividing the cells gene counts by the 
total gene counts to get the weight + gene_weights = group_rows[gene_columns].div(gene_total, axis=1).fillna(0) + gene_weights["id"] = group_rows["id"] + weights_list.append(gene_weights) + # Getting a weights dataframe + if weights_list: + weights_df = pd.concat(weights_list, axis=0) + else: + weights_df = pd.DataFrame() + pbar.close() + expanded_adata = apply_weights_to_adata_counts(expanded_adata, weights_df) + return result_spatial_join, expanded_adata diff --git a/src/enact/cellassign.py b/src/enact/cellassign.py new file mode 100644 index 0000000..8f6bda2 --- /dev/null +++ b/src/enact/cellassign.py @@ -0,0 +1,88 @@ +"""Class for defining methods to package pipeline outputs into AnnData objects +""" + +import os +import pandas as pd +import anndata +import scanpy as sc +import scvi +import seaborn as sns +from scvi.external import CellAssign +import numpy as np +import torch + +from .pipeline import ENACT + +seed = 42 + + +class CellAssignPipeline(ENACT): + """Class for running CellAssign algorithm""" + + def __init__(self, configs): + super().__init__(configs) + self.configs = configs + + def format_markers_to_df(self): + """Method to format marker genes to a pandas dataframe + num gene x num cell_types + """ + markers_dict = self.configs["cell_markers"] + genes_set = set([item for sublist in markers_dict.values() for item in sublist]) + markers_df = pd.DataFrame(columns=markers_dict.keys(), index=sorted(genes_set)) + markers_df = markers_df.fillna(0) + for cell_type, gene_markers in markers_dict.items(): + markers_df.loc[gene_markers, cell_type] = 1 + self.markers_df = markers_df + + def run_cell_assign(self): + """Runs CellAssign""" + bin_assign_df = self.merge_files(self.bin_assign_dir, save=False) + cell_lookup_df = self.merge_files(self.cell_ix_lookup_dir, save=False) + + spatial_cols = ["cell_x", "cell_y"] + stat_columns = ["num_shared_bins", "num_unique_bins", "num_transcripts"] + cell_lookup_df.loc[:, "id"] = cell_lookup_df["id"].astype(str) + cell_lookup_df = cell_lookup_df.set_index("id") + cell_lookup_df["num_transcripts"] = cell_lookup_df["num_transcripts"].fillna(0) + + bin_assign_df.index = cell_lookup_df.index + bin_assign_df = bin_assign_df.drop(columns=["Unnamed: 0"]) + bin_assign_df = bin_assign_df.fillna(0).astype(int) + adata = anndata.AnnData(bin_assign_df.astype(int)) + + adata.obsm["spatial"] = cell_lookup_df[spatial_cols].astype(int) + adata.obsm["stats"] = cell_lookup_df[stat_columns].astype(int) + + lib_size = adata.X.sum(1) + adata.obs["size_factor"] = lib_size / np.mean(lib_size) + adata.obs["lib_size"] = lib_size + + marker_gene_mat = self.markers_df.copy() + marker_gene_mat = marker_gene_mat.loc[ + sorted(list(set(self.markers_df.index) & set(bin_assign_df.columns))) + ] + bdata = adata[:, marker_gene_mat.index].copy() + + torch.manual_seed(seed) + scvi.external.CellAssign.setup_anndata(bdata, size_factor_key="size_factor") + model = CellAssign(bdata, marker_gene_mat, random_b_g_0=False) + model.train() + predictions = model.predict() + + bdata.obs["cell_type"] = predictions.idxmax(axis=1).values + bdata.obs[adata.obsm["spatial"].columns] = adata.obsm["spatial"] + bdata.obs[adata.obsm["stats"].columns] = adata.obsm["stats"] + bdata.obs["chunk_name"] = cell_lookup_df["chunk_name"] + bdata.obs.to_csv( + os.path.join(self.cellannotation_results_dir, "merged_results.csv") + ) + print( + f"saved to : {os.path.join(self.cellannotation_results_dir, 'merged_results.csv')}" + ) + + +if __name__ == "__main__": + # Creating CellAssignPipeline object + cell_assign = 
CellAssignPipeline(configs_path="config/configs.yaml") + cell_assign.format_markers_to_df() diff --git a/src/enact/celltypist.py b/src/enact/celltypist.py new file mode 100644 index 0000000..da7536c --- /dev/null +++ b/src/enact/celltypist.py @@ -0,0 +1,72 @@ +"""Class for defining methods to package pipeline outputs into AnnData objects +""" + +import os +import pandas as pd +import anndata +import scanpy as sc +import seaborn as sns +import numpy as np + +## Attempt to import celltypist, and prompt installation if not found +import celltypist +from celltypist import models + +from .pipeline import ENACT + + +class CellTypistPipeline(ENACT): + """Class for running CellAssign algorithm""" + + def __init__(self, configs): + super().__init__(configs) + + def run_cell_typist(self): + """Runs CellTypist""" + bin_assign_df = self.merge_files(self.bin_assign_dir, save=False) + cell_lookup_df = self.merge_files(self.cell_ix_lookup_dir, save=False) + + spatial_cols = ["cell_x", "cell_y"] + stat_columns = ["num_shared_bins", "num_unique_bins", "num_transcripts"] + cell_lookup_df.loc[:, "id"] = cell_lookup_df["id"].astype(str) + cell_lookup_df = cell_lookup_df.set_index("id") + cell_lookup_df["num_transcripts"] = cell_lookup_df["num_transcripts"].fillna(0) + + bin_assign_df.index = cell_lookup_df.index + bin_assign_df = bin_assign_df.drop(columns=["Unnamed: 0"]) + bin_assign_df = bin_assign_df.fillna(0).astype(int) + adata = anndata.AnnData(bin_assign_df.astype(int)) + + adata.obsm["spatial"] = cell_lookup_df[spatial_cols].astype(int) + adata.obsm["stats"] = cell_lookup_df[stat_columns].astype(int) + adata.obs + + lib_size = adata.X.sum(1) + adata.obs["size_factor"] = lib_size / np.mean(lib_size) + adata.obs["lib_size"] = lib_size + + # normalize adata to the log1p normalised format (to 10,000 counts per cell) + sc.pp.normalize_total(adata, target_sum=1e4) + sc.pp.log1p(adata) + + # download celltypist model and predict cell type + models.download_models(model=self.cell_typist_model) + predictions = celltypist.annotate(adata, model=self.cell_typist_model) + adata = predictions.to_adata( + insert_labels=True, insert_conf=True, insert_prob=True + ) + + adata.obs.rename(columns={"predicted_labels": "cell_type"}, inplace=True) + adata.obs[adata.obsm["spatial"].columns] = adata.obsm["spatial"] + adata.obs[adata.obsm["stats"].columns] = adata.obsm["stats"] + adata.obs["chunk_name"] = cell_lookup_df["chunk_name"] + results_df = adata.obs.drop(columns=adata.obs["cell_type"].unique().tolist()) + results_df.to_csv( + os.path.join(self.cellannotation_results_dir, "merged_results.csv") + ) + + +if __name__ == "__main__": + # Creating CellAssignPipeline object + cell_typist = CellTypistPipeline(configs_path="config/configs.yaml") + cell_typist.run_cell_typist() diff --git a/src/enact/package_results.py b/src/enact/package_results.py new file mode 100644 index 0000000..0c5331b --- /dev/null +++ b/src/enact/package_results.py @@ -0,0 +1,147 @@ +"""Class for defining methods to package pipeline outputs into AnnData objects +""" + +import os + +import anndata +import pandas as pd + +# import squidpy as sq + +from .pipeline import ENACT + + +class PackageResults(ENACT): + """Class for packaging ENACT pipeline outputs""" + + def __init__(self, configs): + super().__init__(configs) + self.files_to_ignore = [ + "merged_results.csv", + "merged_results_old.csv", + "cells_adata.h5", + ".ipynb_checkpoints", + ] + self.configs = configs + + def merge_cellassign_output_files(self): + """Merges the CellAssign results with gene 
counts + + Returns: + _type_: _description_ + """ + chunks = os.listdir(self.bin_assign_dir) + cell_by_gene_list = [] + for chunk_name in chunks: + if chunk_name in self.files_to_ignore: + continue + index_lookup = pd.read_csv( + os.path.join(self.cell_ix_lookup_dir, chunk_name) + ) + trancript_counts = pd.read_csv( + os.path.join(self.bin_assign_dir, chunk_name) + ).drop(columns=["Unnamed: 0"]) + cell_by_gene_chunk = pd.concat( + [index_lookup["id"], trancript_counts], axis=1 + ) + cell_by_gene_list.append(cell_by_gene_chunk) + cell_by_gene_df = pd.concat(cell_by_gene_list, axis=0) + return cell_by_gene_df + + def merge_sargent_output_files(self): + """Merges the Sargent chunk results into a single results file + + Returns: + _type_: _description_ + """ + os.makedirs(self.sargent_results_dir, exist_ok=True) + # Merge the sargent_results_chunks data and gene_to_cell_assignment_chunks_ix_lookup + chunks = os.listdir(self.sargent_results_dir) + sargent_results_list = [] + cell_by_gene_list = [] + for chunk_name in chunks: + if chunk_name in self.files_to_ignore: + continue + cell_labels = pd.read_csv( + os.path.join(self.sargent_results_dir, chunk_name) + ) + index_lookup = pd.read_csv( + os.path.join(self.cell_ix_lookup_dir, chunk_name) + ) + trancript_counts = pd.read_csv( + os.path.join(self.bin_assign_dir, chunk_name) + ).drop(columns=["Unnamed: 0"]) + + sargent_result_chunk = pd.concat([index_lookup, cell_labels["x"]], axis=1) + cell_by_gene_chunk = pd.concat( + [index_lookup["id"], trancript_counts], axis=1 + ) + sargent_result_chunk.drop("Unnamed: 0", axis=1, inplace=True) + sargent_results_list.append(sargent_result_chunk) + cell_by_gene_list.append(cell_by_gene_chunk) + sargent_results_df = pd.concat(sargent_results_list, axis=0) + sargent_results_df = sargent_results_df.rename(columns={"x": "cell_type"}) + cell_by_gene_df = pd.concat(cell_by_gene_list, axis=0) + sargent_results_df.to_csv( + os.path.join(self.sargent_results_dir, "merged_results.csv"), index=False + ) + return sargent_results_df, cell_by_gene_df + + def df_to_adata(self, results_df, cell_by_gene_df): + """Converts pd.DataFrame object with pipeline results to AnnData + + Args: + results_df (_type_): _description_ + + Returns: + anndata.AnnData: Anndata with pipeline outputs + """ + file_columns = results_df.columns + spatial_cols = ["cell_x", "cell_y"] + stat_columns = ["num_shared_bins", "num_unique_bins", "num_transcripts"] + results_df.loc[:, "id"] = results_df["id"].astype(str) + results_df = results_df.set_index("id") + results_df["num_transcripts"] = results_df["num_transcripts"].fillna(0) + results_df["cell_type"] = results_df["cell_type"].str.lower() + adata = anndata.AnnData(cell_by_gene_df.set_index("id").astype(int)) + + # adata = anndata.AnnData(results_df[stat_columns].astype(int)) + adata.obsm["spatial"] = results_df[spatial_cols].astype(int) + adata.obsm["stats"] = results_df[stat_columns].astype(int) + # This column is the output of cell type inference pipeline + adata.obs["cell_type"] = results_df[["cell_type"]].astype("category") + adata.obs["patch_id"] = results_df[["chunk_name"]] + return adata + + # def run_neighborhood_enrichment(self, adata): + # """Sample function to run Squidpy operations on AnnData object + + # Args: + # adata (_type_): _description_ + + # Returns: + # _type_: _description_ + # """ + # sq.gr.spatial_neighbors(adata) + # sq.gr.nhood_enrichment(adata, cluster_key="cell_type") + # return adata + + def save_adata(self, adata): + """Save the anndata object to disk + + Args: + 
adata (_type_): _description_ + """ + adata.write( + os.path.join(self.cellannotation_results_dir, "cells_adata.h5"), + compression="gzip", + ) + + +if __name__ == "__main__": + # Creating ENACT object + so_hd = PackageResults(configs_path="config/configs.yaml") + results_df, cell_by_gene_df = so_hd.merge_sargent_output_files() + adata = so_hd.df_to_adata(results_df, cell_by_gene_df) + # adata = so_hd.run_neighborhood_enrichment(adata) # Example integration with SquiPy + so_hd.save_adata(adata) diff --git a/src/enact/pipeline.py b/src/enact/pipeline.py new file mode 100644 index 0000000..6cb7e50 --- /dev/null +++ b/src/enact/pipeline.py @@ -0,0 +1,942 @@ +"""Class for defining methods for VisiumHD pipeline +""" + +import os +from csbdeep.utils import normalize +import geopandas as gpd +import numpy as np +import pandas as pd +from PIL import Image +import scanpy as sc +from scipy import sparse +import shapely +from shapely.geometry import Polygon, Point +from shapely import wkt +from stardist.models import StarDist2D +import tifffile as tifi +from tqdm import tqdm +import yaml +import anndata +import scvi +import seaborn as sns +from scvi.external import CellAssign +import logging +import ssl +import argparse + +logger = logging.getLogger(__name__) +logging.basicConfig(filename="ENACT.log", level=logging.INFO) + +Image.MAX_IMAGE_PIXELS = None +from .assignment_methods.naive import naive_assignment +from .assignment_methods.weight_by_area import weight_by_area_assignment +from .assignment_methods.weight_by_gene import ( + weight_by_gene_assignment, + weight_by_cluster_assignment, +) + + +class ENACT: + """Class for methods for the ENACT pipeline""" + + def __init__(self, configs): + """Inputs: + + Args: + wsi_path (str): whole slide image path + """ + self.configs = configs + self.load_configs() + + def load_configs(self): + """Loading the configuations and parameters""" + self.analysis_name = self.configs.get("analysis_name", "enact_demo") + if not self.configs.get("cache_dir"): + raise ValueError(f"Error: Please specify the 'cache_dir'") + cache_dir = self.configs["cache_dir"] + self.cache_dir = os.path.join(cache_dir, self.analysis_name) + + # Load input files + core_paths = ["wsi_path", "visiumhd_h5_path", "tissue_positions_path"] + for core_path in core_paths: + if not self.configs.get("paths") or not self.configs["paths"].get( + core_path + ): + raise ValueError( + f"Error: '{core_path}' is required in 'paths' configuration." + ) + self.wsi_path = self.configs["paths"]["wsi_path"] + self.visiumhd_h5_path = self.configs["paths"]["visiumhd_h5_path"] + self.tissue_positions_path = self.configs["paths"]["tissue_positions_path"] + + # Load parameters + parameters = self.configs.get("params", {}) + self.seg_method = parameters.get("seg_method", "stardist") + self.patch_size = parameters.get( + "patch_size", 4000 + ) # Will break down the segmented cells file into chunks of this size to fit into memory + self.n_clusters = parameters.get("n_clusters", 4) + self.bin_representation = parameters.get("bin_representation", "polygon") + self.bin_to_cell_method = parameters.get( + "bin_to_cell_method", "weighted_by_cluster" + ) + self.cell_annotation_method = parameters.get( + "cell_annotation_method", "celltypist" + ) + if self.cell_annotation_method == "celltypist": + if not parameters.get("cell_typist_model"): + raise ValueError( + f"Error: '{cell_typist_model}' is required in 'params' configuration." 
+ ) + self.cell_typist_model = self.configs["params"]["cell_typist_model"] + self.run_synthetic = self.configs.get("run_synthetic", False) + + # Load steps + steps = self.configs.get("steps", {}) + self.segmentation = steps.get("segmentation", True) + self.bin_to_geodataframes = steps.get("bin_to_geodataframes", True) + self.bin_to_cell_assignment = steps.get("bin_to_cell_assignment", True) + self.cell_type_annotation = steps.get("cell_type_annotation", True) + + # Generating paths + self.cells_df_path = os.path.join(self.cache_dir, "cells_df.csv") + self.cells_layer_path = os.path.join(self.cache_dir, "cells_layer.png") + self.cell_chunks_dir = os.path.join(self.cache_dir, "chunks", "cells_gdf") + self.bin_chunks_dir = os.path.join(self.cache_dir, "chunks", "bins_gdf") + self.bin_assign_dir = os.path.join( + self.cache_dir, "chunks", self.bin_to_cell_method, "bin_to_cell_assign" + ) + self.cell_ix_lookup_dir = os.path.join( + self.cache_dir, "chunks", self.bin_to_cell_method, "cell_ix_lookup" + ) + os.makedirs(self.cache_dir, exist_ok=True) + self.cellannotation_results_dir = os.path.join( + self.cache_dir, + "chunks", + self.bin_to_cell_method, + f"{self.cell_annotation_method}_results", + ) + os.makedirs(self.cellannotation_results_dir, exist_ok=True) + os.makedirs(self.cell_chunks_dir, exist_ok=True) + + def load_image(self, file_path=None): + """Load image from given file path + Arguments: + file_path {string} : path to the file that we are trying to load + Returns: + np.array -- loaded image as numpy array + """ + if file_path == None: + file_path = self.wsi_path + img_arr = tifi.imread(file_path) + crop_bounds = self.get_image_crop_bounds() + x_min, y_min, x_max, y_max = crop_bounds + img_arr = img_arr[y_min:y_max, x_min:x_max, :] + logger.info(" Successfully loaded image!") + return img_arr, crop_bounds + + def get_image_crop_bounds(self, file_path=None): + """Get the crop location of the image to adjust the coordinates accordingly + + Args: + file_path (_type_): _description_ + + Returns: + _type_: _description_ + """ + if file_path == None: + file_path = self.wsi_path + tissue_pos_list = pd.read_parquet(self.tissue_positions_path) + + # Cleaning up, removing negative coords,removing out of tissue bins + tissue_pos_list_filt = tissue_pos_list[tissue_pos_list.in_tissue == 1] + tissue_pos_list_filt["pxl_row_in_fullres"] = tissue_pos_list_filt[ + "pxl_row_in_fullres" + ].astype(int) + tissue_pos_list_filt["pxl_col_in_fullres"] = tissue_pos_list_filt[ + "pxl_col_in_fullres" + ].astype(int) + tissue_pos_list_filt = tissue_pos_list_filt.loc[ + (tissue_pos_list_filt.pxl_row_in_fullres >= 0) + & (tissue_pos_list_filt.pxl_col_in_fullres >= 0) + ] + x_min = tissue_pos_list_filt["pxl_col_in_fullres"].min() + y_min = tissue_pos_list_filt["pxl_row_in_fullres"].min() + x_max = tissue_pos_list_filt["pxl_col_in_fullres"].max() + y_max = tissue_pos_list_filt["pxl_row_in_fullres"].max() + return (x_min, y_min, x_max, y_max) + + def normalize_image(self, image, min_percentile=5, max_percentile=95): + """_summary_ + + Args: + image (_type_): _description_ + min_percentile (int, optional): _description_. Defaults to 5. + max_percentile (int, optional): _description_. Defaults to 95. 
+ + Returns: + _type_: _description_ + """ + # Adjust min_percentile and max_percentile as needed + image_norm = normalize(image, min_percentile, max_percentile) + logger.info(" Successfully normalized image!") + return image_norm + + def segment_cells(self, image, prob_thresh=0.005): + """_summary_ + + Args: + image (_type_): _description_ + prob_thresh (float, optional): _description_. Defaults to 0.005. + + Returns: + _type_: _description_ + """ + if self.seg_method == "stardist": + # Adjust nms_thresh and prob_thresh as needed + # ssl._create_default_https_context = ssl._create_unverified_context + self.stardist_model = StarDist2D.from_pretrained("2D_versatile_he") + labels, polys = self.stardist_model.predict_instances_big( + image, + axes="YXC", + block_size=4096, + prob_thresh=prob_thresh, + nms_thresh=0.001, + min_overlap=128, + context=128, + normalizer=None, + n_tiles=(4, 4, 1), + ) + logger.info(" Successfully segmented cells!") + return labels, polys + else: + logger.warning(" Invalid cell segmentation model!") + return None, None + + def convert_stardist_output_to_gdf(self, cell_polys, save_path=None): + """Convert stardist output to geopandas dataframe + + Args: + cell_polys (_type_): _description_ + save_path (_type_, optional): _description_. Defaults to None. + """ + if save_path == None: + save_path = self.cells_df_path + # Creating a list to store Polygon geometries + geometries = [] + centroids = [] + cell_x, cell_y = [], [] + + # Iterating through each nuclei in the 'polys' DataFrame + for nuclei in range(len(cell_polys["coord"])): + # Extracting coordinates for the current nuclei and converting them to (y, x) format + coords = [ + (y, x) + for x, y in zip( + cell_polys["coord"][nuclei][0], cell_polys["coord"][nuclei][1] + ) + ] + # Creating a Polygon geometry from the coordinates + poly = Polygon(coords) + centroid = list(poly.centroid.coords)[0] + centroids.append(centroid) + geometries.append(poly) + cell_x.append(centroid[0]) + cell_y.append(centroid[1]) + + # Creating a GeoDataFrame using the Polygon geometries + gdf = gpd.GeoDataFrame(geometry=geometries) + gdf["id"] = [f"ID_{i+1}" for i, _ in enumerate(gdf.index)] + gdf["cell_x"] = cell_x + gdf["cell_y"] = cell_y + gdf["centroid"] = centroids + # Save results to disk + gdf.to_csv(save_path) + return gdf + + def split_df_to_chunks(self, df, x_col, y_col, output_dir): + """ + Break the cells df into files, of size patch_size x patch_size + + Args: + df (_type_): _description_ + """ + os.makedirs(output_dir, exist_ok=True) + i = 0 + # Need to divide into chunks of patch_size pixels by patch_size pixels + df[["patch_x", "patch_y"]] = (df[[x_col, y_col]] / self.patch_size).astype(int) + df["patch_id"] = df["patch_x"].astype(str) + "_" + df["patch_y"].astype(str) + logger.info( + f" Splitting into chunks. 
output_dir: {output_dir}" + ) + unique_patches = df.patch_id.unique() + for patch_id in tqdm(unique_patches, total=len(unique_patches)): + patch_cells = df[df.patch_id == patch_id] + if len(patch_cells) == 0: + continue + patch_cells.to_csv(os.path.join(output_dir, f"patch_{patch_id}.csv")) + + def load_visiumhd_dataset(self, crop_bounds): + """Loads the VisiumHD dataset and adjusts the + coordinates to the cropped image + + Args: + crop_bounds (tuple): crop bounds + + Returns: + AnnData: AnnData object with the VisiumHD data + int: bin size in pixels + """ + + # Accounting for crop bounds + if crop_bounds is not None: + x1, y1, _, _ = crop_bounds + else: + x1, y1 = (0, 0) + # Load Visium HD data + adata = sc.read_10x_h5(self.visiumhd_h5_path) + + # Load the Spatial Coordinates + df_tissue_positions = pd.read_parquet(self.tissue_positions_path) + + # Set the index of the dataframe to the barcodes + df_tissue_positions = df_tissue_positions.set_index("barcode") + + # Create an index in the dataframe to check joins + df_tissue_positions["index"] = df_tissue_positions.index + + # *Important step*: Representing coords in the cropped WSI frame + df_tissue_positions["pxl_row_in_fullres"] = ( + df_tissue_positions["pxl_row_in_fullres"] - y1 + ) + df_tissue_positions["pxl_col_in_fullres"] = ( + df_tissue_positions["pxl_col_in_fullres"] - x1 + ) + # Adding the tissue positions to the meta data + adata.obs = pd.merge( + adata.obs, df_tissue_positions, left_index=True, right_index=True + ) + + first_row = df_tissue_positions[ + (df_tissue_positions["array_row"] == 0) + & (df_tissue_positions["array_col"] == 0) + ]["pxl_col_in_fullres"] + second_row = df_tissue_positions[ + (df_tissue_positions["array_row"] == 0) + & (df_tissue_positions["array_col"] == 1) + ]["pxl_col_in_fullres"] + bin_size = second_row[0] - first_row[0] + if self.configs["params"]["use_hvg"]: + # Keeping the top n highly variable genes + the user requested cell markers + n_genes = self.configs["params"]["n_hvg"] + # Normalizing to median total counts + adata_norm = adata.copy() + sc.pp.normalize_total(adata_norm) + # Logarithmize the data + sc.pp.log1p(adata_norm) + sc.pp.highly_variable_genes(adata_norm, n_top_genes=n_genes) + + hvg_mask = adata_norm.var["highly_variable"] + cell_markers = [ + item + for sublist in self.configs["cell_markers"].values() + for item in sublist + ] + missing_markers = set(cell_markers) - set(hvg_mask.index) + logger.info( + f" Missing the following markers: {missing_markers}" + ) + available_markers = list(set(cell_markers) & set(hvg_mask.index)) + hvg_mask.loc[available_markers] = True + adata = adata[:, hvg_mask] + + return adata, bin_size + + def generate_bin_polys(self, bins_df, x_col, y_col, bin_size): + """Represents the bins as Shapely polygons + + Args: + bins_df (pd.DataFrame): bins dataframe + x_col (str): column with the bin centre x-coordinate + y_col (str): column with the bin centre y-coordinate + bin_size (int): bin size in pixels + + Returns: + list: list of Shapely polygons + """ + geometry = [] + # Generates Shapely polygons to represent each bin + if self.bin_representation == "point": + # Geometry column is just the centre (x, y) for a VisiumHD bin + geometry = [Point(xy) for xy in zip(bins_df[x_col], bins_df[y_col])] + elif self.bin_representation == "polygon": + logger.info( + f" Generating bin polygons. 
num_bins: {len(bins_df)}" + ) + half_bin_size = bin_size / 2 + bbox_coords = pd.DataFrame( + { + "min_x": bins_df[x_col] - half_bin_size, + "min_y": bins_df[y_col] - half_bin_size, + "max_x": bins_df[x_col] + half_bin_size, + "max_y": bins_df[y_col] + half_bin_size, + } + ) + geometry = [ + shapely.geometry.box(min_x, min_y, max_x, max_y) + for min_x, min_y, max_x, max_y in tqdm( + zip( + bbox_coords["min_x"], + bbox_coords["min_y"], + bbox_coords["max_x"], + bbox_coords["max_y"], + ), + total=len(bins_df), + ) + ] + else: + logger.warning(" Select a valid mode!") + return geometry + + def convert_adata_to_cell_by_gene(self, adata): + """Converts the AnnData object from bin-by-gene to + cell-by-gene AnnData object. + + Args: + adata (AnnData): bin-by-gene AnnData + + Returns: + AnnData: cell-by-gene AnnData + """ + # Group the data by unique cell IDs + groupby_object = adata.obs.groupby(["id"], observed=True) + + # Extract the gene expression counts from the AnnData object + counts = adata.X.tocsr() + + # Obtain the number of unique nuclei and the number of genes in the expression data + N_groups = groupby_object.ngroups + N_genes = counts.shape[1] + + # Initialize a sparse matrix to store the summed gene counts for each nucleus + summed_counts = sparse.lil_matrix((N_groups, N_genes)) + + # Lists to store the IDs of polygons and the current row index + polygon_id = [] + row = 0 + # Iterate over each unique polygon to calculate the sum of gene counts. + for polygons, idx_ in groupby_object.indices.items(): + summed_counts[row] = counts[idx_].sum(0) + row += 1 + polygon_id.append(polygons) + # Create an AnnData object from the summed count matrix + summed_counts = summed_counts.tocsr() + cell_by_gene_adata = anndata.AnnData( + X=summed_counts, + obs=pd.DataFrame(polygon_id, columns=["id"], index=polygon_id), + var=adata.var, + ) + return cell_by_gene_adata + + def generate_bins_gdf(self, adata, bin_size): + """Convert the bins Anndata object to a geodataframe + + Args: + adata (_type_): _description_ + + Returns: + _type_: _description_ + """ + bin_coords_df = adata.obs.copy() + geometry = self.generate_bin_polys( + bins_df=bin_coords_df, + x_col="pxl_col_in_fullres", + y_col="pxl_row_in_fullres", + bin_size=bin_size, + ) + bins_gdf = gpd.GeoDataFrame(bin_coords_df, geometry=geometry) + return bins_gdf + + def assign_bins_to_cells(self, adata): + """Assigns bins to cells based on method requested by the user + + Args: + adata (_type_): _description_ + """ + os.makedirs(self.bin_assign_dir, exist_ok=True) + os.makedirs(self.cell_ix_lookup_dir, exist_ok=True) + if self.configs["params"]["chunks_to_run"]: + chunk_list = self.configs["params"]["chunks_to_run"] + else: + chunk_list = os.listdir(self.cell_chunks_dir) + logger.info( + f" Assigning bins to cells using {self.bin_to_cell_method} method" + ) + for chunk in tqdm(chunk_list, total=len(chunk_list)): + if os.path.exists(os.path.join(self.cell_ix_lookup_dir, chunk)): + continue + # Loading the cells geodataframe + cell_gdf_chunk_path = os.path.join(self.cell_chunks_dir, chunk) + cell_gdf_chunk = gpd.GeoDataFrame(pd.read_csv(cell_gdf_chunk_path)) + cell_gdf_chunk["geometry"] = cell_gdf_chunk["geometry"].apply(wkt.loads) + cell_gdf_chunk.set_geometry("geometry", inplace=True) + # Loading the bins geodataframe + bin_gdf_chunk_path = os.path.join(self.bin_chunks_dir, chunk) + bin_gdf_chunk = gpd.GeoDataFrame(pd.read_csv(bin_gdf_chunk_path)) + bin_gdf_chunk["geometry"] = bin_gdf_chunk["geometry"].apply(wkt.loads) + 
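# Descriptive note (added for clarity, not in the original diff): the chunk CSVs
+            # serialize geometries as WKT strings, so they are parsed back into Shapely
+            # objects with wkt.loads and registered as the active geometry column before
+            # the bin-to-cell spatial join below.
+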
bin_gdf_chunk.set_geometry("geometry", inplace=True) + + # Perform a spatial join to check which coordinates are in a cell nucleus + result_spatial_join = gpd.sjoin( + bin_gdf_chunk, + cell_gdf_chunk[["geometry", "id", "cell_x", "cell_y"]], + how="left", + predicate="intersects", + ) + + # Only keeping the bins that overlap with a cell + result_spatial_join = result_spatial_join[ + ~result_spatial_join["index_right"].isna() + ] + + # Getting unique bins and overlapping bins + barcodes_in_overlaping_polygons = pd.unique( + result_spatial_join[result_spatial_join.duplicated(subset=["index"])][ + "index" + ] + ) + result_spatial_join["unique_bin"] = ~result_spatial_join["index"].isin( + barcodes_in_overlaping_polygons + ) + # Filter the adata object to contain only the barcodes in result_spatial_join + # shape: (#bins_overlap x #genes) + expanded_adata = adata[result_spatial_join["index"]] + # Adding the cell ids to the anndata object (the cell that the bin is assigned to) + # Can have duplicate bins (i.e. "expanded") if a bin is assigned to more than one cell + expanded_adata.obs["id"] = result_spatial_join["id"].tolist() + + # Reshape the anndata object to (#cells x #genes) + filtered_result_spatial_join = result_spatial_join[ + result_spatial_join["unique_bin"] + ] + filtered_adata = adata[filtered_result_spatial_join["index"]] + filtered_adata.obs["id"] = filtered_result_spatial_join["id"].tolist() + + unfilt_result_spatial_join = result_spatial_join.copy() + logger.info(" done spatial join") + + if self.bin_to_cell_method == "naive": + result_spatial_join = naive_assignment(result_spatial_join) + expanded_adata = filtered_adata.copy() + + elif self.bin_to_cell_method == "weighted_by_area": + result_spatial_join, expanded_adata = weight_by_area_assignment( + result_spatial_join, expanded_adata, cell_gdf_chunk + ) + + elif self.bin_to_cell_method == "weighted_by_gene": + unique_cell_by_gene_adata = self.convert_adata_to_cell_by_gene( + filtered_adata + ) + result_spatial_join, expanded_adata = weight_by_gene_assignment( + result_spatial_join, expanded_adata, unique_cell_by_gene_adata + ) + + elif self.bin_to_cell_method == "weighted_by_cluster": + unique_cell_by_gene_adata = self.convert_adata_to_cell_by_gene( + filtered_adata + ) + result_spatial_join, expanded_adata = weight_by_cluster_assignment( + result_spatial_join, + expanded_adata, + unique_cell_by_gene_adata, + n_clusters=self.configs["params"]["n_clusters"], + ) + else: + print("ERROR", self.bin_to_cell_method) + logger.info(" convert_adata_to_cell_by_gene") + cell_by_gene_adata = self.convert_adata_to_cell_by_gene(expanded_adata) + del expanded_adata + + # Save the gene to cell assignment results to a .csv file + chunk_gene_to_cell_assign_df = pd.DataFrame( + cell_by_gene_adata.X.toarray(), + columns=cell_by_gene_adata.var_names, + ) + + chunk_gene_to_cell_assign_df = chunk_gene_to_cell_assign_df.loc[ + :, ~chunk_gene_to_cell_assign_df.columns.duplicated() + ].copy() + # Saving counts to cache + chunk_gene_to_cell_assign_df.to_csv( + os.path.join(self.bin_assign_dir, chunk) + ) + + # Getting number of bins shared between cells + overlaps_df = ( + unfilt_result_spatial_join.groupby(["id", "unique_bin"]) + .count()["in_tissue"] + .reset_index() + ) + overlaps_df = overlaps_df.pivot( + index="id", columns="unique_bin", values="in_tissue" + ).fillna(0) + try: + overlaps_df.columns = ["num_shared_bins", "num_unique_bins"] + except: + overlaps_df.columns = ["num_unique_bins"] + overlaps_df["num_shared_bins"] = 0 + cell_gdf_chunk = 
cell_gdf_chunk.merge( + overlaps_df, how="left", left_on="id", right_index=True + ) + cell_gdf_chunk[["num_shared_bins", "num_unique_bins"]] = cell_gdf_chunk[ + ["num_shared_bins", "num_unique_bins"] + ].fillna(0) + # Save index lookup to store x and y values and cell index + index_lookup_df = cell_by_gene_adata.obs.merge( + cell_gdf_chunk, how="left", left_index=True, right_on="id" + )[ + ["cell_x", "cell_y", "num_shared_bins", "num_unique_bins", "id"] + ].reset_index( + drop=True + ) + index_lookup_df["num_transcripts"] = chunk_gene_to_cell_assign_df.sum( + axis=1 + ) + index_lookup_df["chunk_name"] = chunk + index_lookup_df.to_csv(os.path.join(self.cell_ix_lookup_dir, chunk)) + print( + f"{self.bin_to_cell_method} mean count per cell: {chunk_gene_to_cell_assign_df.sum(axis=1).mean()}" + ) + + def assign_bins_to_cells_synthetic(self): + """Assigns bins to cells based on method requested by the user + + Args: + adata (_type_): _description_ + """ + os.makedirs(self.bin_assign_dir, exist_ok=True) + os.makedirs(self.cell_ix_lookup_dir, exist_ok=True) + if self.configs["params"]["chunks_to_run"]: + chunk_list = self.configs["params"]["chunks_to_run"] + else: + chunk_list = os.listdir(self.cell_chunks_dir) + + logger.info( + f" Assigning bins to cells using {self.bin_to_cell_method} method" + ) + for chunk in tqdm(chunk_list, total=len(chunk_list)): + if os.path.exists(os.path.join(self.cell_ix_lookup_dir, chunk)): + continue + if chunk in [".ipynb_checkpoints"]: + continue + # Loading the cells geodataframe + cell_gdf_chunk_path = os.path.join(self.cell_chunks_dir, chunk) + cell_gdf_chunk = gpd.GeoDataFrame(pd.read_csv(cell_gdf_chunk_path)) + cell_gdf_chunk["geometry"] = cell_gdf_chunk["geometry"].apply(wkt.loads) + cell_gdf_chunk.set_geometry("geometry", inplace=True) + cell_gdf_chunk["geometry"] = cell_gdf_chunk["geometry"].buffer(0) + # Loading the bins geodataframe + bin_gdf_chunk_path = os.path.join(self.bin_chunks_dir, chunk) + bin_gdf_chunk = gpd.GeoDataFrame(pd.read_csv(bin_gdf_chunk_path)) + bin_gdf_chunk["geometry"] = bin_gdf_chunk["geometry"].apply(wkt.loads) + bin_gdf_chunk.set_geometry("geometry", inplace=True) + + # Perform a spatial join to check which coordinates are in a cell nucleus + result_spatial_join = gpd.sjoin( + bin_gdf_chunk[["geometry", "assigned_bin_id", "row", "column"]], + cell_gdf_chunk[["geometry", "cell_id"]], + how="left", + predicate="intersects", + ) + + # Only keeping the bins that overlap with a cell + result_spatial_join = result_spatial_join[ + ~result_spatial_join["index_right"].isna() + ] + + # Getting unique bins and overlapping bins + barcodes_in_overlaping_polygons = pd.unique( + result_spatial_join[ + result_spatial_join.duplicated(subset=["assigned_bin_id"]) + ]["assigned_bin_id"] + ) + result_spatial_join["unique_bin"] = ~result_spatial_join[ + "assigned_bin_id" + ].isin(barcodes_in_overlaping_polygons) + bin_gdf_chunk = bin_gdf_chunk.set_index("assigned_bin_id") + adata = anndata.AnnData( + bin_gdf_chunk.drop(columns=["geometry", "row", "column"]) + ) + + # Filter the adata object to contain only the barcodes in result_spatial_join + # shape: (#bins_overlap x #genes) + expanded_adata = adata[result_spatial_join["assigned_bin_id"]] + # Adding the cell ids to the anndata object (the cell that the bin is assigned to) + # Can have duplicate bins (i.e. 
"expanded") if a bin is assigned to more than one cell + expanded_adata.obs["id"] = result_spatial_join["cell_id"].tolist() + expanded_adata.obs["index"] = result_spatial_join[ + "assigned_bin_id" + ].tolist() + + # Reshape the anndata object to (#cells x #genes) + filtered_result_spatial_join = result_spatial_join[ + result_spatial_join["unique_bin"] + ] + filtered_adata = adata[filtered_result_spatial_join["assigned_bin_id"]] + filtered_adata.obs["id"] = filtered_result_spatial_join["cell_id"].tolist() + if not sparse.issparse(filtered_adata.X): + filtered_adata.X = sparse.csr_matrix(filtered_adata.X) + if not sparse.issparse(expanded_adata.X): + expanded_adata.X = sparse.csr_matrix(expanded_adata.X) + logger.info(" done spatial join") + + cell_gdf_chunk.rename(columns={"cell_id": "id"}, inplace=True) + result_spatial_join.rename( + columns={"assigned_bin_id": "index"}, inplace=True + ) + result_spatial_join.rename(columns={"cell_id": "id"}, inplace=True) + if self.bin_to_cell_method == "naive": + result_spatial_join = naive_assignment(result_spatial_join) + expanded_adata = filtered_adata.copy() + + elif self.bin_to_cell_method == "weighted_by_area": + # cell_gdf_chunk = cell_gdf_chunk.set_index('index_right') + result_spatial_join, expanded_adata = weight_by_area_assignment( + result_spatial_join, expanded_adata, cell_gdf_chunk + ) + + elif self.bin_to_cell_method == "weighted_by_gene": + unique_cell_by_gene_adata = self.convert_adata_to_cell_by_gene( + filtered_adata + ) + result_spatial_join, expanded_adata = weight_by_gene_assignment( + result_spatial_join, expanded_adata, unique_cell_by_gene_adata + ) + + elif self.bin_to_cell_method == "weighted_by_cluster": + unique_cell_by_gene_adata = self.convert_adata_to_cell_by_gene( + filtered_adata + ) + result_spatial_join, expanded_adata = weight_by_cluster_assignment( + result_spatial_join, + expanded_adata, + unique_cell_by_gene_adata, + n_clusters=self.configs["params"]["n_clusters"], + ) + + logger.info(" convert_adata_to_cell_by_gene") + + if not sparse.issparse(expanded_adata.X): + expanded_adata.X = sparse.csr_matrix(expanded_adata.X) + cell_by_gene_adata = self.convert_adata_to_cell_by_gene(expanded_adata) + del expanded_adata + + # Save the gene to cell assignment results to a .csv file + chunk_gene_to_cell_assign_df = pd.DataFrame( + cell_by_gene_adata.X.toarray(), + columns=cell_by_gene_adata.var_names, + ) + chunk_gene_to_cell_assign_df.insert( + 0, "id", cell_by_gene_adata.obs["id"].values + ) + + chunk_gene_to_cell_assign_df = chunk_gene_to_cell_assign_df.loc[ + :, ~chunk_gene_to_cell_assign_df.columns.duplicated() + ].copy() + + # Saving counts to cache + chunk_gene_to_cell_assign_df.to_csv( + os.path.join(self.bin_assign_dir, chunk) + ) + + print(f"{chunk} finished") + + def merge_files( + self, input_folder, output_file_name="merged_results.csv", save=True + ): + """Merges all files in a specified input folder into a single output file. + + Args: + input_folder (str): The path to the folder containing the input files to be merged. + output_file_name (str): The name of the output file. 
+ """ + # List to store the DataFrames + csv_list = [] + output_file = os.path.join(input_folder, output_file_name) + + if self.configs["params"]["chunks_to_run"]: + chunk_list = self.configs["params"]["chunks_to_run"] + else: + chunk_list = os.listdir(self.cell_chunks_dir) + # Loop through all files in the directory + for filename in chunk_list: + if filename in ["annotated.csv", ".ipynb_checkpoints"]: + continue + if "merged" in filename: + continue + + # Read each .csv file and append it to the list + file_path = os.path.join(input_folder, filename) + df = pd.read_csv(file_path) + csv_list.append(df) + + # Concatenate all DataFrames in the list + concatenated_df = pd.concat(csv_list, ignore_index=True) + + if save: + # Save the concatenated DataFrame to the output file + concatenated_df.to_csv(output_file, index=False) + logger.info( + f" files have been merged and saved to {output_file}" + ) + return concatenated_df + + def run_cell_type_annotation(self): + """Runs cell type annotation""" + ann_method = self.configs["params"]["cell_annotation_method"] + if ann_method == "sargent": + logger.info( + f" Will launch Sargent separately. " + "Please ensure Sargent is installed." + ) + elif ann_method == "cellassign": + from .cellassign import CellAssignPipeline + + cellassign_obj = CellAssignPipeline(configs=self.configs) + cellassign_obj.format_markers_to_df() + cellassign_obj.run_cell_assign() + logger.info( + f" Successfully ran CellAssign on Data." + ) + + elif ann_method == "celltypist": + from .celltypist import CellTypistPipeline + + celltypist_obj = CellTypistPipeline(configs=self.configs) + + celltypist_obj.run_cell_typist() + logger.info( + f" Successfully ran CellTypist on Data." + ) + else: + logger.info( + " Please select a valid cell annotation " + "method. 
options=['cellassign', 'sargent']" + ) + + def package_results(self): + """Packages the results of the pipeline""" + from .package_results import PackageResults + + pack_obj = PackageResults(configs=self.configs) + ann_method = self.configs["params"]["cell_annotation_method"] + if ann_method == "sargent": + results_df, cell_by_gene_df = pack_obj.merge_sargent_output_files() + adata = pack_obj.df_to_adata(results_df, cell_by_gene_df) + pack_obj.save_adata(adata) + logger.info(" Packaged Sargent results") + elif ann_method == "cellassign": + cell_by_gene_df = pack_obj.merge_cellassign_output_files() + results_df = pd.read_csv( + os.path.join(self.cellannotation_results_dir, "merged_results.csv") + ) + adata = pack_obj.df_to_adata(results_df, cell_by_gene_df) + pack_obj.save_adata(adata) + logger.info(" Packaged CellAssign results") + elif ann_method == "celltypist": + cell_by_gene_df = pack_obj.merge_cellassign_output_files() + results_df = pd.read_csv( + os.path.join(self.cellannotation_results_dir, "merged_results.csv") + ) + adata = pack_obj.df_to_adata(results_df, cell_by_gene_df) + pack_obj.save_adata(adata) + logger.info(" Packaged CellTypist results") + else: + logger.info( + f" Please select a valid cell annotation method" + ) + + def run_enact(self): + """Runs ENACT given the user-specified configs""" + if not self.run_synthetic: + # Loading image and getting shape and cropping boundaries (if applicable) + wsi, crop_bounds = self.load_image() + + # Run cell segmentation + if self.segmentation: + wsi_norm = self.normalize_image( + image=wsi, min_percentile=5, max_percentile=95 + ) + cell_labels, cell_polys = self.segment_cells(image=wsi_norm) + cells_gdf = self.convert_stardist_output_to_gdf( + cell_polys=cell_polys, save_path=None + ) + # cells_gdf = pd.read_csv("/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon-demo/enact_results/cells_df.csv") + # cells_gdf["geometry"] = cells_gdf["geometry"].apply(wkt.loads) + # Split the cells geodataframe to chunks + self.split_df_to_chunks( + df=cells_gdf, + x_col="cell_x", + y_col="cell_y", + output_dir=self.cell_chunks_dir, + ) + del cells_gdf, wsi # Clearing memory + else: + # Load the segmentation results from cache + cells_gdf = pd.read_csv(self.cells_df_path) + cells_gdf["geometry"] = cells_gdf["geometry"].apply(wkt.loads) + + # Loading the VisiumHD reads + if self.bin_to_geodataframes: + bins_adata, bin_size = self.load_visiumhd_dataset(crop_bounds) + # Convert VisiumHD reads to geodataframe objects + bins_gdf = self.generate_bins_gdf(bins_adata, bin_size) + # Splitting the bins geodataframe object + self.split_df_to_chunks( + df=bins_gdf, + x_col="pxl_col_in_fullres", + y_col="pxl_row_in_fullres", + output_dir=self.bin_chunks_dir, + ) + del bins_gdf + + # Run bin-to-cell assignment + if self.bin_to_cell_assignment: + bins_adata, bin_size = self.load_visiumhd_dataset(crop_bounds) + self.assign_bins_to_cells(bins_adata) + + # Run cell type annotation + if self.cell_type_annotation: + self.run_cell_type_annotation() + self.package_results() + + else: + # Generating synthetic data + if self.analysis_name in ["xenium", "xenium_nuclei"]: + cells_gdf = pd.read_csv(self.cells_df_path) + self.split_df_to_chunks( + df=cells_gdf, + x_col="cell_x", + y_col="cell_y", + output_dir=self.cell_chunks_dir, + ) + self.assign_bins_to_cells_synthetic() + + +if __name__ == "__main__": + # Creating ENACT object + parser = argparse.ArgumentParser(description="Specify ENACT config file location.") + parser.add_argument( + "--configs_path", 
type=str, required=False, help="Config file location" + ) + args = parser.parse_args() + if not args.configs_path: + configs_path = "config/configs.yaml" + else: + configs_path = args.configs_path + print(f" Loading configurations from {configs_path}") + with open(configs_path, "r") as stream: + configs = yaml.safe_load(stream) + so_hd = ENACT(configs) + so_hd.run_enact() diff --git a/src/eval/cell_annotation_eval.py b/src/eval/cell_annotation_eval.py new file mode 100644 index 0000000..4356132 --- /dev/null +++ b/src/eval/cell_annotation_eval.py @@ -0,0 +1,196 @@ +# Script runs the evaluation to compare ENACT cell annotations versus pathologist cell annotations + +from shapely.geometry import shape +import plotly.express as px +import geopandas as gpd +import json +from shapely.geometry import Polygon, Point +from shapely import wkt +import pandas as pd +from sklearn.metrics import precision_recall_fscore_support, accuracy_score +from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay +import os + +# from src.pipelines.enact_pipeline import ENACT + +# so_hd = ENACT(configs_path="config/configs.yaml") + +geojson_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/Visium_HD_Human_Colon_Cancer-wsi-40598_0_65263_22706.geojson" +segmentation_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/cells_df.csv" +predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_cluster/cellassign_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_cluster/sargent_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_area/cellassign_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_area/sargent_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_gene/sargent_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/naive/sargent_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_gene/cellassign_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/naive/cellassign_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/naive/celltypist_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_area/celltypist_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_gene/celltypist_results/merged_results.csv" +# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_cluster/celltypist_results/merged_results.csv" + + +results_eval_dir = os.path.join("/".join(predictions_df_path.split("/")[:-1]), "eval") +os.makedirs(results_eval_dir, exist_ok=True) + + +name_map = { + 'unclassified': "no label", + 'Immune': "immune cells", + 'Crypt cells': "epithelial cells", + 'Enterocytes': "epithelial cells", + 'Epithelial': "epithelial cells", + 'Smooth muscle cell': "stromal cells", + 'Fibroblast': "stromal cells", + 
'Endothelial': "stromal cells", + 'Paneth cells': "epithelial cells", + 'Enteroendocrine cells': "epithelial cells", + 'Goblet cells': "epithelial cells", + 'Neuronal': "stromal cells", + 'ephitelial cells': "epithelial cells", + 'no label': "no label", + "Ignore*": "no label", + "B cells": "immune cells", + "T cells": "immune cells", + "NK cells": "immune cells", + "Macrophages": "immune cells", + "Neutrophils": "immune cells", + "Eosinophils": "immune cells", + 'CD19+CD20+ B': "immune cells", # B cells are immune cells + 'CD4+ T cells': "immune cells", # CD4+ T cells are immune cells + 'CD8+ T cells': "immune cells", # CD8+ T cells are immune cells + 'CMS1': "epithelial cells", # CMS (Consensus Molecular Subtypes) refer to tumor/epithelial cells + 'CMS2': "epithelial cells", # Same as above + 'CMS3': "epithelial cells", # Same as above + 'CMS4': "epithelial cells", # Same as above + 'Enteric glial cells': "stromal cells", # Glial cells are part of the stromal tissue + 'Goblet cells': "epithelial cells", # Goblet cells are epithelial cells + 'IgA+ Plasma': "immune cells", # Plasma cells are immune cells (B-cell derivatives) + 'IgG+ Plasma': "immune cells", # Same as above + 'Intermediate': "no label", # Ambiguous, no clear label + 'Lymphatic ECs': "stromal cells", # Endothelial cells are considered stromal + 'Mast cells': "immune cells", # Mast cells are immune cells + 'Mature Enterocytes type 1': "epithelial cells", # Enterocytes are epithelial cells + 'Mature Enterocytes type 2': "epithelial cells", # Same as above + 'Myofibroblasts': "stromal cells", # Fibroblasts are stromal cells + 'NK cells': "immune cells", # NK cells are immune cells + 'Pericytes': "stromal cells", # Pericytes are part of the vasculature (stromal) + 'Pro-inflammatory': "immune cells", # Inflammation implies immune function + 'Proliferating': "no label", # Too vague to classify, no label + 'Proliferative ECs': "stromal cells", # Endothelial cells are stromal + 'Regulatory T cells': "immune cells", # T cells are immune cells + 'SPP1+': "no label", # Ambiguous, no clear label + 'Smooth muscle cells': "stromal cells", # Smooth muscle cells are stromal cells + 'Stalk-like ECs': "stromal cells", # Endothelial cells are stromal + 'Stem-like/TA': "epithelial cells", # Stem cells in this context are usually epithelial + 'Stromal 1': "stromal cells", # Explicitly stromal + 'Stromal 2': "stromal cells", # Same as above + 'Stromal 3': "stromal cells", # Same as above + 'T follicular helper cells': "immune cells", # T cells are immune cells + 'T helper 17 cells': "immune cells", # Same as above + 'Tip-like ECs': "stromal cells", # Endothelial cells are stromal + 'Unknown': "no label", # No clear label + 'cDC': "immune cells", # Conventional dendritic cells are immune cells + 'gamma delta T cells': "immune cells" # T cells are immune cells +} + + +segmentation_df = pd.read_csv(segmentation_df_path) +predictions_df = pd.read_csv(predictions_df_path) +predictions_df = predictions_df.merge(segmentation_df[["id", "geometry"]], how="left", on="id") +predictions_df["geometry"] = predictions_df["geometry"].apply(wkt.loads) +pred_gpd = gpd.GeoDataFrame(predictions_df,geometry="geometry") + +def load_path_annotations(): + annotation_names = [] + annotation_geometries = [] + with open(geojson_path) as f: + regions = json.load(f) + for region in regions["features"]: + ann_type = region["properties"]["objectType"] + if ann_type == "annotation": + annotation_name = region["properties"]["classification"]["name"] + if annotation_name in 
["Region*"]: + continue + annotation_geometries.append(shape(region["geometry"])) + annotation_names.append(annotation_name) + annotations_gpd = gpd.GeoDataFrame({"geometry": annotation_geometries, "gt_label": annotation_names}) + annotations_gpd["ann_ix"] = [f"ID_{i}" for i in range(len(annotations_gpd))] + return annotations_gpd + +def get_gt_annotations(annotations_gpd): + try: + cells_within_ann_gpd = gpd.sjoin(annotations_gpd, pred_gpd[["cell_type", "cell_x", "cell_y", "geometry", "id"]], how='left', predicate='intersects') + except: + cells_within_ann_gpd = gpd.sjoin(annotations_gpd, pred_gpd[["cell_assign_results", "cell_x", "cell_y", "geometry", "id"]], how='left', predicate='intersects') + cells_within_ann_gpd = cells_within_ann_gpd.drop_duplicates("ann_ix") + try: + cells_within_ann_gpd["cell_type"] = cells_within_ann_gpd["cell_type"].fillna("unclassified") + except: + cells_within_ann_gpd["cell_assign_results"] = cells_within_ann_gpd["cell_assign_results"].fillna("unclassified") + return cells_within_ann_gpd + +def validate_labels(cells_within_ann_gpd): + try: + cell_types_in_pred = set(cells_within_ann_gpd.cell_type.unique()) + except: + cell_types_in_pred = set(cells_within_ann_gpd.cell_assign_results.unique()) + print(f"Cells in pred dataset: {cell_types_in_pred}") + print (f"All cells are in the mapping!: {cell_types_in_pred.issubset(set(name_map.keys()))}") + +def relabel_cells(cells_within_ann_gpd): + # Renaming cell types + for granular_name, generic_name in name_map.items(): + cells_within_ann_gpd.loc[cells_within_ann_gpd.gt_label == granular_name, "gt_label"] = generic_name + try: + cells_within_ann_gpd.loc[cells_within_ann_gpd.cell_type == granular_name, "pred_label_clean"] = generic_name + except: + cells_within_ann_gpd.loc[cells_within_ann_gpd.cell_assign_results == granular_name, "pred_label_clean"] = generic_name + return cells_within_ann_gpd + +def eval_annotations(results_table): + cell_types = sorted(set(results_table.gt_label.unique().tolist() + results_table.pred_label_clean.unique().tolist())) + cm = confusion_matrix( + results_table.gt_label, + results_table.pred_label_clean, + labels=cell_types + ) + cm_plot = ConfusionMatrixDisplay( + confusion_matrix=cm, + display_labels=cell_types + ) + cm_plot.plot() + + averaging_methods = ["micro", "macro", "weighted"] + eval_dict = {} + for method in averaging_methods: + eval_metrics = precision_recall_fscore_support(results_table.gt_label, results_table.pred_label_clean, average=method) + precision, recall, fbeta_score, support = eval_metrics + eval_dict[method] = eval_metrics + num_correct_samples = accuracy_score(results_table.gt_label, results_table.pred_label_clean, normalize=False) + accuracy = accuracy_score(results_table.gt_label, results_table.pred_label_clean, normalize=True) + print(f"Experiment name: {predictions_df_path}") + print (f"Number of GT annotations: {len(results_table)}\nNumber of correct predictions: {num_correct_samples}\nAccuracy: {accuracy}") + print("__________") + try: + print(pd.DataFrame(results_table.cell_type.value_counts())) + except: + print(pd.DataFrame(results_table.cell_assign_results.value_counts())) + print("__________") + print(pd.DataFrame(results_table.pred_label_clean.value_counts())) + print("__________") + metrics_df = pd.DataFrame(eval_dict, index=["Precision", "Recall", "F-Score", "Support"]) + results_table.to_csv(os.path.join(results_eval_dir, "cell_annotation_eval.csv"), index=False) + metrics_df.to_csv(os.path.join(results_eval_dir, 
"cell_annotation_eval_metrics.csv"), index=True) + cm_plot.figure_.savefig(os.path.join(results_eval_dir, "confusion_matrix.png"),dpi=300) + print (metrics_df) + return results_table, metrics_df + +if __name__ == "__main__": + annotations_gpd = load_path_annotations() + cells_within_ann_gpd = get_gt_annotations(annotations_gpd) + validate_labels(cells_within_ann_gpd) + cells_within_ann_gpd = relabel_cells(cells_within_ann_gpd) + results_table = cells_within_ann_gpd[(cells_within_ann_gpd["gt_label"] != "no label")] + results_table, metrics_df = eval_annotations(results_table) \ No newline at end of file diff --git a/src/eval/paper_eval-cellassign-methods-highlevel.ipynb b/src/eval/paper_eval-cellassign-methods-highlevel.ipynb new file mode 100644 index 0000000..e13b28b --- /dev/null +++ b/src/eval/paper_eval-cellassign-methods-highlevel.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "8cd6edbb-e2e7-4474-9b6c-05203f97e7dc", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install shapely\n", + "# !pip install plotly\n", + "!pip install geopandas" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "74ee8420-da7a-4d00-91d9-e4273e83d21f", + "metadata": {}, + "outputs": [], + "source": [ + "from shapely.geometry import shape\n", + "import plotly.express as px\n", + "import geopandas as gpd\n", + "import json\n", + "from shapely.geometry import Polygon, Point\n", + "from shapely import wkt\n", + "import pandas as pd\n", + "from sklearn.metrics import precision_recall_fscore_support, accuracy_score\n", + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "6f641cb0-1d6a-4cfb-bc35-2fa58302b28f", + "metadata": {}, + "outputs": [], + "source": [ + "# geojson_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/Visium_HD_Human_Colon_Cancer-wsi-40598_0_65263_22706-landmarks.geojson\"\n", + "geojson_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/Visium_HD_Mouse_Small_Intestine-wsi-156_4_23459_24009_all_for_one.geojson\"\n", + "\n", + "segmentation_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/stardist_cells_df.csv\"\n", + "results_eval_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/mouse_anatomical_landmark_eval\"\n", + "os.makedirs(results_eval_dir, exist_ok=True)\n", + "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/Sargent+naive.csv\"\n", + "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/Sargent+weighted.csv\"\n", + "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/cellassign+weighted.csv\"\n", + "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/cellassign+naive.csv\"\n", + "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/Sargent+weighted-full.csv\"\n", + "predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/sargent+weighted+mouse.csv\"\n", + "\n", + "\n", + "method = predictions_df_path.split(\"/\")[-1].split(\".\")[0]\n", + "\n", + "segmentation_df = pd.read_csv(segmentation_df_path)\n", + "predictions_df = pd.read_csv(predictions_df_path)\n", + "predictions_df = predictions_df.merge(segmentation_df[[\"id\", \"geometry\"]], how=\"left\", on=\"id\")\n", + "predictions_df = 
predictions_df[~predictions_df.geometry.isna()]\n", + "try:\n", + " predictions_df[\"geometry\"] = predictions_df[\"geometry\"].apply(wkt.loads)\n", + "except:\n", + " pass\n", + "pred_gpd = gpd.GeoDataFrame(predictions_df,geometry=\"geometry\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "9d745a36-0221-450f-9c35-b5c099a8d189", + "metadata": {}, + "outputs": [], + "source": [ + "annotation_names = []\n", + "annotation_geometries = []\n", + "with open(geojson_path) as f:\n", + " regions = json.load(f)\n", + "for region in regions[\"features\"]:\n", + " ann_type = region[\"properties\"][\"objectType\"]\n", + " if ann_type == \"annotation\":\n", + " annotation_name = region[\"properties\"][\"classification\"][\"name\"]\n", + " if annotation_name in [\"Region*\"]:\n", + " continue\n", + " annotation_geometries.append(shape(region[\"geometry\"]))\n", + " annotation_names.append(annotation_name)\n", + "annotations_gpd = gpd.GeoDataFrame({\"geometry\": annotation_geometries, \"label\": annotation_names})\n", + "annotations_gpd[\"ann_ix\"] = [f\"ID_{i}\" for i in range(len(annotations_gpd))]\n", + "cells_within_ann_gpd = gpd.sjoin(pred_gpd[[\"cell_type\", \"cell_x\", \"cell_y\", \"geometry\", \"id\"]], annotations_gpd, how='left', predicate='within')\n", + "cells_within_ann_gpd = cells_within_ann_gpd.drop_duplicates(subset=[\"id\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "099d30dc-a404-4662-8fec-7b0275079e42", + "metadata": {}, + "outputs": [], + "source": [ + "for annotation_name in annotation_names:\n", + " df = cells_within_ann_gpd[cells_within_ann_gpd.label == annotation_name]\n", + " # df = df[~(df.cell_type == \"unclassified\")]\n", + " df = df.groupby([\"cell_type\"]).agg(\"count\").reset_index()\n", + " df = df.sort_values(\"id\", ascending=False)\n", + " fig = px.bar(df, x='cell_type', y='id', title=f\"Region: {annotation_name}\")\n", + " fig.update_layout(\n", + " xaxis_title=\"cell type\", yaxis_title=\"# cells\"\n", + " )\n", + " fig.show()\n", + " fig.write_html(os.path.join(results_eval_dir, f\"{method}_{annotation_name}_cell_counts.html\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a3204e-89bd-4e6b-b14a-ea98a8fe9f5d", + "metadata": {}, + "outputs": [], + "source": [ + "results_eval_dir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "159a4cc2-32a4-49d9-9a87-f186a0d255de", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..3471035 --- /dev/null +++ b/src/main.py @@ -0,0 +1,25 @@ +""" +Created By : ... +Created Date: DD/MM/YYYY +Description : ... +""" + +import argparse +from utils.logging import get_logger + + +APP_NAME = 'MyProject' +LOGGER = get_logger(APP_NAME) + + +def dummy(dum): + """Example function + + :param dum: Text to log. + :type number: str + :return: The entry text. 
+ :rtype: str + """ + LOGGER.info(f'{dum} in progress') + return dum + diff --git a/src/synthetic_data/generate_synthetic_data_Xenium.ipynb b/src/synthetic_data/generate_synthetic_data_Xenium.ipynb new file mode 100644 index 0000000..c14a4fc --- /dev/null +++ b/src/synthetic_data/generate_synthetic_data_Xenium.ipynb @@ -0,0 +1,876 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "018887fd-e9e8-495b-872f-fefbd9cd6cb5", + "metadata": {}, + "source": [ + "To generate synthetic VisiumHD data from Xenium, please read and run all the cells below. Thanks!" + ] + }, + { + "cell_type": "markdown", + "id": "bd0c610b-e1b5-43e6-a35d-3548588cb652", + "metadata": {}, + "source": [ + "## Download Xenium output from 10X website\n", + "Paste the URL for the binned_outputs.tar.gz for the sample you want to analyze.\n", + "\n", + "1. Go to Xenium public datasets page:https://www.10xgenomics.com/datasets?query=&page=1&configure%5BhitsPerPage%5D=50&configure%5BmaxValuesPerFacet%5D=1000&refinementList%5Bproduct.name%5D%5B0%5D=In%20Situ%20Gene%20Expression&refinementList%5Bspecies%5D%5B0%5D=Human&refinementList%5BdiseaseStates%5D%5B0%5D=colorectal%20cancer\n", + "\n", + "2. Select sample to analyze scrolling down to downloads section, click \"Batch download\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f721b2b-4314-4528-9c01-185726147728", + "metadata": {}, + "outputs": [], + "source": [ + "import zipfile\n", + "xenium_outputs_url = \"https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_Human_Colorectal_Cancer_Addon_FFPE/Xenium_V1_Human_Colorectal_Cancer_Addon_FFPE_outs.zip\"\n", + "# Step 1: Download the raw Xenium output\n", + "!curl -O {xenium_outputs_url}\n", + "\n", + "# Extract the ZIP file\n", + "zip_file_path = xenium_outputs_url.split(\"/\")[-1]\n", + "\n", + "with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n", + " zip_ref.extractall(\"extracted_files\")\n", + "\n", + "print(\"Extraction completed.\")" + ] + }, + { + "cell_type": "markdown", + "id": "a9fcd48a-2f55-43b4-befd-8d646ea634cf", + "metadata": {}, + "source": [ + "### Install prerequisite libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7453e3e3-a55c-47fb-ab83-2c3743833b89", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install --upgrade pip\n", + "!pip install scipy\n", + "!pip install shapely\n", + "!pip install tifffile\n", + "!pip install plotly\n", + "!pip install tensorflow-gpu==2.10.0\n", + "!pip install stardist\n", + "!pip install geopandas\n", + "!pip install scanpy\n", + "!pip install fastparquet\n", + "!pip install opencv-python\n", + "!pip install geojson\n", + "!pip install scikit-learn" + ] + }, + { + "cell_type": "markdown", + "id": "1f79fb2c-0fd9-4bd4-8be9-4d1bd04d8733", + "metadata": {}, + "source": [ + "### Import Relevant Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16e4dc02-2b8d-4e00-9cbd-8a4d151ca5af", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "import geopandas as gpd # Geopandas for storing Shapely objects\n", + "from matplotlib.colors import ListedColormap\n", + "import matplotlib.pyplot as plt\n", + "import scanpy as sc\n", + "import pandas as pd\n", + "from scipy import sparse\n", + "import anndata\n", + "import os\n", + "import gzip\n", + "import numpy as np\n", + "import re\n", + "import shapely\n", + "from shapely.geometry import Polygon, Point # Representing bins and cells as Shapely Polygons and Point 
objects\n", + "from shapely import wkt" + ] + }, + { + "cell_type": "markdown", + "id": "46a8d90a-65dd-4e93-b4e2-4a257d6e1dc7", + "metadata": {}, + "source": [ + "### Load Cell & Transcripts Info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "feb54b91-6757-467c-81d3-7a4f6916fcda", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the transcript data\n", + "transcripts_path = \"extracted_files/transcripts.csv.gz\"\n", + "with gzip.open(transcripts_path, 'rt') as f:\n", + " transcripts_df = pd.read_csv(f)\n", + "\n", + "# Load cell info\n", + "cells_path = \"extracted_files/cells.csv.gz\"\n", + "with gzip.open(cells_path, 'rt') as f:\n", + " cells_df = pd.read_csv(f)\n" + ] + }, + { + "cell_type": "markdown", + "id": "ccac1dea-7855-4af4-8989-c2b63deed2f1", + "metadata": {}, + "source": [ + "### Load Cell Boundary Info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25d2bdf0-8871-4bb0-a38e-3f9c31c7b3ea", + "metadata": {}, + "outputs": [], + "source": [ + "import zarr\n", + "\n", + "zarr_file = zarr.open('extracted_files/cells.zarr.zip', mode='r')\n", + "print(zarr_file.tree())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c092c013-dd0d-47f5-a6cc-3491f1f62dfe", + "metadata": {}, + "outputs": [], + "source": [ + "file = zarr_file['polygon_sets/0/vertices'][:]\n", + "# 1 is whole cell, 0 is nucleus" + ] + }, + { + "cell_type": "markdown", + "id": "0da5ff74-6269-42b4-9a9e-604f520a7528", + "metadata": { + "tags": [] + }, + "source": [ + "### Create folders to store synthetic data" + ] + }, + { + "cell_type": "markdown", + "id": "a6839176-4a75-4f1e-b4f7-13899b946963", + "metadata": {}, + "source": [ + "For both the `seqfish_dir` and `enact_data_dir`, change `\"/home/oneai/\"` to the directory that stores this repo." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ec69f53-4a93-491a-b6f0-652b27ffaaf1", + "metadata": {}, + "outputs": [], + "source": [ + "xenium_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/synthetic_data/xenium\" # Update it to the directory where you want to save the synthetic data\n", + "enact_data_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/xenium_nuclei/chunks\" # Directory that saves all the input and results of the enact pipeline, \n", + "# should end with \"oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/chunks\"\n", + "\n", + "transcripts_df_chunks_dir = os.path.join(xenium_dir, \"transcripts_patches\") # Directory to store the files that contain the transcripts info for each chunk\n", + "output_dir = os.path.join(enact_data_dir, \"bins_gdf\") # Directory to store the results of gene-to-bin assignment for each chunk\n", + "cells_df_chunks_dir = os.path.join(enact_data_dir,\"cells_gdf\") \n", + "ground_truth_dir = os.path.join(xenium_dir, \"ground_truth_nuclei\")\n", + "\n", + "# Making relevant directories\n", + "os.makedirs(xenium_dir, exist_ok=True)\n", + "os.makedirs(enact_data_dir, exist_ok=True)\n", + "os.makedirs(transcripts_df_chunks_dir, exist_ok=True)\n", + "os.makedirs(output_dir, exist_ok=True)\n", + "os.makedirs(cells_df_chunks_dir, exist_ok=True)\n", + "os.makedirs(ground_truth_dir, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "id": "dafe70a1-ed23-4cb6-a7b6-d35e4c01f895", + "metadata": {}, + "source": [ + "### Generate Synthetic VesiumHD Dataset" + ] + }, + { + "cell_type": "markdown", + "id": "5bdd8461-7bcc-4101-b26b-765daf975916", + "metadata": { + "tags": [] + }, + "source": [ + "#### Break transcripts df to patches (based on location)" + ] + }, + { + "cell_type": "markdown", + "id": "042b4ce0-30d1-4c23-9b2d-0622db0a4f8c", + "metadata": {}, + "source": [ + "Break transcripts df to patches of size 1000um x 1000um (larger patch size may result in memory issue)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60fb886a-5893-40ba-b187-650d6cfb4ed6", + "metadata": {}, + "outputs": [], + "source": [ + "# patch size: 1000 um x 1000 um\n", + "\n", + "patch_size = 1000\n", + "\n", + "# patch indices\n", + "transcripts_df['x_patch'] = (transcripts_df['x_location'] // patch_size).astype(int)\n", + "transcripts_df['y_patch'] = (transcripts_df['y_location'] // patch_size).astype(int)\n", + "transcripts_df[\"patch_id\"] = transcripts_df[\"x_patch\"].astype(str) + \"_\" + transcripts_df[\"y_patch\"].astype(str)\n", + "\n", + "# Create a df for each patch\n", + "grouped = transcripts_df.groupby(['x_patch', 'y_patch'])\n", + "for (x_patch, y_patch), group in grouped:\n", + " # Calculate the start and end locations for each patch\n", + " # x_start = x_patch * patch_size\n", + " # x_end = (x_patch + 1) * patch_size\n", + " # y_start = y_patch * patch_size\n", + " # y_end = (y_patch + 1) * patch_size\n", + " \n", + " filename = f\"patch_{x_patch}_{y_patch}.csv\"\n", + " output_loc = os.path.join(transcripts_df_patch_dir , filename)\n", + " group.to_csv(output_loc)\n", + "\n", + " print(f\"Saved {filename}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a7bbc9ec-675b-4b25-8448-334ed317798a", + "metadata": { + "tags": [] + }, + "source": [ + "#### Generate synthetic vesiumHD for each patch" + ] + }, + { + "cell_type": "markdown", + "id": "ceebc8fd-88e9-4b14-8470-a474085dee64", + "metadata": {}, + "source": [ + "Each patch is broken into bins of size 2um x 2um. 
The synthetic data contains transcript counts orgnized by bin_id. Each row contains transcript counts for a unique bin. Bins with no transcript counts is not included. \n", + "\n", + "In addition to all the gene features, there are two additional columns represent the row number and column number of the bin, and a column contains the Shapely polygon item that represents the bin. The first column is the bin_id." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d19155a0-5646-49bd-915c-94737e251bb0", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_synthetic_VesiumHD_data(transcripts_df, bin_size=2, whole_cell=True, QScore20=True):\n", + " filtered_df = transcripts_df.copy()\n", + " # only count transcripts in the nucleus\n", + " if not whole_cell:\n", + " filtered_df = transcripts_df[transcripts_df['overlaps_nucleus'] == 1].copy()\n", + " \n", + " only count transcripts with QScore >= 20\n", + " if QScore20:\n", + " filtered_df = filtered_df[filtered_df['qv'] >= 20].copy()\n", + " \n", + " # assigne bin to each transcript\n", + " filtered_df.loc[:, 'row'] =np.ceil(filtered_df['y_location'] / bin_size).astype(int)\n", + " filtered_df.loc[:, 'column'] = np.ceil(filtered_df['x_location'] / bin_size).astype(int)\n", + " filtered_df.loc[:, 'assigned_bin_id'] = filtered_df.apply(\n", + " lambda row: f\"{bin_size}um_\" + str(row['row']).zfill(5) +\"_\"+ str(row['column']).zfill(5),\n", + " axis=1)\n", + " \n", + " bin_coordinates = filtered_df[['assigned_bin_id', 'row', 'column']].drop_duplicates().set_index('assigned_bin_id')\n", + " bin_gene_matrix = filtered_df.groupby(['assigned_bin_id', 'feature_name']).size().unstack(fill_value=0)\n", + " bin_gene_matrix_with_coords = bin_gene_matrix.merge(bin_coordinates, left_index=True, right_index=True)\n", + " \n", + " return bin_gene_matrix_with_coords" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd804c49-dc85-4fa9-85d4-a621cf0598ae", + "metadata": {}, + "outputs": [], + "source": [ + "# Extract row and column number from the bin_id\n", + "def extract_numbers(entry):\n", + " match = re.search(r'_(\\d{5})_(\\d{5})', entry)\n", + " if match:\n", + " number1 = int(match.group(1).lstrip('0')) \n", + " number2 = int(match.group(2).lstrip('0')) \n", + " return number2*2-1, number1*2-1\n", + " else:\n", + " return None, None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8d45c22-2776-4b80-a29b-37d07f6b06c5", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "def generate_bin_polys(bins_df, x_col, y_col, bin_size):\n", + " \"\"\"Represents the bins as Shapely polygons\n", + "\n", + " Args:\n", + " bins_df (pd.DataFrame): bins dataframe\n", + " x_col (str): column with the bin centre x-coordinate\n", + " y_col (str): column with the bin centre y-coordinate\n", + " bin_size (int): bin size in pixels\n", + "\n", + " Returns:\n", + " list: list of Shapely polygons\n", + " \"\"\"\n", + " geometry = []\n", + " # Generates Shapely polygons to represent each bin\n", + "\n", + " if True:\n", + " half_bin_size = bin_size / 2\n", + " bbox_coords = pd.DataFrame(\n", + " {\n", + " \"min_x\": bins_df[x_col] - half_bin_size,\n", + " \"min_y\": bins_df[y_col] - half_bin_size,\n", + " \"max_x\": bins_df[x_col] + half_bin_size,\n", + " \"max_y\": bins_df[y_col] + half_bin_size,\n", + " }\n", + " )\n", + " geometry = [\n", + " shapely.geometry.box(min_x, min_y, max_x, max_y)\n", + " for min_x, min_y, max_x, max_y in tqdm(\n", + " zip(\n", + " 
bbox_coords[\"min_x\"],\n",
+    "                    bbox_coords[\"min_y\"],\n",
+    "                    bbox_coords[\"max_x\"],\n",
+    "                    bbox_coords[\"max_y\"],\n",
+    "                ),\n",
+    "                total=len(bins_df),\n",
+    "            )\n",
+    "        ]\n",
+    "\n",
+    "    return geometry"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f1c4071-ff50-4ec1-bd0d-37c8ddecaa54",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Loop through all the transcripts_df chunks and generate gene-to-bin assignments \n",
+    "patch_size = 1000\n",
+    "bin_size = 2\n",
+    "transcripts_df_chunks = os.listdir(transcripts_df_patch_dir)\n",
+    "for chunk_fname in transcripts_df_chunks:\n",
+    "    output_loc = os.path.join(output_dir, chunk_fname)\n",
+    "    # if os.path.exists(output_loc):\n",
+    "    #     continue\n",
+    "    if chunk_fname in [\".ipynb_checkpoints\"]:\n",
+    "        continue\n",
+    "    transcripts_df_chunk = pd.read_csv(os.path.join(transcripts_df_patch_dir, chunk_fname))\n",
+    "    bin_df_chunk = generate_synthetic_VesiumHD_data(transcripts_df_chunk, bin_size, whole_cell=True, QScore20=True)\n",
+    "    bin_df_chunk['column'] = bin_df_chunk['column']*2-1\n",
+    "    bin_df_chunk['row'] = bin_df_chunk['row']*2-1\n",
+    "    bin_df_chunk['geometry'] = generate_bin_polys(bin_df_chunk, 'column', 'row', 2)\n",
+    "    bin_gdf_chunk = gpd.GeoDataFrame( bin_df_chunk, geometry = bin_df_chunk['geometry'])\n",
+    "    bin_gdf_chunk.to_csv(output_loc)\n",
+    "    print(f\"Successfully assigned transcripts to bins for {chunk_fname}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "105e310d-2a9d-41b5-9450-23ab3e57e7f7",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Generate cell_gdf as ENACT pipeline input"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "428d33fd-45be-4dde-b4b9-acc3de13f9e0",
+   "metadata": {},
+   "source": [
+    "This section generates the cell_df patches required to run the ENACT pipeline. The main purpose is to create Shapely polygons that represent the cell outlines."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a4bff77-1b7a-4921-a4c2-0b66cf800468",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_polygons(coords_array):\n",
+    "    polygons = []\n",
+    "    for row in coords_array:\n",
+    "        reshaped_coords = row.reshape(-1, 2)\n",
+    "        polygon = Polygon(reshaped_coords)\n",
+    "        polygons.append(polygon)\n",
+    "    return polygons\n",
+    "\n",
+    "# Create the polygons\n",
+    "polygons = create_polygons(file)\n",
+    "cells_df['polygons'] = polygons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22875d42-5489-4ed0-b370-d693f26318e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cell_gdf_chunk = gpd.GeoDataFrame(cells_df, geometry = cells_df['polygons'])\n",
+    "cell_gdf_chunk.rename(columns={'x_centroid': 'cell_x', 'y_centroid': 'cell_y'}, inplace=True)\n",
+    "cell_gdf_chunk.drop(\"Unnamed: 0\", axis=1, inplace=True)\n",
+    "cell_gdf_chunk[['cell_id','cell_x','cell_y','geometry']].to_csv(os.path.join(enact_data_dir, \"cells_gdf\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e38a13d-3dfa-45e2-abc3-1b40c382a1db",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Run ENACT bin-to-cell pipeline\n",
+    "In the configs.yaml file: \n",
+    "\n",
+    "    Set \"analysis_name\" in the configs.yaml file to \"xenium\" (or \"xenium_nuclei\").\n",
+    "    Set \"run_synthetic\" to True and all other steps to False.\n",
+    "    Set \"bin_to_cell_method\" to one of these four: \"naive\", \"weighted_by_area\", \"weighted_by_gene\", or \"weighted_by_cluster\"\n",
+    "\n",
+    "Run `make run_enact`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ae8aa8e-0a17-48ae-86ed-81a04ec203dc",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Generate Ground Truth"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "670974eb-8dae-4d67-b735-1cd53858d560",
+   "metadata": {},
+   "source": [
+    "The following cell generates and saves the ground truth of the synthetic VisiumHD data, used to evaluate the bin-to-cell assignment methods. Each row of the ground truth dataframe contains the transcript counts of one cell, and each column represents a gene feature (the gene feature name is also the column name).\n",
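+    "\n",
+    "For illustration only (the gene names and counts below are hypothetical), an equivalent table could be built as:\n",
+    "\n",
+    "```python\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Hypothetical sketch of the ground truth layout: one row per cell, one column per gene\n",
+    "example_ground_truth = pd.DataFrame(\n",
+    "    {\"GeneA\": [3, 0], \"GeneB\": [0, 5], \"GeneC\": [1, 2]},\n",
+    "    index=pd.Index([\"cell_1\", \"cell_2\"], name=\"cell_id\"),\n",
+    ")\n",
+    "```"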
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8224ea02-5701-450c-9efb-c38de7492764",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Generate Cell-gene matrix for evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f23be59-86ef-4ed0-b9fd-b22b203fa769",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_ground_truth_table(transcripts_df, cells_df, whole_cell=True, QScore20=True, include_unassigned_transcript=False):\n",
+    "    filtered_df = transcripts_df\n",
+    "    \n",
+    "    # only count transcripts in the nucleus\n",
+    "    if not whole_cell:\n",
+    "        filtered_df = transcripts_df[transcripts_df['overlaps_nucleus'] == 1]\n",
+    "    \n",
+    "    # only count transcripts with QScore >= 20\n",
+    "    if QScore20:\n",
+    "        filtered_df = filtered_df[filtered_df['qv'] >= 20]\n",
+    "    \n",
+    "    # only count transcripts that are assigned to specific cells\n",
+    "    if not include_unassigned_transcript:\n",
+    "        filtered_df = filtered_df[filtered_df['cell_id'] != 'UNASSIGNED']\n",
+    "    \n",
+    "    pivot_df = filtered_df.pivot_table(index='cell_id', columns='feature_name', aggfunc='size', fill_value=0)\n",
+    "    \n",
+    "    merged_df = pivot_df.merge(cells_df[['cell_id']], left_index=True, right_on='cell_id', how='right')\n",
+    "    columns = ['cell_id'] + [col for col in merged_df.columns if col not in ['cell_id', 'x_centroid', 'y_centroid','polygons']]\n",
+    "    merged_df = merged_df[columns]\n",
+    "    merged_df.set_index('cell_id', inplace=True)\n",
+    "    #merged_df['total_gene_counts'] = merged_df.iloc[:, 3:].sum(axis=1)\n",
+    "    \n",
+    "    return merged_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "389f2644-5496-4286-961c-fa74ea32e97f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bin_size = 2\n",
+    "cell_df_chunks = os.listdir(cells_df_chunks_dir)\n",
+    "for chunk_fname in cell_df_chunks:\n",
+    "    output_loc = os.path.join(ground_truth_dir,chunk_fname)\n",
+    "    if os.path.exists(output_loc):\n",
+    "        continue\n",
+    "    if chunk_fname in [\".ipynb_checkpoints\"]:\n",
+    "        continue\n",
+    "    cell_df_chunk = pd.read_csv(os.path.join(cells_df_chunks_dir, chunk_fname))\n",
+    "    groundtruth_chunk = generate_ground_truth_table(transcripts_df, cell_df_chunk, whole_cell=False, QScore20=False, include_unassigned_transcript=False)\n",
+    "    groundtruth_chunk.to_csv(output_loc)\n",
+    "    print(f\"Successfully generated ground truth for {chunk_fname}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f7a648b8-c2d9-4489-951e-dc0c443b489d",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Evaluation of ENACT bin-to-cell results"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3759f86f-8498-41b1-a7ea-ca934b102d22",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Overall precision, recall, and F1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20f300f2-73fb-4c86-9bf0-704f053d5299",
+   "metadata": {},
+   "source": [
+    "Run this section for each method you have run with ENACT; change 'method' in the cell below to the one you want to evaluate.\n",
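+    "\n",
+    "As a rough sketch of the patch-level metrics computed in the next cell (counts are compared element-wise between the generated and ground-truth cell-by-gene matrices; the toy arrays below are hypothetical):\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "generated = np.array([[3, 0], [1, 2]])      # hypothetical 2 cells x 2 genes\n",
+    "ground_truth = np.array([[2, 1], [1, 2]])\n",
+    "\n",
+    "tp = np.sum(np.minimum(generated, ground_truth))     # counts present in both matrices\n",
+    "precision = tp / np.sum(generated)                   # fraction of generated counts that are correct\n",
+    "recall = tp / np.sum(ground_truth)                   # fraction of ground-truth counts recovered\n",
+    "f1 = 2 * precision * recall / (precision + recall)\n",
+    "```"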
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5061ee46-1591-4a96-8643-5e96d7c55a44", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "method = \"weighted_by_cluster\"\n", + "results_dir = os.path.join(enact_data_dir, method, \"bin_to_cell_assign\")\n", + "\n", + "# Initialize variables to accumulate weighted precision, recall, and F1\n", + "total_cells = 0\n", + "precision_sum = 0\n", + "recall_sum = 0\n", + "missing_cells_count = 0\n", + "total_cells_count = 0\n", + "results_chunks = os.listdir(results_dir)\n", + "\n", + "for chunk_fname in results_chunks:\n", + " if chunk_fname in [\".ipynb_checkpoints\"]:\n", + " continue\n", + "\n", + " generated = pd.read_csv(os.path.join(results_dir, chunk_fname))\n", + " ground_truth = pd.read_csv(os.path.join(ground_truth_dir, chunk_fname))\n", + " if len(generated) ==0:\n", + " print(chunk_fname)\n", + " continue\n", + " generated.rename(columns={'id': 'cell_id'}, inplace=True)\n", + " \n", + " # Align both dataframes by 'cell_id', filling missing cells in generated with 0\n", + " merged = pd.merge(ground_truth, generated, on='cell_id', how='left', suffixes=('_gt', '_gen')).fillna(0)\n", + " num_cells = (ground_truth.iloc[:, 1:] != 0).any(axis=1).sum()\n", + " missing_cells_count += num_cells - len(generated)\n", + " total_cells_count += num_cells\n", + "\n", + " ground_truth_aligned = merged.filter(like='_gt').values\n", + " generated_aligned = merged.filter(like='_gen').values\n", + " assert ground_truth_aligned.shape == generated_aligned.shape, \"Aligned matrices must have the same shape!\"\n", + "\n", + " num_cells = ground_truth_aligned.shape[0]\n", + "\n", + " # Compute precision for the current patch\n", + " patch_precision = np.sum(np.minimum(generated_aligned, ground_truth_aligned)) / np.sum(generated_aligned)\n", + "\n", + " # Compute recall for the current patch\n", + " patch_recall = np.sum(np.minimum(generated_aligned, ground_truth_aligned)) / np.sum(ground_truth_aligned)\n", + "\n", + " # F1 score for the current patch\n", + " if patch_precision + patch_recall > 0:\n", + " patch_f1 = 2 * (patch_precision * patch_recall) / (patch_precision + patch_recall)\n", + " else:\n", + " patch_f1 = 0\n", + "\n", + " # Accumulate the weighted precision, recall, and number of aligned cells\n", + " precision_sum += patch_precision * num_cells\n", + " recall_sum += patch_recall * num_cells\n", + " total_cells += num_cells\n", + " \n", + "# Compute overall weighted precision, recall, and F1 score\n", + "overall_precision = precision_sum / total_cells\n", + "overall_recall = recall_sum / total_cells\n", + "\n", + "if overall_precision + overall_recall > 0:\n", + " overall_f1_score = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall)\n", + "else:\n", + " overall_f1_score = 0 \n", + "\n", + "# Print results\n", + "print(f\"Overall Precision: {overall_precision}\")\n", + "print(f\"Overall Recall: {overall_recall}\")\n", + "print(f\"Overall F1 Score: {overall_f1_score}\")\n", + "print(f\"Total missing cells in the generated data compared to ground truth: {missing_cells_count}\")\n", + "print(f\"Total cells : {total_cells_count}\")" + ] + }, + { + "cell_type": "markdown", + "id": "eef397d4-ce75-4459-869e-7141fb72ba79", + "metadata": { + "tags": [] + }, + "source": [ + "#### Visualize the distribution using violin plots " + ] + }, + { + "cell_type": "markdown", + "id": "e0b763a9-4dce-48c3-9e43-40d2fbfd7c88", + "metadata": {}, + "source": [ 
+ "The following cells would create violin plots for all four methods in order to better compare the results. You can choose to only compare the ones you have run by changing the 'methods' list below to only include those." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5e2326d-d85e-4075-afa5-2edf492eef0b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Define methods and their directories\n", + "methods = [\n", + " {\n", + " 'name': 'Naive',\n", + " 'results_dir': os.path.join(enact_data_dir, \"naive\", \"bin_to_cell_assign\"), \n", + " 'ground_truth_dir':ground_truth_dir\n", + " },\n", + " {\n", + " 'name': 'Weighted_by_area',\n", + " 'results_dir': os.path.join(enact_data_dir, \"weighted_by_area\", \"bin_to_cell_assign\"), \n", + " 'ground_truth_dir':ground_truth_dir\n", + " },\n", + " {\n", + " 'name': 'Weighted_by_gene',\n", + " 'results_dir': os.path.join(enact_data_dir, \"weighted_by_gene\", \"bin_to_cell_assign\"), \n", + " 'ground_truth_dir': ground_truth_dir\n", + " },\n", + " {\n", + " 'name': 'Weighted_by_cluster',\n", + " 'results_dir': os.path.join(enact_data_dir, \"weighted_by_cluster\", \"bin_to_cell_assign\"), \n", + " 'ground_truth_dir': ground_truth_dir\n", + " }\n", + "]\n", + "\n", + "# Initialize a list to store per-patch metrics for all methods\n", + "metrics_list = []\n", + "\n", + "# Loop through each method to compute per-patch metrics\n", + "for method in methods:\n", + " method_name = method['name']\n", + " results_dir = method['results_dir']\n", + " ground_truth_dir = method['ground_truth_dir']\n", + " \n", + " print(f\"Processing {method_name}...\")\n", + " \n", + " # Get list of generated and ground truth files\n", + " generated_files = [f for f in os.listdir(results_dir) if f.endswith('.csv') and f not in [\".ipynb_checkpoints\"]]\n", + " ground_truth_files = [f for f in os.listdir(ground_truth_dir) if f.endswith('.csv') and f not in [\".ipynb_checkpoints\"]]\n", + " \n", + " # Find common files between generated results and ground truth\n", + " common_files = set(generated_files) & set(ground_truth_files)\n", + " \n", + " if not common_files:\n", + " print(f\"No common files found for {method_name}. Skipping method.\")\n", + " continue\n", + " \n", + " # Loop through each common file (patch)\n", + " for fname in common_files:\n", + " ground_truth_path = os.path.join(ground_truth_dir, fname)\n", + " generated_path = os.path.join(results_dir, fname)\n", + " \n", + " # Load ground truth and generated data\n", + " ground_truth = pd.read_csv(ground_truth_path)\n", + " generated = pd.read_csv(generated_path)\n", + " \n", + " # Skip if generated data is empty\n", + " if generated.empty:\n", + " print(f\"No data in generated file {fname} for {method_name}. 
Skipping patch.\")\n", + " continue\n", + " \n", + " # Rename columns for consistency\n", + " if 'id' in generated.columns:\n", + " generated.rename(columns={'id': 'cell_id'}, inplace=True)\n", + " \n", + " # Merge ground truth and generated data on 'cell_id', filling missing values with 0\n", + " merged = pd.merge(\n", + " ground_truth, generated, on='cell_id', how='outer', suffixes=('_gt', '_gen')\n", + " ).fillna(0)\n", + " \n", + " # Extract aligned matrices for ground truth and generated data\n", + " ground_truth_aligned = merged.filter(regex='_gt$').values\n", + " generated_aligned = merged.filter(regex='_gen$').values\n", + " \n", + " # Ensure matrices are aligned\n", + " if ground_truth_aligned.shape != generated_aligned.shape:\n", + " print(f\"Shape mismatch in patch {fname} for {method_name}. Skipping patch.\")\n", + " continue\n", + " \n", + " # Compute counts for this patch\n", + " tp = np.sum(np.minimum(generated_aligned, ground_truth_aligned))\n", + " predicted = np.sum(generated_aligned)\n", + " actual = np.sum(ground_truth_aligned)\n", + " \n", + " # Compute metrics for this patch\n", + " precision = tp / predicted if predicted > 0 else 0\n", + " recall = tp / actual if actual > 0 else 0\n", + " f1_score = (\n", + " 2 * (precision * recall) / (precision + recall)\n", + " if (precision + recall) > 0 else 0\n", + " )\n", + " \n", + " # Store metrics for this patch\n", + " metrics_list.append({\n", + " 'Method': method_name,\n", + " 'Patch': fname,\n", + " 'Precision': precision,\n", + " 'Recall': recall,\n", + " 'F1 Score': f1_score\n", + " })\n", + "\n", + "# Create a DataFrame with per-patch metrics\n", + "metrics_df = pd.DataFrame(metrics_list)\n", + "\n", + "# Display the first few rows of the DataFrame\n", + "print(\"\\nPer-Patch Metrics:\")\n", + "print(metrics_df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ad2f5b2-4b89-4480-85b5-6af2cc6bfb56", + "metadata": {}, + "outputs": [], + "source": [ + "# plotting\n", + "sns.set(style=\"whitegrid\")\n", + "\n", + "# Create a figure with subplots for each metric\n", + "fig, axes = plt.subplots(1, 3, figsize=(18, 6))\n", + "\n", + "# Precision Violin Plot\n", + "sns.violinplot(x='Method', y='Precision', data=metrics_df, ax=axes[0], inner='quartile', palette='Set2')\n", + "axes[0].set_title('Precision')\n", + "axes[0].set_xlabel('Method')\n", + "axes[0].set_ylabel('value')\n", + "axes[0].set_ylim(0,1)\n", + "axes[0].tick_params(axis='x', labelsize=8) # Adjust the font size here\n", + "\n", + "# Recall Violin Plot\n", + "sns.violinplot(x='Method', y='Recall', data=metrics_df, ax=axes[1], inner='quartile', palette='Set2')\n", + "axes[1].set_title('Recall')\n", + "axes[1].set_xlabel('Method')\n", + "axes[1].set_ylabel('value')\n", + "axes[1].set_ylim(0,1)\n", + "axes[1].tick_params(axis='x', labelsize=8) # Adjust the font size here\n", + "\n", + "# F1 Score Violin Plot\n", + "sns.violinplot(x='Method', y='F1 Score', data=metrics_df, ax=axes[2], inner='quartile', palette='Set2')\n", + "axes[2].set_title('F1 Score')\n", + "axes[2].set_xlabel('Method')\n", + "axes[2].set_ylabel('value')\n", + "axes[2].set_ylim(0,1)\n", + "axes[2].tick_params(axis='x', labelsize=8) # Adjust the font size here\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/synthetic_data/generate_synthetic_data_seqFISH.ipynb b/src/synthetic_data/generate_synthetic_data_seqFISH.ipynb new file mode 100644 index 0000000..234b86f --- /dev/null +++ b/src/synthetic_data/generate_synthetic_data_seqFISH.ipynb @@ -0,0 +1,1027 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4d2cee82-d84e-4c57-af58-6ce961a3f819", + "metadata": {}, + "source": [ + "To generate synthetic VisiumHD data from seqFISH+, please read and run all the cells below. Thanks!" + ] + }, + { + "cell_type": "markdown", + "id": "a9fcd48a-2f55-43b4-befd-8d646ea634cf", + "metadata": { + "tags": [] + }, + "source": [ + "### Install prerequisite libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7453e3e3-a55c-47fb-ab83-2c3743833b89", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: pip in /opt/conda/lib/python3.10/site-packages (24.2)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: scipy in /home/oneai/.local/lib/python3.10/site-packages (1.10.0)\n", + "Requirement already satisfied: numpy<1.27.0,>=1.19.5 in /home/oneai/.local/lib/python3.10/site-packages (from scipy) (1.22.4)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: shapely in /home/oneai/.local/lib/python3.10/site-packages (2.0.0)\n", + "Requirement already satisfied: numpy>=1.14 in /home/oneai/.local/lib/python3.10/site-packages (from shapely) (1.22.4)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: tifffile in /home/oneai/.local/lib/python3.10/site-packages (2022.10.10)\n", + "Requirement already satisfied: numpy>=1.19.2 in /home/oneai/.local/lib/python3.10/site-packages (from tifffile) (1.22.4)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: plotly in /home/oneai/.local/lib/python3.10/site-packages (5.13.1)\n", + "Requirement already satisfied: tenacity>=6.2.0 in /home/oneai/.local/lib/python3.10/site-packages (from plotly) (9.0.0)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: tensorflow-gpu==2.10.0 in /opt/conda/lib/python3.10/site-packages (2.10.0)\n", + "Requirement already satisfied: absl-py>=1.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.1.0)\n", + "Requirement already satisfied: astunparse>=1.6.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.6.3)\n", + "Requirement already satisfied: 
flatbuffers>=2.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (24.3.25)\n", + "Requirement already satisfied: gast<=0.4.0,>=0.2.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (0.4.0)\n", + "Requirement already satisfied: google-pasta>=0.1.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (0.2.0)\n", + "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.66.1)\n", + "Requirement already satisfied: h5py>=2.9.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (3.11.0)\n", + "Requirement already satisfied: keras<2.11,>=2.10.0 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.10.0)\n", + "Requirement already satisfied: keras-preprocessing>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.1.2)\n", + "Requirement already satisfied: libclang>=13.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (18.1.1)\n", + "Requirement already satisfied: numpy>=1.20 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.22.4)\n", + "Requirement already satisfied: opt-einsum>=2.3.2 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (3.3.0)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (24.0)\n", + "Requirement already satisfied: protobuf<3.20,>=3.9.2 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (3.19.6)\n", + "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (69.5.1)\n", + "Requirement already satisfied: six>=1.12.0 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.16.0)\n", + "Requirement already satisfied: tensorboard<2.11,>=2.10 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.10.1)\n", + "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (0.37.1)\n", + "Requirement already satisfied: tensorflow-estimator<2.11,>=2.10.0 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.10.0)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.4.0)\n", + "Requirement already satisfied: typing-extensions>=3.6.6 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (4.12.2)\n", + "Requirement already satisfied: wrapt>=1.11.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.14.1)\n", + "Requirement already satisfied: wheel<1.0,>=0.23.0 in /opt/conda/lib/python3.10/site-packages (from astunparse>=1.6.0->tensorflow-gpu==2.10.0) (0.43.0)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.33.0)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.4.6)\n", + "Requirement already satisfied: markdown>=2.6.8 in /home/oneai/.local/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.7)\n", + "Requirement already satisfied: 
requests<3,>=2.21.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.31.0)\n", + "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.6.1)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (1.8.1)\n", + "Requirement already satisfied: werkzeug>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.0.4)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (5.4.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.4.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (4.9)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.10/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.0.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (1.26.19)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2024.7.4)\n", + "Requirement already satisfied: MarkupSafe>=2.1.1 in /opt/conda/lib/python3.10/site-packages (from werkzeug>=1.0.1->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.1.5)\n", + "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /opt/conda/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.6.0)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.10/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.2.2)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: stardist in /home/oneai/.local/lib/python3.10/site-packages (0.9.1)\n", + "Requirement already satisfied: csbdeep>=0.8.0 in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (0.8.0)\n", + "Requirement already satisfied: scikit-image in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (0.19.3)\n", + "Requirement already satisfied: numba in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (0.55.2)\n", + "Requirement already satisfied: imageio in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (2.35.1)\n", + "Requirement already 
satisfied: numpy in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (1.22.4)\n", + "Requirement already satisfied: scipy in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (1.10.0)\n", + "Requirement already satisfied: matplotlib in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (3.6.2)\n", + "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (1.16.0)\n", + "Requirement already satisfied: tifffile in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (2022.10.10)\n", + "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (4.66.2)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (24.0)\n", + "Requirement already satisfied: pillow>=8.3.2 in /home/oneai/.local/lib/python3.10/site-packages (from imageio->stardist) (10.4.0)\n", + "Requirement already satisfied: llvmlite<0.39,>=0.38.0rc1 in /home/oneai/.local/lib/python3.10/site-packages (from numba->stardist) (0.38.1)\n", + "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from numba->stardist) (69.5.1)\n", + "Requirement already satisfied: networkx>=2.2 in /home/oneai/.local/lib/python3.10/site-packages (from scikit-image->stardist) (3.3)\n", + "Requirement already satisfied: PyWavelets>=1.1.1 in /home/oneai/.local/lib/python3.10/site-packages (from scikit-image->stardist) (1.6.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (1.2.1)\n", + "Requirement already satisfied: cycler>=0.10 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (4.53.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (1.4.5)\n", + "Requirement already satisfied: pyparsing>=2.2.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (3.1.4)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (2.9.0)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: geopandas in /home/oneai/.local/lib/python3.10/site-packages (0.12.2)\n", + "Requirement already satisfied: pandas>=1.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (1.5.2)\n", + "Requirement already satisfied: shapely>=1.7 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (2.0.0)\n", + "Requirement already satisfied: fiona>=1.8 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (1.9.6)\n", + "Requirement already satisfied: pyproj>=2.6.1.post1 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (3.6.1)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from geopandas) (24.0)\n", + "Requirement already satisfied: attrs>=19.2.0 in /opt/conda/lib/python3.10/site-packages (from 
fiona>=1.8->geopandas) (24.2.0)\n", + "Requirement already satisfied: certifi in /opt/conda/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (2024.7.4)\n", + "Requirement already satisfied: click~=8.0 in /home/oneai/.local/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (8.1.7)\n", + "Requirement already satisfied: click-plugins>=1.0 in /home/oneai/.local/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (1.1.1)\n", + "Requirement already satisfied: cligj>=0.5 in /home/oneai/.local/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (0.7.2)\n", + "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (1.16.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.0->geopandas) (2.9.0)\n", + "Requirement already satisfied: pytz>=2020.1 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.0.0->geopandas) (2022.7.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.0.0->geopandas) (1.22.4)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: scanpy in /home/oneai/.local/lib/python3.10/site-packages (1.9.1)\n", + "Requirement already satisfied: anndata>=0.7.4 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.8.0)\n", + "Requirement already satisfied: numpy>=1.17.0 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.22.4)\n", + "Requirement already satisfied: matplotlib>=3.4 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (3.6.2)\n", + "Requirement already satisfied: pandas>=1.0 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.5.2)\n", + "Requirement already satisfied: scipy>=1.4 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.10.0)\n", + "Requirement already satisfied: seaborn in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.13.2)\n", + "Requirement already satisfied: h5py>=3 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (3.11.0)\n", + "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from scanpy) (4.66.2)\n", + "Requirement already satisfied: scikit-learn>=0.22 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.2.0)\n", + "Requirement already satisfied: statsmodels>=0.10.0rc2 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.14.2)\n", + "Requirement already satisfied: patsy in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.5.6)\n", + "Requirement already satisfied: networkx>=2.3 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (3.3)\n", + "Requirement already satisfied: natsort in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (8.4.0)\n", + "Requirement already satisfied: joblib in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.4.2)\n", + "Requirement already satisfied: numba>=0.41.0 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.55.2)\n", + "Requirement already satisfied: umap-learn>=0.3.10 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.5.6)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from scanpy) (24.0)\n", + "Requirement already satisfied: session-info in 
/home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.0.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (1.2.1)\n", + "Requirement already satisfied: cycler>=0.10 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (4.53.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (1.4.5)\n", + "Requirement already satisfied: pillow>=6.2.0 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (10.4.0)\n", + "Requirement already satisfied: pyparsing>=2.2.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (3.1.4)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (2.9.0)\n", + "Requirement already satisfied: llvmlite<0.39,>=0.38.0rc1 in /home/oneai/.local/lib/python3.10/site-packages (from numba>=0.41.0->scanpy) (0.38.1)\n", + "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from numba>=0.41.0->scanpy) (69.5.1)\n", + "Requirement already satisfied: pytz>=2020.1 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.0->scanpy) (2022.7.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from scikit-learn>=0.22->scanpy) (3.1.0)\n", + "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from patsy->scanpy) (1.16.0)\n", + "Requirement already satisfied: pynndescent>=0.5 in /home/oneai/.local/lib/python3.10/site-packages (from umap-learn>=0.3.10->scanpy) (0.5.13)\n", + "Requirement already satisfied: stdlib-list in /home/oneai/.local/lib/python3.10/site-packages (from session-info->scanpy) (0.10.0)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: fastparquet in /home/oneai/.local/lib/python3.10/site-packages (2024.5.0)\n", + "Requirement already satisfied: pandas>=1.5.0 in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (1.5.2)\n", + "Requirement already satisfied: numpy in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (1.22.4)\n", + "Requirement already satisfied: cramjam>=2.3 in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (2.8.3)\n", + "Requirement already satisfied: fsspec in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (2024.6.1)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from fastparquet) (24.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.5.0->fastparquet) (2.9.0)\n", + "Requirement already satisfied: pytz>=2020.1 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.5.0->fastparquet) (2022.7.1)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas>=1.5.0->fastparquet) (1.16.0)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, 
https://pypi.org/simple\n", + "Requirement already satisfied: imagecodecs in /home/oneai/.local/lib/python3.10/site-packages (2024.6.1)\n", + "Requirement already satisfied: numpy in /home/oneai/.local/lib/python3.10/site-packages (from imagecodecs) (1.22.4)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: zarr in /home/oneai/.local/lib/python3.10/site-packages (2.17.1)\n", + "Requirement already satisfied: asciitree in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (0.3.3)\n", + "Requirement already satisfied: numpy>=1.21.1 in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (1.22.4)\n", + "Requirement already satisfied: numcodecs>=0.10.0 in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (0.13.0)\n", + "Requirement already satisfied: fasteners in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (0.19)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: scipy in /home/oneai/.local/lib/python3.10/site-packages (1.10.0)\n", + "Requirement already satisfied: numpy<1.27.0,>=1.19.5 in /home/oneai/.local/lib/python3.10/site-packages (from scipy) (1.22.4)\n", + "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", + "Requirement already satisfied: h5py in /home/oneai/.local/lib/python3.10/site-packages (3.11.0)\n", + "Requirement already satisfied: numpy>=1.17.3 in /home/oneai/.local/lib/python3.10/site-packages (from h5py) (1.22.4)\n" + ] + } + ], + "source": [ + "!pip install --upgrade pip\n", + "!pip install scipy\n", + "!pip install shapely\n", + "!pip install tifffile\n", + "!pip install plotly\n", + "!pip install tensorflow-gpu==2.10.0\n", + "!pip install stardist\n", + "!pip install geopandas\n", + "!pip install scanpy\n", + "!pip install fastparquet\n", + "!pip install imagecodecs\n", + "!pip install zarr\n", + "!pip install scipy\n", + "!pip install h5py" + ] + }, + { + "cell_type": "markdown", + "id": "1f79fb2c-0fd9-4bd4-8be9-4d1bd04d8733", + "metadata": { + "tags": [] + }, + "source": [ + "### Import Relevant Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16e4dc02-2b8d-4e00-9cbd-8a4d151ca5af", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "import tifffile as tifi # Package to read the WSI (whole slide image)\n", + "from csbdeep.utils import normalize # Image normalization\n", + "from shapely.geometry import Polygon, Point # Representing bins and cells as Shapely Polygons and Point objects\n", + "from shapely import wkt\n", + "import geopandas as gpd # Geopandas for storing Shapely objects\n", + "from matplotlib.colors import ListedColormap\n", + "import matplotlib.pyplot as plt\n", + "import scanpy as sc\n", + "import pandas as pd\n", + "from scipy import sparse\n", + "import anndata\n", + "import os\n", + "import gzip\n", + "import numpy as np\n", + "import re\n", + "import shapely\n", + "import zarr\n" + ] + }, + { + "cell_type": "markdown", + "id": "a91a092e-781d-4a9e-8777-d3bb9c99309c", + "metadata": { + "tags": [] + }, + "source": [ + "### Create folders to store synthetic data" + ] + }, + { + "cell_type": "markdown", + "id": 
"37e30a8b-77f8-4d2c-97c2-8274eb0d23a3", + "metadata": {}, + "source": [ + "For both the `seqfish_dir` and `enact_data_dir`, change `\"/home/oneai/\"` to the directory that stores this repo." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01f77ecd-3f9a-4a39-bbb2-e90e851ec360", + "metadata": {}, + "outputs": [], + "source": [ + "seqfish_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/synthetic_data/seqFISH\" # Update it to the directory where you want to save the synthetic data\n", + "enact_data_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/chunks\" # Directory that saves all the input and results of the enact pipeline, \n", + "# should end with \"oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/chunks\"\n", + "\n", + "transcripts_df_chunks_dir = os.path.join(seqfish_dir, \"transcripts_patches\") # Directory to store the files that contain the transcripts info for each chunk\n", + "output_dir = os.path.join(enact_data_dir, \"bins_gdf\") # Directory to store the generated synthetic binned transcript counts\n", + "cells_df_chunks_dir = os.path.join(enact_data_dir,\"cells_gdf\") # Directory to store the generated synthetic binned transcript counts\n", + "\n", + "# Making relevant directories\n", + "os.makedirs(seqfish_dir, exist_ok=True)\n", + "os.makedirs(enact_data_dir, exist_ok=True)\n", + "os.makedirs(transcripts_df_chunks_dir, exist_ok=True)\n", + "os.makedirs(output_dir, exist_ok=True)\n", + "os.makedirs(cells_df_chunks_dir, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "id": "0048c41f-18ee-4b92-b7ea-680956330667", + "metadata": { + "tags": [] + }, + "source": [ + "### Download seqFISH+ data\n", + "\n", + "1. Download \"ROIs_Experiment1_NIH3T3.zip\" from https://zenodo.org/records/2669683#.Xqi1w5NKg6g to seqfish_dir. The zipfile contains cell segmentation files\n", + "2. Download \"run1.csv.gz\" from https://github.com/MonashBioinformaticsPlatform/seqfish-hack. It contains the tidy format of \"seqFISH+_NIH3T3_point_locations.zip\" from the official seqFISH+ zenodo site" + ] + }, + { + "cell_type": "markdown", + "id": "46a8d90a-65dd-4e93-b4e2-4a257d6e1dc7", + "metadata": { + "tags": [] + }, + "source": [ + "### Load Cell & Transcripts Info" + ] + }, + { + "cell_type": "markdown", + "id": "a40feb4c-1510-4222-bdec-a5e419758f32", + "metadata": {}, + "source": [ + "This following cells first unzip \"ROIs_Experiment1_NIH3T3.zip\" to extract the cell segmentation information. 
Then load transcripts dataframe from \"run1.csv.gz\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7bb6152-3999-4ccf-9a08-8fad268ab972", + "metadata": {}, + "outputs": [], + "source": [ + "import zipfile\n", + "import os\n", + "zip_file_path = os.path.join(seqfish_dir, \"ROIs_Experiment1_NIH3T3.zip\")\n", + "\n", + "# Open the ZIP file and extract all the contents\n", + "with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n", + " zip_ref.extractall(seqfish_dir)\n", + "\n", + "print(f'Files extracted to {seqfish_dir}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "062ee054-6e0e-4c2b-9782-9e2b328c18e0", + "metadata": {}, + "outputs": [], + "source": [ + "file_path = os.path.join(seqfish_dir, \"run1.csv.gz\")\n", + "\n", + "transcripts_df = pd.read_csv(file_path, compression='gzip')\n", + "print(transcripts_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb2be572-8903-4539-8306-087cf61aa82d", + "metadata": {}, + "outputs": [], + "source": [ + "# convert from pixel to um\n", + "transcripts_df.x = transcripts_df.x*0.103\n", + "transcripts_df.y = transcripts_df.y*0.103\n", + "# label cell to include fov and cell number\n", + "transcripts_df['new_cell_name'] = transcripts_df.apply(lambda x: f\"{x['fov']}_Cell_{x['cell']}\", axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "5c8134a2-9a2b-41b6-81ab-4e292609e2f2", + "metadata": { + "tags": [] + }, + "source": [ + "### Generate Ground Truth" + ] + }, + { + "cell_type": "markdown", + "id": "9ae7d39d-f065-454a-bea6-7d31f57139fd", + "metadata": {}, + "source": [ + "The following cell will generate and save the ground truth of the synthetic VisiumHD data for the use of bin-to-cell assignment methods evaluation. Ground truth dataframe consists of rows representing the transcript counts of each cell. Each column represents a gene feature (gene feature name is also the column name)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9bc5483-2357-40aa-a5b8-1a140b08967a", + "metadata": {}, + "outputs": [], + "source": [ + "groundtruth_df = transcripts_df.pivot_table(index=['new_cell_name'], columns='gene', aggfunc='size', fill_value=0)\n", + "ground_truth_file = os.path.join(seqfish_dir, \"groundtruth.csv\")\n", + "groundtruth_df.to_csv(ground_truth_file)" + ] + }, + { + "cell_type": "markdown", + "id": "dafe70a1-ed23-4cb6-a7b6-d35e4c01f895", + "metadata": { + "tags": [] + }, + "source": [ + "### Generate Synthetic VesiumHD Dataset" + ] + }, + { + "cell_type": "markdown", + "id": "5bdd8461-7bcc-4101-b26b-765daf975916", + "metadata": { + "tags": [] + }, + "source": [ + "#### Break transcripts df to patches (based on fov)" + ] + }, + { + "cell_type": "markdown", + "id": "b0d353b0-74cc-44f1-9a5b-ab5533d5d76a", + "metadata": {}, + "source": [ + "Break transcripts df to patches based on their field of view (fov), since cell segmentation is done on each individual fov seperately." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60fb886a-5893-40ba-b187-650d6cfb4ed6", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a df for each fov\n", + "grouped = transcripts_df.groupby(['fov'])\n", + "for fov, group in grouped:\n", + " filename = f\"patch_{fov}.csv\"\n", + " output_loc = os.path.join(transcripts_df_chunks_dir, filename)\n", + " group.to_csv(output_loc)\n", + "\n", + " print(f\"Saved {filename}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a7bbc9ec-675b-4b25-8448-334ed317798a", + "metadata": { + "tags": [] + }, + "source": [ + "#### Generate synthetic vesiumHD for each patch" + ] + }, + { + "cell_type": "markdown", + "id": "99052790-7e12-4851-b9a4-e9ead3a55d0f", + "metadata": {}, + "source": [ + "Each fov is broken into bins of size 2um x 2um. The synthetic data contains transcript counts orgnized by bin_id. Each row contains transcript counts for a unique bin. Bins with no transcript counts is not included. \n", + "\n", + "In addition to all the gene features, there are two additional columns represent the row number and column number of the bin, and a column contains the Shapely polygon item that represents the bin. The first column is the bin_id." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d19155a0-5646-49bd-915c-94737e251bb0", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_synthetic_VesiumHD_data(transcripts_df, bin_size=2):\n", + " \n", + " filtered_df = transcripts_df.copy()\n", + " \n", + " # assigne bin to each transcript\n", + " filtered_df.loc[:, 'row'] =np.ceil(filtered_df['y'] / bin_size).astype(int)\n", + " filtered_df.loc[:, 'column'] = np.ceil(filtered_df['x'] / bin_size).astype(int)\n", + " filtered_df.loc[:, 'assigned_bin_id'] = filtered_df.apply(\n", + " lambda row: f\"{bin_size}um_\" + str(row['row']).zfill(5) +\"_\"+ str(row['column']).zfill(5),\n", + " axis=1)\n", + " bin_coordinates = filtered_df[['assigned_bin_id', 'row', 'column']].drop_duplicates().set_index('assigned_bin_id')\n", + " bin_gene_matrix = filtered_df.groupby(['assigned_bin_id', 'gene']).size().unstack(fill_value=0)\n", + " bin_gene_matrix_with_coords = bin_gene_matrix.merge(bin_coordinates, left_index=True, right_index=True)\n", + " return bin_gene_matrix_with_coords" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd804c49-dc85-4fa9-85d4-a621cf0598ae", + "metadata": {}, + "outputs": [], + "source": [ + "# Extract row and column number from the bin_id\n", + "def extract_numbers(entry):\n", + " match = re.search(r'_(\\d{5})_(\\d{5})', entry)\n", + " if match:\n", + " number1 = int(match.group(1).lstrip('0')) \n", + " number2 = int(match.group(2).lstrip('0')) \n", + " return number2*2-1, number1*2-1\n", + " else:\n", + " return None, None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee921e47-70e4-4bee-92e3-6ce40a0fb50d", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "def generate_bin_polys(bins_df, x_col, y_col, bin_size):\n", + " \"\"\"Represents the bins as Shapely polygons\n", + "\n", + " Args:\n", + " bins_df (pd.DataFrame): bins dataframe\n", + " x_col (str): column with the bin centre x-coordinate\n", + " y_col (str): column with the bin centre y-coordinate\n", + " bin_size (int): bin size in pixels\n", + "\n", + " Returns:\n", + " list: list of Shapely polygons\n", + " \"\"\"\n", + " geometry = []\n", + " # Generates Shapely polygons to represent each bin\n", + "\n", + " if True:\n", + " 
half_bin_size = bin_size / 2\n", + " bbox_coords = pd.DataFrame(\n", + " {\n", + " \"min_x\": bins_df[x_col] - half_bin_size,\n", + " \"min_y\": bins_df[y_col] - half_bin_size,\n", + " \"max_x\": bins_df[x_col] + half_bin_size,\n", + " \"max_y\": bins_df[y_col] + half_bin_size,\n", + " }\n", + " )\n", + " geometry = [\n", + " shapely.geometry.box(min_x, min_y, max_x, max_y)\n", + " for min_x, min_y, max_x, max_y in tqdm(\n", + " zip(\n", + " bbox_coords[\"min_x\"],\n", + " bbox_coords[\"min_y\"],\n", + " bbox_coords[\"max_x\"],\n", + " bbox_coords[\"max_y\"],\n", + " ),\n", + " total=len(bins_df),\n", + " )\n", + " ]\n", + "\n", + " return geometry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f1c4071-ff50-4ec1-bd0d-37c8ddecaa54", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Loop through all the transcripra_df patches and generate gene-to-bin assignments \n", + "bin_size = 2\n", + "transcripts_df_chunks = os.listdir(transcripts_df_chunks_dir)\n", + "for chunk_fname in transcripts_df_chunks:\n", + " output_loc = os.path.join(output_dir, chunk_fname)\n", + " if chunk_fname in [\".ipynb_checkpoints\"]:\n", + " continue\n", + " # if os.path.exists(output_loc):\n", + " # continue\n", + " transcripts_df_chunk = pd.read_csv(os.path.join(transcripts_df_chunks_dir, chunk_fname))\n", + " bin_df_chunk = generate_synthetic_VesiumHD_data(transcripts_df_chunk, bin_size)\n", + " bin_df_chunk['column'] = bin_df_chunk['column']*2-1\n", + " bin_df_chunk['row'] = bin_df_chunk['row']*2-1\n", + " bin_df_chunk['geometry'] = generate_bin_polys(bin_df_chunk, 'column', 'row', 2)\n", + " bin_gdf_chunk = gpd.GeoDataFrame( bin_df_chunk, geometry = bin_df_chunk['geometry'])\n", + " bin_gdf_chunk.to_csv(output_loc)\n", + " \n", + " print(f\"Successfully assigned transcripts to bins for {chunk_fname}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2ae8aa8e-0a17-48ae-86ed-81a04ec203dc", + "metadata": { + "tags": [] + }, + "source": [ + "### Generate ENACT pipeline cell segmentation input" + ] + }, + { + "cell_type": "markdown", + "id": "a2fc8e57-23d4-4f71-971b-6e4e1d9f0267", + "metadata": {}, + "source": [ + "This session generate the cell_df patches required to run the enact pipeline. The main purpose is to create Shapely polygons that represent the cell outline." 
+ ] + }, + { + "cell_type": "markdown", + "id": "57c34d0c-029c-482f-bc27-fc39e52adf4a", + "metadata": { + "tags": [] + }, + "source": [ + "#### Load cell boundary data and create cell polygons" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b140bc6d-f120-4d18-b302-844bb3b79a63", + "metadata": {}, + "outputs": [], + "source": [ + "import read_roi\n", + "def process_roi_file(key, roi_file_path):\n", + " roi_data = read_roi.read_roi_file(roi_file_path)\n", + " data = roi_data[key]\n", + " # Apply the scaling factor to each coordinate separately\n", + " scaled_x = [x * 0.103 for x in data['x']]\n", + " scaled_y = [y * 0.103 for y in data['y']]\n", + " # Create the list of points using zip on the scaled coordinates\n", + " points = [(x, y) for x, y in zip(scaled_x, scaled_y)]\n", + " # Create and return the polygon\n", + " polygon = Polygon(points)\n", + " return polygon" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5295212-8548-44a1-b15f-1234bdf28b88", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_fov_from_string(s):\n", + " # Search for one or more digits in the string\n", + " match = re.search(r'\\d+', s)\n", + " if match:\n", + " return int(match.group(0))+1 # Convert the found number to an integer\n", + " else:\n", + " return None # Return None if no number is found" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fef3e532-d471-4635-b743-947c402dbe35", + "metadata": {}, + "outputs": [], + "source": [ + "base_path = os.path.join(seqfish_dir, \"ALL_Roi\") # Change this to the path where your fov folders are stored\n", + "fov_data = []\n", + "\n", + "for fov_folder in os.listdir(base_path):\n", + " fov_folder_path = os.path.join(base_path, fov_folder)\n", + " if os.path.isdir(fov_folder_path):\n", + " # Loop through each ROI file in the fov folder\n", + " for roi_file in os.listdir(fov_folder_path):\n", + " if roi_file.endswith('.roi'):\n", + " key = roi_file.replace('.roi', '')\n", + " roi_file_path = os.path.join(fov_folder_path, roi_file)\n", + " polygon = process_roi_file(key, roi_file_path)\n", + " fov_data.append({\n", + " 'fov': extract_fov_from_string(fov_folder),\n", + " 'cell': roi_file.replace('.roi', ''),\n", + " 'geometry': polygon\n", + " })\n", + "\n", + "cell_boundary_df = pd.DataFrame(fov_data)" + ] + }, + { + "cell_type": "markdown", + "id": "1f3b01b4-c042-4e70-9dd0-7ef88741b833", + "metadata": { + "tags": [] + }, + "source": [ + "#### relabel cell name of polygons df to the standard name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15c59f4d-6fce-4702-861a-176516f518b3", + "metadata": {}, + "outputs": [], + "source": [ + "df_sorted = cell_boundary_df.sort_values(by=['fov', 'cell'])\n", + "df_sorted['cell_id'] = df_sorted.groupby('fov').cumcount() + 1\n", + "df_sorted['cell_id'] = df_sorted.apply(lambda x: f\"{x['fov']}_Cell_{x['cell_id']}\", axis=1)\n", + "df_sorted.to_csv(\"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/cells_df.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "8c9e51e0-b001-4b31-a6cb-d9a9c8f32eb4", + "metadata": { + "tags": [] + }, + "source": [ + "#### Break cell polygons df to patches (based on fov)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13e7bc10-1903-46ef-9042-9086b35259a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved patch_1.csv\n", + "Saved patch_2.csv\n", + "Saved patch_3.csv\n", + "Saved patch_4.csv\n", + "Saved 
patch_5.csv\n", + "Saved patch_6.csv\n", + "Saved patch_7.csv\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1563651/2577681905.py:3: FutureWarning: In a future version of pandas, a length 1 tuple will be returned when iterating over a groupby with a grouper equal to a list of length 1. Don't supply a list with a single grouper to avoid this warning.\n", + " for fov, group in grouped:\n" + ] + } + ], + "source": [ + "\n", + "# Create a df for each patch\n", + "grouped = df_sorted.groupby(['fov'])\n", + "for fov, group in grouped:\n", + " filename = f\"patch_{fov}.csv\"\n", + " output_loc = os.path.join(cells_df_chunks_dir, filename)\n", + " group.to_csv(output_loc)\n", + "\n", + " print(f\"Saved {filename}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "eb4bebd9-bc07-44da-a02f-28d5ddc3c1ed", + "metadata": { + "tags": [] + }, + "source": [ + "### Run ENACT bin-to-cell pipeline\n", + "In the configs.yaml file: \n", + "\n", + " Set \"analysis_name\" in the configs.yaml file to \"seqfish\".\n", + " Set \"run_synthetic\" to True.\n", + " Set \"bin_to_cell_method\" to one of these four: \"naive\", \"weighted_by_area\", \"weighted_by_gene\", or \"weighted_by_cluster\"\n", + "\n", + "Run `make run_enact`" + ] + }, + { + "cell_type": "markdown", + "id": "2a8fc042-2406-4db3-9617-7e3968ce8d28", + "metadata": { + "tags": [] + }, + "source": [ + "### Evaluation of ENACT bin-to-cell results" + ] + }, + { + "cell_type": "markdown", + "id": "01ff50d0-2993-42e9-98e9-fe478c32d605", + "metadata": {}, + "source": [ + "To evaluate and compare the four bin-to-cell methods, please first complete the step above with all four methods. You can also only run the methods you are interested in and change the following code accordingly." + ] + }, + { + "cell_type": "markdown", + "id": "1ef3fb7f-cc99-4f9e-b5cc-321412b08ddb", + "metadata": { + "tags": [] + }, + "source": [ + "#### Calculate precision, recall, and F1 for each bin2cell method" + ] + }, + { + "cell_type": "markdown", + "id": "4d11287c-611d-49d2-a1e6-e12c14a973f5", + "metadata": {}, + "source": [ + "Run this session with all the methods you have run with ENACT, change 'method' in the cell below to the one you want to evaluate." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4580e62f-e2f3-4d1e-9a25-c483304a119e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "\n",
+    "def concatenate_csv_files(directory_path, output_file):\n",
+    "    dataframes = []\n",
+    "\n",
+    "    for filename in os.listdir(directory_path):\n",
+    "        # Skip the merged output file in case this cell has been run before\n",
+    "        if filename.endswith('.csv') and filename != os.path.basename(output_file):\n",
+    "            file_path = os.path.join(directory_path, filename)\n",
+    "            df = pd.read_csv(file_path)\n",
+    "            dataframes.append(df)\n",
+    "\n",
+    "    concatenated_df = pd.concat(dataframes, ignore_index=True)\n",
+    "    concatenated_df = concatenated_df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')\n",
+    "    sorted_df = concatenated_df.sort_values(by='id')\n",
+    "    sorted_df.to_csv(output_file, index=False)\n",
+    "    print(f\"All CSV files have been concatenated into {output_file}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f3c684f3-5b10-4bd2-8e1c-81d4cdb68ee4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Concatenate all patches of the ENACT results for the selected bin-to-cell method\n",
+    "method = \"weighted_by_gene\"  # other methods: \"naive\", \"weighted_by_area\", \"weighted_by_cluster\"\n",
+    "directory_path = os.path.join(enact_data_dir, method, \"bin_to_cell_assign\")\n",
+    "output_file = os.path.join(enact_data_dir, method, \"bin_to_cell_assign\", \"merged.csv\")\n",
+    "\n",
+    "concatenate_csv_files(directory_path, output_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "263c024a-821e-4d15-9abd-d3463a8e34f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "def calculate_metrics(ground_truth_file, generated_file, eval_file, method_name):\n",
+    "    # Load ground truth and generated data\n",
+    "    ground_truth = pd.read_csv(ground_truth_file)\n",
+    "    generated = pd.read_csv(generated_file)\n",
+    "    generated = generated.fillna(0)\n",
+    "    # Make sure both tables use the same cell identifier column\n",
+    "    if 'id' in generated.columns:\n",
+    "        generated.rename(columns={'id': 'new_cell_name'}, inplace=True)\n",
+    "\n",
+    "    # Merge data on 'new_cell_name'\n",
+    "    merged = pd.merge(\n",
+    "        ground_truth, generated, on='new_cell_name', how='outer', suffixes=('_gt', '_gen')\n",
+    "    ).fillna(0)\n",
+    "\n",
+    "    # Identify the gene columns coming from each table\n",
+    "    gt_columns = merged.filter(like='_gt').columns\n",
+    "    gen_columns = merged.filter(like='_gen').columns\n",
+    "\n",
+    "    # Sort the gene columns so both matrices follow the same gene order\n",
+    "    # (assumes the ground truth and generated files cover the same genes)\n",
+    "    ordered_gt_columns = sorted(gt_columns)\n",
+    "    ordered_gen_columns = sorted(gen_columns)\n",
+    "\n",
+    "    # Extract aligned matrices for ground truth and generated data\n",
+    "    ground_truth_aligned = merged[['new_cell_name'] + ordered_gt_columns].values\n",
+    "    generated_aligned = merged[['new_cell_name'] + ordered_gen_columns].values\n",
+    "\n",
+    "    # Ensure matrices are aligned and have the same shape\n",
+    "    if ground_truth_aligned.shape[1] != generated_aligned.shape[1]:\n",
+    "        raise ValueError(\"The aligned matrices must have the same shape!\")\n",
+    "\n",
+    "    ground_truth_aligned = ground_truth_aligned[:, 1:]  # Exclude cell ids\n",
+    "    generated_aligned = generated_aligned[:, 1:]\n",
+    "\n",
+    "    # True positives per cell: gene-wise minimum of generated and ground-truth counts\n",
+    "    tp = np.sum(np.minimum(generated_aligned, ground_truth_aligned), axis=1)\n",
+    "    predicted = np.sum(generated_aligned, axis=1)\n",
+    "    actual = np.sum(ground_truth_aligned, axis=1)\n",
+    "\n",
+    "    # Calculate precision, recall, and F1 score for each cell\n",
+    "    precision = tp / predicted\n",
+    "    recall = tp / actual\n",
+    "    f1_score = 2 * (precision * recall) / (precision + recall)\n",
+    "\n",
+    "    # Label every row with the bin-to-cell method being evaluated\n",
+    "    method_column = np.full((precision.shape[0],), method_name)\n",
+    "\n",
+    "    df = pd.DataFrame({\n",
+    "        'Precision': precision,\n",
+    "        'Recall': recall,\n",
+    "        'F1 Score': f1_score,\n",
+    "        'Method': method_column\n",
+    "    })\n",
+    "\n",
+    "    df.to_csv(eval_file)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db707de6-8da9-495e-82a0-69c009cf1475",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ground_truth_file = os.path.join(seqfish_dir, \"groundtruth.csv\")\n",
+    "generated_file = os.path.join(enact_data_dir, method, \"bin_to_cell_assign\", \"merged.csv\")\n",
+    "eval_file = os.path.join(enact_data_dir, method, \"eval.csv\")\n",
+    "\n",
+    "calculate_metrics(ground_truth_file, generated_file, eval_file, method)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5cd9470e-8410-4510-9165-cebd466ab343",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Create violin plots comparing the four bin2cell methods"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b78b6e7d-0e57-46fd-bddb-c701750a625b",
+   "metadata": {},
+   "source": [
+    "The following cells create violin plots for all four methods so that their results can be compared. To compare only the methods you have run, change the `file_names` list to include only those."
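+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0a1b2c3d-4e5f-4a6b-8c7d-9e0f1a2b3c4d",
+   "metadata": {},
+   "source": [
+    "If you also want a numeric summary alongside the violin plots, the per-method averages can be computed from the same `metrics_df` built in the next cell. A minimal sketch (run it after `metrics_df` has been created):\n",
+    "\n",
+    "```python\n",
+    "# Mean precision, recall, and F1 per bin-to-cell method\n",
+    "summary = metrics_df.groupby('Method')[['Precision', 'Recall', 'F1 Score']].mean()\n",
+    "print(summary.round(3))\n",
+    "```"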
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7dc3b4e5-798b-4d0f-a243-bc619daa6f50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file_names = [os.path.join(enact_data_dir, \"naive/eval.csv\"),\n",
+    "              os.path.join(enact_data_dir, \"weighted_by_area/eval.csv\"),\n",
+    "              os.path.join(enact_data_dir, \"weighted_by_gene/eval.csv\"),\n",
+    "              os.path.join(enact_data_dir, \"weighted_by_cluster/eval.csv\")]  # Keep only the methods you have run\n",
+    "\n",
+    "# Read and concatenate all files\n",
+    "df_list = [pd.read_csv(file) for file in file_names]\n",
+    "metrics_df = pd.concat(df_list, ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a9f977f-6530-446a-9aa4-57d0cbbca85b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize the distributions using violin plots\n",
+    "sns.set(style=\"whitegrid\")\n",
+    "\n",
+    "# One subplot per metric, sharing the same styling\n",
+    "metrics = ['Precision', 'Recall', 'F1 Score']\n",
+    "fig, axes = plt.subplots(1, 3, figsize=(18, 6))\n",
+    "\n",
+    "for ax, metric in zip(axes, metrics):\n",
+    "    sns.violinplot(x='Method', y=metric, data=metrics_df, ax=ax, inner='quartile', palette='Set2')\n",
+    "    ax.set_title(metric)\n",
+    "    ax.set_xlabel('Method')\n",
+    "    ax.set_ylabel('Value')\n",
+    "    ax.set_ylim(0.8, 1)\n",
+    "    ax.tick_params(axis='x', labelsize=8)  # Smaller font so the method names fit\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/utils/logging.py b/src/utils/logging.py
new file mode 100644
index 0000000..5a804a5
--- /dev/null
+++ b/src/utils/logging.py
@@ -0,0 +1,26 @@
+"""
+Created By : ...
+Created Date: DD/MM/YYYY
+Description : ...
+"""
+import logging
+
+
+def get_logger(app_name):
+    """Create and configure logger.
+
+    :param app_name: name of the logger, typically the calling application or module
+    :type app_name: str
+    :return: logger
+    :rtype: Logger
+    """
+    logger = logging.getLogger(app_name)
+    logger.setLevel(logging.DEBUG)
+    # Attach a handler only once so repeated calls do not duplicate log lines
+    if not logger.handlers:
+        handler = logging.StreamHandler()
+        handler.setLevel(logging.DEBUG)
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+    return logger