diff --git a/.gitignore b/.gitignore index 2ff834f..53792b2 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ *pycache* .DS_Store *.pyc -venv/* \ No newline at end of file +venv/* +.vscode \ No newline at end of file diff --git a/README.md b/README.md index 4a7111e..16e303f 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ On a typical Windows, Mac, or Linux computer: * Create a conda environment: `conda create --name multisero python=3.7` * Activate conda environment: `conda activate multisero` * Pip install dependencies: `pip install -r requirements.txt` -* Add the package to PYTHONPATH. Inside the package directory (...\serology-COVID19), do: `export PYTHONPATH=$PYTHONPATH:$(pwd)` +* If operating from CLI, add the package to PYTHONPATH by navigating to package directory (eg: ...\serology-COVID19), and type `export PYTHONPATH=$PYTHONPATH:$(pwd)` For installation notes for Jetson Nano, see [these notes](docs/installation.md). @@ -47,10 +47,12 @@ optional arguments: -m METADATA, --metadata METADATA specify the file name for the experiment metadata. Assumed to be in the same directory as images. - Default: 'multisero_output_data_metadata.xlsx' + Default: 'multisero_output_data_metadata.xlsx', therefore + metadata flag is NOT necessary when running in -e mode IF + metadata file has the default filename. ``` ### Extract OD from antigen array images -`python multisero.py -e -i -o -m ` will take metadata for antigen array and images as input, and output optical densities for each antigen. +`python multisero.py -e -i -o ` will take metadata for antigen array and images as input, and output optical densities for each antigen. The optical densities are stored in an excel file at the following path: `/multisero___/median_ODs.xlsx` If rerunning some of the wells, the input metadata file needs to contain a sheet named 'rerun_wells' diff --git a/interpretation/od_analyzer.py b/interpretation/od_analyzer.py deleted file mode 100644 index 9e54b57..0000000 --- a/interpretation/od_analyzer.py +++ /dev/null @@ -1,160 +0,0 @@ -import pandas as pd -import numpy as np -import os -import re -from matplotlib import pyplot as plt -import seaborn as sns -from interpretation.plotting import roc_plot_grid, total_plots -from interpretation.report_reader import slice_df, normalize_od, read_output_batch -import array_analyzer.extract.constants as constants - - -def read_config(input_dir): - """ - Load analysis config from the input directory. - :param str input_dir: input directory - :return dataframe ntl_dirs_df: 'multisero output dirs' tab in the config file. - :return dataframe scn_scn_df: 'scienion output dirs' tab in the config file. - :return dataframe plot_setting_df: 'general plotting settings' tab in the config file. - :return dataframe roc_param_df: 'ROC plot' tab in the config file. - :return dataframe cat_param_df: 'categorical plot' tab in the config file. - :return dataframe fit_param_df: 'standard curves' tab in the config file. - """ - config = dict() - config_path = os.path.join(input_dir, constants.METADATA_FILE) - if constants.METADATA_FILE not in os.listdir(input_dir): - raise IOError("analysis config file not found, aborting") - # check that the xlsx file contains necessary worksheets - ntl_dirs_df = scn_scn_df = roc_param_df = cat_param_df = fit_param_df = pd.DataFrame() - with pd.ExcelFile(config_path) as config_file: - plot_setting_df = pd.read_excel(config_file, sheet_name='general plotting settings', - index_col=0, squeeze=True, usecols='A,B') - plot_setting_df.where(plot_setting_df.notnull(), None, inplace=True) - if 'ROC plot' in config_file.sheet_names: - roc_param_df = pd.read_excel(config_file, sheet_name='ROC plot', - index_col=0, squeeze=True, usecols='A,B') - # replace NaN with None - roc_param_df.where(roc_param_df.notnull(), None, inplace=True) - if roc_param_df['serum ID'] is not None: - roc_param_df['serum ID'] = re.split(r'\s*,\s*', roc_param_df['serum ID']) - if 'categorical plot' in config_file.sheet_names: - cat_param_df = pd.read_excel(config_file, sheet_name='categorical plot', - index_col=0, squeeze=True, usecols='A,B') - cat_param_df.where(cat_param_df.notnull(), None, inplace=True) - if cat_param_df['serum ID'] is not None: - cat_param_df['serum ID'] = re.split(r'\s*,\s*', cat_param_df['serum ID']) - if 'standard curves' in config_file.sheet_names: - fit_param_df = pd.read_excel(config_file, sheet_name='standard curves', - index_col=0, squeeze=True, usecols='A,B') - fit_param_df.where(fit_param_df.notnull(), None, inplace=True) - if fit_param_df['serum ID'] is not None: - fit_param_df['serum ID'] = re.split(r'\s*,\s*', fit_param_df['serum ID']) - if not constants.LOAD_REPORT: - assert ('multisero output dirs' in config_file.sheet_names) or \ - ('scienion output dirs' in config_file.sheet_names), \ - "sheet by name 'multisero output dirs' or 'scienion output dirs' are required " \ - "in analysis config file when load_report is False, aborting" - if 'multisero output dirs' in config_file.sheet_names: - ntl_dirs_df = pd.read_excel(config_file, sheet_name='multisero output dirs', comment='#') - if not ntl_dirs_df.isna().loc[0, 'well ID']: - ntl_dirs_df['well ID'] = ntl_dirs_df['well ID'].str.split(pat=r'\s*,\s*') - if 'scienion output dirs' in config_file.sheet_names: - scn_scn_df = pd.read_excel(config_file, sheet_name='scienion output dirs', comment='#') - return ntl_dirs_df, scn_scn_df, plot_setting_df, roc_param_df, cat_param_df, fit_param_df - -def analyze_od(input_dir, output_dir, load_report): - """ - Perform analysis on multisero or scienion OD outputs specified in the config files. - Save the combined table as 'master report' in the output directory. - :param str input_dir: Input directory - :param str output_dir: Output directory - :param bool load_report: If True, load the saved 'master report' in the output directory - from the previous run. Load from the master report is much faster. - """ - os.makedirs(output_dir, exist_ok=True) - ntl_dirs_df, scn_scn_df, plot_setting_df, roc_param_df, cat_param_df, fit_param_df =\ - read_config(input_dir) - stitched_multisero_df = read_output_batch(output_dir, ntl_dirs_df, scn_scn_df, load_report) - if plot_setting_df['antigens to plot'] == 'all': - plot_setting_df['antigens to plot'] = stitched_multisero_df['antigen'].unique() - split_plots_by = plot_setting_df['split plots by'] - split_plots_vals = [None] - if split_plots_by is not None: - split_plots_vals = stitched_multisero_df[split_plots_by].unique() - norm_antigen = plot_setting_df['normalize OD by'] - norm_group = 'plate' - aggregate = 'mean' - # aggregate = None - antigen_list = plot_setting_df['antigens to plot'] - - suffix = '' - df_norm = normalize_od(stitched_multisero_df.copy(), norm_antigen, group=norm_group) - if norm_antigen is not None: - suffix = '_'.join([suffix, norm_antigen, 'norm_per', norm_group]) - - if aggregate is not None: - df_norm = df_norm.groupby(['antigen', 'antigen type', 'serum ID', 'well_id', 'plate ID', 'sample type', - 'serum type', 'serum dilution', 'serum cat', 'pipeline', 'secondary ID', - 'secondary dilution','PRNT'])['OD'].mean().reset_index() - suffix = '_'.join([suffix, aggregate]) - - for split_val in split_plots_vals: - split_suffix = suffix - if split_val is not None: - split_suffix = '_'.join([split_suffix, split_val]) - df_norm_sub = slice_df(df_norm, 'keep', split_plots_by, split_val) - slice_cols = [split_plots_by, 'antigen type', 'antigen'] - slice_keys = [[split_val], ['Diagnostic','Positive'], antigen_list] - #slice_keys = [[split_val], ['Negative'], antigen_list] - slice_actions = ['keep', 'keep', 'keep'] - # general slicing - for col, action, key in zip(slice_cols, slice_actions, slice_keys): - df_norm_sub = slice_df(df_norm_sub, action, col, key) - #%% compute ROC curves and AUC - if not roc_param_df.empty: - sera_roc_list = roc_param_df['serum ID'] - slice_action = roc_param_df['serum ID action'] - # plot specific slicing - roc_df = slice_df(df_norm_sub, slice_action, 'serum ID', sera_roc_list) - fpr = 1 - roc_param_df['specificity'] - ci = roc_param_df['confidence interval'] - hue = roc_param_df['hue'] - # df_norm = offset_od(df_norm, offset_antigen, offset_group) - roc_suffix = split_suffix - if ci is not None: - roc_suffix = '_'.join([roc_suffix, 'ci']) - #%% - print('{} unique positive sera'.format(len(roc_df.loc[roc_df['serum type']=='positive', 'serum ID'].unique()))) - print('{} unique negative sera'.format(len(roc_df.loc[roc_df['serum type'] == 'negative', 'serum ID'].unique()))) - roc_plot_grid(roc_df, constants.RUN_PATH, '_'.join(['ROC', roc_suffix]), 'png', ci=ci, fpr=fpr, hue=hue) - #%% Plot categorical scatter plot for episurvey - if not cat_param_df.empty: - sera_cat_list = cat_param_df['serum ID'] - slice_action = cat_param_df['serum ID action'] - split_subplots_by = cat_param_df['split subplots by'] - hue = cat_param_df['hue'] - # plot specific slicing - cat_df = slice_df(df_norm_sub, slice_action, 'serum ID', sera_cat_list) #serum ID --> antigen - assert not cat_df.empty, 'Plotting dataframe is empty. Please check the plotting keys' - sns.set_context("talk") - g = sns.catplot(x="serum type", y="OD", hue=hue, col=split_subplots_by, kind="swarm", - data=cat_df, col_wrap=3) - g.set_xticklabels(rotation=65, horizontalalignment='right') - - - plt.savefig(os.path.join(constants.RUN_PATH, 'catplot_{}.png'.format(split_suffix)), - dpi=300, bbox_inches='tight') - if cat_param_df['zoom']: - g.set(ylim=(-0.05, 0.4)) - plt.savefig(os.path.join(constants.RUN_PATH, 'catplot_zoom_{}.png'.format(split_suffix)), - dpi=300, bbox_inches='tight') - #%% 4PL fit - if not fit_param_df.empty: - slice_action = fit_param_df['serum ID action'] - hue = fit_param_df['hue'] - dilution_df = slice_df(df_norm_sub, slice_action, 'serum ID', fit_param_df['serum ID']) - split_subplots_by = fit_param_df['split subplots by'] - total_plots(dilution_df, constants.RUN_PATH, 'fit_{}'.format(split_suffix), 'png', hue=hue, - zoom=fit_param_df['zoom'], split_subplots_by=split_subplots_by, col_wrap=2) - plt.close('all') - diff --git a/array_analyzer/__init__.py b/multiSero/__init__.py similarity index 100% rename from array_analyzer/__init__.py rename to multiSero/__init__.py diff --git a/multiSero/array_analyzer/__init__.py b/multiSero/array_analyzer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/array_analyzer/extract/__init__.py b/multiSero/array_analyzer/extract/__init__.py similarity index 100% rename from array_analyzer/extract/__init__.py rename to multiSero/array_analyzer/extract/__init__.py diff --git a/array_analyzer/extract/background_estimator.py b/multiSero/array_analyzer/extract/background_estimator.py similarity index 100% rename from array_analyzer/extract/background_estimator.py rename to multiSero/array_analyzer/extract/background_estimator.py diff --git a/array_analyzer/extract/constants.py b/multiSero/array_analyzer/extract/constants.py similarity index 100% rename from array_analyzer/extract/constants.py rename to multiSero/array_analyzer/extract/constants.py diff --git a/array_analyzer/extract/image_parser.py b/multiSero/array_analyzer/extract/image_parser.py similarity index 99% rename from array_analyzer/extract/image_parser.py rename to multiSero/array_analyzer/extract/image_parser.py index 1cb6bd2..927bbe6 100644 --- a/array_analyzer/extract/image_parser.py +++ b/multiSero/array_analyzer/extract/image_parser.py @@ -12,7 +12,7 @@ from skimage import measure from .img_processing import thresh_and_binarize -from array_analyzer.transform.point_registration import icp +from multiSero.array_analyzer.transform.point_registration import icp """ method is @@ -338,7 +338,7 @@ def find_fiducials_markers(props_, return cent_map -def grid_from_centroids(props_, n_rows, n_cols, grid_spacing=82): +def grid_from_centroids(props_, n_rows, n_cols, grid_spacing=90): """ based on the region props, creates a dictionary of format: key = (centroid_x, centroid_y) diff --git a/array_analyzer/extract/img_processing.py b/multiSero/array_analyzer/extract/img_processing.py similarity index 100% rename from array_analyzer/extract/img_processing.py rename to multiSero/array_analyzer/extract/img_processing.py diff --git a/array_analyzer/extract/metadata.py b/multiSero/array_analyzer/extract/metadata.py similarity index 98% rename from array_analyzer/extract/metadata.py rename to multiSero/array_analyzer/extract/metadata.py index c474389..55a6eaa 100644 --- a/array_analyzer/extract/metadata.py +++ b/multiSero/array_analyzer/extract/metadata.py @@ -3,8 +3,8 @@ import pandas as pd import shutil -import array_analyzer.extract.txt_parser as txt_parser -import array_analyzer.extract.constants as constants +import multiSero.array_analyzer.extract.txt_parser as txt_parser +import multiSero.array_analyzer.extract.constants as constants class MetaData: diff --git a/array_analyzer/extract/notebooks/Array_analyzer_bbox.ipynb b/multiSero/array_analyzer/extract/notebooks/Array_analyzer_bbox.ipynb similarity index 100% rename from array_analyzer/extract/notebooks/Array_analyzer_bbox.ipynb rename to multiSero/array_analyzer/extract/notebooks/Array_analyzer_bbox.ipynb diff --git a/array_analyzer/extract/notebooks/image_parser_nb.ipynb b/multiSero/array_analyzer/extract/notebooks/image_parser_nb.ipynb similarity index 100% rename from array_analyzer/extract/notebooks/image_parser_nb.ipynb rename to multiSero/array_analyzer/extract/notebooks/image_parser_nb.ipynb diff --git a/array_analyzer/extract/notebooks/image_parser_well_mask.ipynb b/multiSero/array_analyzer/extract/notebooks/image_parser_well_mask.ipynb similarity index 100% rename from array_analyzer/extract/notebooks/image_parser_well_mask.ipynb rename to multiSero/array_analyzer/extract/notebooks/image_parser_well_mask.ipynb diff --git a/array_analyzer/extract/notebooks/spot_detection_icp.ipynb b/multiSero/array_analyzer/extract/notebooks/spot_detection_icp.ipynb similarity index 100% rename from array_analyzer/extract/notebooks/spot_detection_icp.ipynb rename to multiSero/array_analyzer/extract/notebooks/spot_detection_icp.ipynb diff --git a/array_analyzer/extract/notebooks/txt_parser_nb.ipynb b/multiSero/array_analyzer/extract/notebooks/txt_parser_nb.ipynb similarity index 100% rename from array_analyzer/extract/notebooks/txt_parser_nb.ipynb rename to multiSero/array_analyzer/extract/notebooks/txt_parser_nb.ipynb diff --git a/array_analyzer/extract/txt_parser.py b/multiSero/array_analyzer/extract/txt_parser.py similarity index 99% rename from array_analyzer/extract/txt_parser.py rename to multiSero/array_analyzer/extract/txt_parser.py index 2466925..982d574 100644 --- a/array_analyzer/extract/txt_parser.py +++ b/multiSero/array_analyzer/extract/txt_parser.py @@ -9,8 +9,6 @@ import pandas as pd import math -import array_analyzer.extract.constants as constants - """ functions like "create__dict" parse files of and return: fiduc: list of dict describing fiducial positions diff --git a/array_analyzer/load/__init__.py b/multiSero/array_analyzer/load/__init__.py similarity index 100% rename from array_analyzer/load/__init__.py rename to multiSero/array_analyzer/load/__init__.py diff --git a/array_analyzer/load/debug_plots.py b/multiSero/array_analyzer/load/debug_plots.py similarity index 100% rename from array_analyzer/load/debug_plots.py rename to multiSero/array_analyzer/load/debug_plots.py diff --git a/array_analyzer/load/report.py b/multiSero/array_analyzer/load/report.py similarity index 99% rename from array_analyzer/load/report.py rename to multiSero/array_analyzer/load/report.py index 10318e7..ca9dcbe 100644 --- a/array_analyzer/load/report.py +++ b/multiSero/array_analyzer/load/report.py @@ -5,7 +5,7 @@ import os import pandas as pd -import array_analyzer.extract.constants as constants +import multiSero.array_analyzer.extract.constants as constants class ReportWriter: diff --git a/array_analyzer/transform/__init__.py b/multiSero/array_analyzer/transform/__init__.py similarity index 100% rename from array_analyzer/transform/__init__.py rename to multiSero/array_analyzer/transform/__init__.py diff --git a/array_analyzer/transform/array_generation.py b/multiSero/array_analyzer/transform/array_generation.py similarity index 94% rename from array_analyzer/transform/array_generation.py rename to multiSero/array_analyzer/transform/array_generation.py index 2ac4fab..819a0e8 100644 --- a/array_analyzer/transform/array_generation.py +++ b/multiSero/array_analyzer/transform/array_generation.py @@ -2,10 +2,10 @@ import numpy as np import pandas as pd -import array_analyzer.extract.constants as constants -import array_analyzer.extract.img_processing as img_processing -import array_analyzer.extract.txt_parser as txt_parser -import array_analyzer.utils.spot_regionprop as regionprop +import multiSero.array_analyzer.extract.constants as constants +import multiSero.array_analyzer.extract.img_processing as img_processing +import multiSero.array_analyzer.extract.txt_parser as txt_parser +import multiSero.array_analyzer.utils.spot_regionprop as regionprop def build_centroid_binary_blocks(cent_list, image_, params_, return_type='region'): @@ -59,7 +59,7 @@ def build_centroid_binary_blocks(cent_list, image_, params_, return_type='region return target -def get_spot_intensity(coords, im, background, search_range=3): +def get_spot_intensity(coords, im, background, search_range=2): """ Extract signal and background intensity at each spot given the spot coordinate with the following steps: diff --git a/array_analyzer/transform/point_registration.py b/multiSero/array_analyzer/transform/point_registration.py similarity index 98% rename from array_analyzer/transform/point_registration.py rename to multiSero/array_analyzer/transform/point_registration.py index dd933bf..e95186e 100644 --- a/array_analyzer/transform/point_registration.py +++ b/multiSero/array_analyzer/transform/point_registration.py @@ -2,7 +2,7 @@ import logging import numpy as np -import array_analyzer.extract.constants as constants +import multiSero.array_analyzer.extract.constants as constants def icp(source, target, max_iterate=50, matrix_diff=1.): @@ -166,10 +166,12 @@ def get_translation_matrix(particle): b = particle[3] * np.sin(particle[2] * np.pi / 180) t_matrix = np.array([[a, b, particle[0]], [-b, a, particle[1]]]) + #t_matrix = np.array([[0, 0, particle[0]], + #[0, 0, particle[1]]]) return t_matrix def particle_filter(self, - max_iter=100, + max_iter=1000, stop_criteria=.1, iter_decrease=.8, nbr_outliers=0): @@ -256,6 +258,7 @@ def particle_filter(self, self.registered_dist = min_dist / (self.fiducial_coords.shape[0] - nbr_outliers) self.logger.info("Particle filter min dist: {}".format(self.registered_dist)) if self.registered_dist > constants.REG_DIST_THRESH: + #if self.registered_dist > 300: self.registration_ok = False else: self.registration_ok = True diff --git a/array_analyzer/utils/__init__.py b/multiSero/array_analyzer/utils/__init__.py similarity index 100% rename from array_analyzer/utils/__init__.py rename to multiSero/array_analyzer/utils/__init__.py diff --git a/array_analyzer/utils/example_id2spots_AEPFeb3.csv b/multiSero/array_analyzer/utils/example_id2spots_AEPFeb3.csv similarity index 100% rename from array_analyzer/utils/example_id2spots_AEPFeb3.csv rename to multiSero/array_analyzer/utils/example_id2spots_AEPFeb3.csv diff --git a/array_analyzer/utils/io_utils.py b/multiSero/array_analyzer/utils/io_utils.py similarity index 100% rename from array_analyzer/utils/io_utils.py rename to multiSero/array_analyzer/utils/io_utils.py diff --git a/array_analyzer/utils/spot_regionprop.py b/multiSero/array_analyzer/utils/spot_regionprop.py similarity index 98% rename from array_analyzer/utils/spot_regionprop.py rename to multiSero/array_analyzer/utils/spot_regionprop.py index 5fed3fa..f466962 100644 --- a/array_analyzer/utils/spot_regionprop.py +++ b/multiSero/array_analyzer/utils/spot_regionprop.py @@ -3,7 +3,7 @@ from skimage import measure from skimage.morphology import disk -import array_analyzer.extract.constants as constants +import multiSero.array_analyzer.extract.constants as constants class SpotRegionprop: diff --git a/array_analyzer/utils/visualize_elisa_spots.py b/multiSero/array_analyzer/utils/visualize_elisa_spots.py similarity index 100% rename from array_analyzer/utils/visualize_elisa_spots.py rename to multiSero/array_analyzer/utils/visualize_elisa_spots.py diff --git a/multiSero/array_analyzer/workflows/__init__.py b/multiSero/array_analyzer/workflows/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/array_analyzer/workflows/interpolation_wf.py b/multiSero/array_analyzer/workflows/interpolation_wf.py similarity index 87% rename from array_analyzer/workflows/interpolation_wf.py rename to multiSero/array_analyzer/workflows/interpolation_wf.py index e3153f7..19ba32a 100644 --- a/array_analyzer/workflows/interpolation_wf.py +++ b/multiSero/array_analyzer/workflows/interpolation_wf.py @@ -4,16 +4,16 @@ import pandas as pd import skimage.io as io -import array_analyzer.extract.image_parser as image_parser -import array_analyzer.extract.txt_parser as txt_parser -import array_analyzer.extract.img_processing as img_processing -import array_analyzer.load.debug_plots as debug_plots -import array_analyzer.load.report as report -import array_analyzer.extract.constants as constants -import array_analyzer.transform.array_generation as array_gen -import array_analyzer.extract.background_estimator as background_estimator -import array_analyzer.utils.io_utils as io_utils -from array_analyzer.extract.metadata import MetaData +import multiSero.array_analyzer.extract.image_parser as image_parser +import multiSero.array_analyzer.extract.txt_parser as txt_parser +import multiSero.array_analyzer.extract.img_processing as img_processing +import multiSero.array_analyzer.load.debug_plots as debug_plots +import multiSero.array_analyzer.load.report as report +import multiSero.array_analyzer.extract.constants as constants +import multiSero.array_analyzer.transform.array_generation as array_gen +import multiSero.array_analyzer.extract.background_estimator as background_estimator +import multiSero.array_analyzer.utils.io_utils as io_utils +from multiSero.array_analyzer.extract.metadata import MetaData def interp(input_dir, output_dir): diff --git a/array_analyzer/workflows/registration_workflow.py b/multiSero/array_analyzer/workflows/registration_workflow.py similarity index 91% rename from array_analyzer/workflows/registration_workflow.py rename to multiSero/array_analyzer/workflows/registration_workflow.py index 708894b..6825cd5 100644 --- a/array_analyzer/workflows/registration_workflow.py +++ b/multiSero/array_analyzer/workflows/registration_workflow.py @@ -1,21 +1,18 @@ -import cv2 as cv import logging -import numpy as np import os import pandas as pd import time -import array_analyzer.extract.background_estimator as background_estimator -import array_analyzer.extract.image_parser as image_parser -import array_analyzer.extract.img_processing as img_processing -import array_analyzer.extract.metadata as metadata -import array_analyzer.extract.txt_parser as txt_parser -import array_analyzer.extract.constants as constants -import array_analyzer.load.debug_plots as debug_plots -import array_analyzer.load.report as report -import array_analyzer.transform.point_registration as registration -import array_analyzer.transform.array_generation as array_gen -import array_analyzer.utils.io_utils as io_utils +import multiSero.array_analyzer.extract.background_estimator as background_estimator +import multiSero.array_analyzer.extract.img_processing as img_processing +import multiSero.array_analyzer.extract.metadata as metadata +import multiSero.array_analyzer.extract.txt_parser as txt_parser +import multiSero.array_analyzer.extract.constants as constants +import multiSero.array_analyzer.load.debug_plots as debug_plots +import multiSero.array_analyzer.load.report as report +import multiSero.array_analyzer.transform.point_registration as registration +import multiSero.array_analyzer.transform.array_generation as array_gen +import multiSero.array_analyzer.utils.io_utils as io_utils def point_registration(input_dir, output_dir): diff --git a/array_analyzer/workflows/well_wf.py b/multiSero/array_analyzer/workflows/well_wf.py similarity index 92% rename from array_analyzer/workflows/well_wf.py rename to multiSero/array_analyzer/workflows/well_wf.py index 075f9ae..47577b1 100644 --- a/array_analyzer/workflows/well_wf.py +++ b/multiSero/array_analyzer/workflows/well_wf.py @@ -1,15 +1,14 @@ -import array_analyzer.extract.image_parser as image_parser -import array_analyzer.extract.img_processing as processing -import array_analyzer.extract.constants as constants -from array_analyzer.extract.metadata import MetaData -import array_analyzer.utils.io_utils as io_utils +import multiSero.array_analyzer.extract.image_parser as image_parser +import multiSero.array_analyzer.extract.img_processing as processing +import multiSero.array_analyzer.extract.constants as constants +from multiSero.array_analyzer.extract.metadata import MetaData +import multiSero.array_analyzer.utils.io_utils as io_utils import time import skimage.io as io import pandas as pd import string import os -import re import numpy as np diff --git a/multiSero/interpretation/od_analyzer.py b/multiSero/interpretation/od_analyzer.py new file mode 100644 index 0000000..d82b966 --- /dev/null +++ b/multiSero/interpretation/od_analyzer.py @@ -0,0 +1,276 @@ +import pandas as pd +import os +import re +from matplotlib import pyplot as plt +import seaborn as sns + +from multiSero.plotting.plotting import roc_plot_grid +from multiSero.interpretation.report_reader import slice_df, normalize_od, read_output_batch +import multiSero.array_analyzer.extract.constants as constants + + +def read_config(input_dir): + """ + Load analysis config from the input directory. + :param str input_dir: input directory + :return dataframe ntl_dirs_df: 'multisero output dirs' tab in the config file. + :return dataframe scn_scn_df: 'scienion output dirs' tab in the config file. + :return dataframe plot_setting_df: 'general plotting settings' tab in the config file. + :return dataframe roc_param_df: 'ROC plot' tab in the config file. + :return dataframe cat_param_df: 'categorical plot' tab in the config file. + :return dataframe fit_param_df: 'standard curves' tab in the config file. + """ + config = dict() + config_path = os.path.join(input_dir, constants.METADATA_FILE) + if constants.METADATA_FILE not in os.listdir(input_dir): + raise IOError("analysis config file not found, aborting") + # check that the xlsx file contains necessary worksheets + ntl_dirs_df = scn_scn_df = roc_param_df = cat_param_df = fit_param_df = pd.DataFrame() + with pd.ExcelFile(config_path) as config_file: + plot_setting_df = pd.read_excel(config_file, sheet_name='general plotting settings', + index_col=0, squeeze=True, usecols='A,B') + plot_setting_df.where(plot_setting_df.notnull(), None, inplace=True) + if 'ROC plot' in config_file.sheet_names: + roc_param_df = pd.read_excel(config_file, sheet_name='ROC plot', + index_col=0, squeeze=True, usecols='A,B') + # replace NaN with None + roc_param_df.where(roc_param_df.notnull(), None, inplace=True) + if roc_param_df['serum ID'] is not None: + roc_param_df['serum ID'] = re.split(r'\s*,\s*', roc_param_df['serum ID']) + if 'categorical plot' in config_file.sheet_names: + cat_param_df = pd.read_excel(config_file, sheet_name='categorical plot', + index_col=0, squeeze=True, usecols='A,B') + cat_param_df.where(cat_param_df.notnull(), None, inplace=True) + if cat_param_df['serum ID'] is not None: + cat_param_df['serum ID'] = re.split(r'\s*,\s*', cat_param_df['serum ID']) + if 'standard curves' in config_file.sheet_names: + fit_param_df = pd.read_excel(config_file, sheet_name='standard curves', + index_col=0, squeeze=True, usecols='A,B') + fit_param_df.where(fit_param_df.notnull(), None, inplace=True) + if fit_param_df['serum ID'] is not None: + fit_param_df['serum ID'] = re.split(r'\s*,\s*', fit_param_df['serum ID']) + if not constants.LOAD_REPORT: + assert ('multisero output dirs' in config_file.sheet_names) or \ + ('scienion output dirs' in config_file.sheet_names), \ + "sheet by name 'multisero output dirs' or 'scienion output dirs' are required " \ + "in analysis config file when load_report is False, aborting" + if 'multisero output dirs' in config_file.sheet_names: + ntl_dirs_df = pd.read_excel(config_file, sheet_name='multisero output dirs', comment='#') + if not ntl_dirs_df.isna().loc[0, 'well ID']: + ntl_dirs_df['well ID'] = ntl_dirs_df['well ID'].str.split(pat=r'\s*,\s*') + if 'scienion output dirs' in config_file.sheet_names: + scn_scn_df = pd.read_excel(config_file, sheet_name='scienion output dirs', comment='#') + return ntl_dirs_df, scn_scn_df, plot_setting_df, roc_param_df, cat_param_df, fit_param_df + +def analyze_od(input_dir, output_dir, load_report): + """ + Perform analysis on multisero or scienion OD outputs specified in the config files. + Save the combined table as 'master report' in the output directory. + :param str input_dir: Input directory + :param str output_dir: Output directory + :param bool load_report: If True, load the saved 'master report' in the output directory + from the previous run. Load from the master report is much faster. + """ + os.makedirs(output_dir, exist_ok=True) + ntl_dirs_df, scn_scn_df, plot_setting_df, roc_param_df, cat_param_df, fit_param_df =\ + read_config(input_dir) + stitched_multisero_df = read_output_batch(output_dir, ntl_dirs_df, scn_scn_df, load_report) + if plot_setting_df['antigens to plot'] == 'all': + plot_setting_df['antigens to plot'] = stitched_multisero_df['antigen'].unique() + split_plots_by = plot_setting_df['split plots by'] + split_plots_vals = [None] + if split_plots_by is not None: + split_plots_vals = stitched_multisero_df[split_plots_by].unique() + norm_antigen = plot_setting_df['normalize OD by'] + norm_group = 'plate' + aggregate = 'mean' + # aggregate = None + antigen_list = [plot_setting_df['antigens to plot'], 'SARS CoV2 RBD 250', 'SARS CoV2 spike 62.5'] + + suffix = '' + df_norm = normalize_od(stitched_multisero_df.copy(), norm_antigen, group=norm_group) + if norm_antigen is not None: + suffix = '_'.join([suffix, norm_antigen, 'norm_per', norm_group]) + + if aggregate is not None: + df_norm = df_norm.groupby(['antigen', 'antigen type', 'serum ID', 'well_id', 'plate ID', 'sample type', + 'serum type', 'serum dilution', 'serum cat', 'pipeline', 'secondary ID', + 'secondary dilution', 'visit value'])['OD'].mean().reset_index() + suffix = '_'.join([suffix, aggregate]) + + for split_val in split_plots_vals: + split_suffix = suffix + if split_val is not None: + split_suffix = '_'.join([split_suffix, split_val]) + df_norm_sub = slice_df(df_norm, 'keep', split_plots_by, split_val) + slice_cols = [split_plots_by, 'antigen type', 'antigen'] + slice_keys = [[split_val], ['Diagnostic'], antigen_list] # might be useful to eventually select for in UI + #slice_keys = [[split_val], ['Negative'], antigen_list] + slice_actions = ['keep', 'keep', 'keep'] + # general slicing + for col, action, key in zip(slice_cols, slice_actions, slice_keys): + df_norm_sub = slice_df(df_norm_sub, action, col, key) + + # %% compute ROC curves and AUC + if not roc_param_df.empty: + sera_roc_list = roc_param_df['serum ID'] + slice_action = roc_param_df['serum ID action'] + # plot specific slicing + roc_df = slice_df(df_norm_sub, slice_action, 'serum ID', sera_roc_list) + fpr = 1 - roc_param_df['specificity'] + ci = roc_param_df['confidence interval'] + hue = roc_param_df['hue'] + # df_norm = offset_od(df_norm, offset_antigen, offset_group) + roc_suffix = split_suffix + if ci is not None: + roc_suffix = '_'.join([roc_suffix, 'ci']) + #%% + print('{} unique positive sera'.format(len(roc_df.loc[roc_df['serum type']=='positive', 'serum ID'].unique()))) + print('{} unique negative sera'.format(len(roc_df.loc[roc_df['serum type'] == 'negative', 'serum ID'].unique()))) + roc_plot_grid(roc_df, constants.RUN_PATH, '_'.join(['ROC', roc_suffix]), 'png', ci=ci, fpr=fpr, hue=hue) + #%% Plot categorical scatter plot for episurvey -- FIGURE 5 + if not cat_param_df.empty: + # ms_tidy_df is created to create a tidy, concise, and contained dataframe for cat plotting (Fig 5a) + ms_tidy_df = df_norm_sub[ + ['antigen', 'OD', 'visit value', 'serum cat', 'serum type', 'serum ID', 'serum dilution']] + ms_tidy_df.drop(ms_tidy_df.loc[ms_tidy_df['serum dilution'] > 0.00005].index, inplace=True) + ms_tidy_df.drop(ms_tidy_df.loc[(ms_tidy_df['serum cat'] != '[\'COVID-Vax+\']') + & (ms_tidy_df['serum cat'] != '[\'COVID+Vax+\']') + & (ms_tidy_df['serum cat'] != '[\'COVID+Vax-\']')].index, inplace=True) + + group = ms_tidy_df.groupby( + 'serum type') # group data by patient ID and transfrom 'visit value' numbers into categorical values + df2 = group.apply(lambda x: x['visit value'].unique()) + for element in df2.index: + if (len(df2[element]) > 1): + if (df2[element][0][2:-2] > df2[element][1][2:-2]): + ms_tidy_df.loc[ + ms_tidy_df['visit value'] == df2[element][0], 'vaccine availability'] = 'post-vax' + ms_tidy_df.loc[ms_tidy_df['visit value'] == df2[element][1], 'vaccine availability'] = 'pre-vax' + if (df2[element][0][2:-2] < df2[element][1][2:-2]): + ms_tidy_df.loc[ms_tidy_df['visit value'] == df2[element][0], 'vaccine availability'] = 'pre-vax' + ms_tidy_df.loc[ + ms_tidy_df['visit value'] == df2[element][1], 'vaccine availability'] = 'post-vax' + # else: + # print(element) + + # delta_df serves a similar purpose to ms_tidy_df but contains the data for Fig 5b + prevaxdf = ms_tidy_df.loc[ms_tidy_df['vaccine availability'] == 'pre-vax'] + prevaxdf['old OD'] = prevaxdf['OD'] + # prevaxdf.set_index(['serum type', 'antigen'], inplace=True) + postvaxdf = ms_tidy_df.loc[ms_tidy_df['vaccine availability'] == 'post-vax'] + postvaxdf['new OD'] = postvaxdf['OD'] + # prevaxdf.set_index(['serum type', 'antigen'], inplace=True) + delta_df = pd.merge(prevaxdf, postvaxdf, how='inner', on=['antigen', 'serum type']) + delta_df['delta OD'] = delta_df['new OD'] - delta_df['old OD'] + # extract params from cat_param_df + sera_cat_list = cat_param_df['serum ID'] + slice_action = cat_param_df['serum ID action'] + split_subplots_by = cat_param_df['split subplots by'] + hue = cat_param_df['hue'] + # plot specific slicing + # cat_df = slice_df(df_norm_sub, slice_action, 'serum ID', sera_cat_list) #serum ID --> antigen + ms_tidy_df.loc[ms_tidy_df['serum cat'] == '[\'COVID-Vax+\']', 'vaccine availability'] = 'post-vax' + cat_df = slice_df(ms_tidy_df, slice_action, 'serum ID', sera_cat_list) + assert not cat_df.empty, 'Plotting dataframe is empty. Please check the plotting keys' + sns.set_context("talk") + + # FIGURE 5A + fig_palette = ["#ee2cef", "#21e020", "#248ff9", "#e7e5e6"] + g = sns.catplot(x="vaccine availability", y="OD", hue=hue, col=split_subplots_by, kind="swarm", + legend=False, + palette=fig_palette, order=["pre-vax", "post-vax"], dodge=True, data=cat_df, col_wrap=3) + + od_medians = cat_df.groupby(['antigen', 'serum cat', 'vaccine availability'])['OD'].median() + median_palette = ["#a318a4", "#18a418", "#185ea4"] + # adding median lines using boxplot functionality: + g.map_dataframe(sns.boxplot, x="vaccine availability", y="OD", hue=hue, + # medianprops={'color': 'k', 'ls': '-', 'lw': 3}, + whiskerprops={'visible': False}, + showfliers=False, showbox=False, showcaps=False, zorder=10, order=["pre-vax", "post-vax"], + palette=median_palette, + dodge=0.55) # change hue + # specifying colors of median lines + for ax in g.axes.flat: + for line in ax.get_lines()[2:10:2]: + line.set_color(median_palette[0]) + for line in ax.get_lines()[10::2]: + line.set_color(median_palette[2]) + for line in ax.get_lines()[1::2]: + line.set_color(median_palette[1]) + # augmenting legend to make text more readable + leg = plt.legend(bbox_to_anchor=(1.02, 0.5), loc='center left', borderaxespad=0, handleheight=0.05, + edgecolor="#000000") + leg.get_texts()[0].set_text( + f'COVID+/Vax+ median (pre-vax -- N: {od_medians[1]:.2f}, RBD: {od_medians[6]:.2f}, Spike: {od_medians[11]:.2f};' + f' post-vax -- N: {od_medians[0]:.2f}, RBD: {od_medians[5]:.2f}, Spike: {od_medians[10]:.2f})') + leg.get_texts()[1].set_text( + f'COVID+/Vax- median (pre-vax -- N: {od_medians[3]:.2f}, RBD: {od_medians[8]:.2f}, Spike: {od_medians[13]:.2f};' + f' post-vax -- N: {od_medians[2]:.2f}, RBD: {od_medians[7]:.2f}, Spike: {od_medians[12]:.2f})') + leg.get_texts()[2].set_text( + f'COVID-/Vax+ median (pre-vax not available; post-vax -- N: {od_medians[4]:.2f}, RBD: {od_medians[9]:.2f},' + f' Spike: {od_medians[14]:.2f})') + leg.get_texts()[3].set_text('COVID+/Vax+') + leg.get_texts()[4].set_text('COVID+/Vax-') + leg.get_texts()[5].set_text('COVID-/Vax+') + # augment axis & tick labels + g.set_axis_labels("vaccine availability", "OD") + g.set_xticklabels(rotation=0, horizontalalignment='center') + # save + plt.savefig(os.path.join(constants.RUN_PATH, 'catplot_{}.svg'.format(split_suffix)), + dpi=300, bbox_inches='tight') + if cat_param_df['zoom']: + g.set(ylim=(-0.05, 0.4)) + plt.savefig(os.path.join(constants.RUN_PATH, 'catplot_zoom_{}.svg'.format(split_suffix)), + dpi=300, bbox_inches='tight') + + # FIGURE 5B -- PLOTTING DELTA OD + # future fixes can involve making the plot fxns for 5a and 5b less redundant + hue = 'serum cat_x' + g = sns.catplot(x="serum cat_x", y="delta OD", hue=hue, col=split_subplots_by, kind="swarm", + palette=fig_palette, + data=delta_df, col_wrap=3, legend=False) + g.map_dataframe(sns.boxplot, x="serum cat_x", y="delta OD", + # medianprops={'color': 'k', 'ls': '-', 'lw': 2}, + whiskerprops={'visible': False}, showfliers=False, hue=hue, + showbox=False, palette=median_palette, + showcaps=False, zorder=10, + dodge=False) + + dod_medians = delta_df.groupby(['antigen', 'serum cat_x'])['delta OD'].median() + + for ax in g.axes.flat: + for line in ax.get_lines()[::2]: + line.set_color(median_palette[0]) + for line in ax.get_lines()[1::2]: + line.set_color(median_palette[1]) + + leg = plt.legend(bbox_to_anchor=(1.02, 0.5), loc='center left', borderaxespad=0, handleheight=0.05, + edgecolor="#000000") # changing the edge color to #000000 did not work but the goal is transparency + leg.get_texts()[0].set_text( + f'COVID+/Vax+ median (N: {dod_medians[0]:.2f}, RBD: {dod_medians[2]:.2f}, Spike: {dod_medians[4]:.2f})') + leg.get_texts()[1].set_text( + f'COVID+/Vax- median (N: {dod_medians[1]:.2f}, RBD: {dod_medians[3]:.2f}, Spike: {dod_medians[5]:.2f})') + leg.get_texts()[2].set_text('COVID+/Vax+') + leg.get_texts()[3].set_text('COVID+/Vax-') + + g.set_axis_labels("cohort", "ΔOD") + ls = ['COVID+/Vax+', 'COVID+/Vax-'] # the same as hue_labels, a bit redundant + g.set_xticklabels(ls, rotation=0) + + plt.savefig(os.path.join(constants.RUN_PATH, 'catplot_deltaod_{}.svg'.format(split_suffix)), + dpi=300, bbox_inches='tight') + if cat_param_df['zoom']: + g.set(ylim=(-0.05, 0.4)) + plt.savefig(os.path.join(constants.RUN_PATH, 'catplot_od_zoom_{}.svg'.format(split_suffix)), + dpi=300, bbox_inches='tight') + #%% 4PL fit + if not fit_param_df.empty: + slice_action = fit_param_df['serum ID action'] + hue = fit_param_df['hue'] + dilution_df = slice_df(df_norm_sub, slice_action, 'serum ID', fit_param_df['serum ID']) + split_subplots_by = fit_param_df['split subplots by'] + #total_plots(dilution_df, constants.RUN_PATH, 'fit_{}'.format(split_suffix), 'png', hue=hue, + #zoom=fit_param_df['zoom'], split_subplots_by=split_subplots_by, col_wrap=2) + plt.close('all') + diff --git a/interpretation/report_reader.py b/multiSero/interpretation/report_reader.py similarity index 99% rename from interpretation/report_reader.py rename to multiSero/interpretation/report_reader.py index 7aa8e4c..2ba5563 100644 --- a/interpretation/report_reader.py +++ b/multiSero/interpretation/report_reader.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd import logging -import array_analyzer.extract.constants as constants +import multiSero.array_analyzer.extract.constants as constants def antigen2D_to_df1D(xlsx_path, sheet, data_col): """ @@ -47,7 +47,7 @@ def read_plate_info(metadata_xlsx): 'serum cat', 'secondary ID', 'secondary dilution', - 'PRNT', + 'visit value', 'sample type'] plate_info_df = pd.DataFrame() # get sheet names that are available in metadata diff --git a/interpretation/train_classifier.py b/multiSero/interpretation/train_classifier.py similarity index 98% rename from interpretation/train_classifier.py rename to multiSero/interpretation/train_classifier.py index 39d6afd..4fe34f8 100644 --- a/interpretation/train_classifier.py +++ b/multiSero/interpretation/train_classifier.py @@ -22,9 +22,9 @@ import pandas as pd import unicodedata import xgboost as xgb -import array_analyzer.utils.io_utils as io_utils -from interpretation.plotting import get_roc_df, roc_plot_grid, thr_plot_grid -from interpretation.report_reader import slice_df, normalize_od, offset_od +import multiSero.array_analyzer.utils.io_utils as io_utils +from multiSero.plotting.plotting import get_roc_df, roc_plot_grid, thr_plot_grid +from multiSero.interpretation.report_reader import slice_df, normalize_od, offset_od from sklearn import metrics from sklearn.model_selection import GridSearchCV, GroupKFold, GroupShuffleSplit from sklearn.linear_model import LogisticRegressionCV diff --git a/multisero.py b/multiSero/multisero.py similarity index 93% rename from multisero.py rename to multiSero/multisero.py index 12787a4..a35f8cd 100644 --- a/multisero.py +++ b/multiSero/multisero.py @@ -2,12 +2,12 @@ import logging import os -import array_analyzer.extract.constants as constants -import array_analyzer.utils.io_utils as io_utils -import array_analyzer.workflows.registration_workflow as registration_wf -import array_analyzer.workflows.interpolation_wf as interpolation_wf -import array_analyzer.workflows.well_wf as well_wf -import interpretation.od_analyzer as od_analyzer +import multiSero.array_analyzer.extract.constants as constants +import multiSero.array_analyzer.utils.io_utils as io_utils +import multiSero.array_analyzer.workflows.registration_workflow as registration_wf +from multiSero import array_analyzer as interpolation_wf +import multiSero.array_analyzer.workflows.well_wf as well_wf +import multiSero.interpretation.od_analyzer as od_analyzer import matplotlib matplotlib.use('Agg') diff --git a/multiSero/plotting/__init__.py b/multiSero/plotting/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/interpretation/plotting.py b/multiSero/plotting/plotting.py similarity index 100% rename from interpretation/plotting.py rename to multiSero/plotting/plotting.py diff --git a/tests/extract/test_metadata.py b/tests/extract/test_metadata.py index 96dd95d..9559ff2 100644 --- a/tests/extract/test_metadata.py +++ b/tests/extract/test_metadata.py @@ -1,7 +1,7 @@ import pytest -from array_analyzer.extract.metadata import MetaData -import array_analyzer.extract.constants as constants +from multiSero.array_analyzer.extract.metadata import MetaData +import multiSero.array_analyzer.extract.constants as constants """ Tests to add: diff --git a/tests/load/report_test.py b/tests/load/report_test.py index c47bf79..d6d50a6 100644 --- a/tests/load/report_test.py +++ b/tests/load/report_test.py @@ -4,8 +4,8 @@ import pandas as pd import pytest -import array_analyzer.extract.constants as constants -import array_analyzer.load.report as report +import multiSero.array_analyzer.extract.constants as constants +import multiSero.array_analyzer.load.report as report @pytest.fixture diff --git a/tests/pysero_test.py b/tests/pysero_test.py index f223525..a720524 100644 --- a/tests/pysero_test.py +++ b/tests/pysero_test.py @@ -3,7 +3,7 @@ import pytest from unittest.mock import patch -import multisero as multisero +from multiSero import multisero as multisero def test_parse_args(): diff --git a/tests/test_debug_plots.py b/tests/test_debug_plots.py index c4d6417..fedafa1 100644 --- a/tests/test_debug_plots.py +++ b/tests/test_debug_plots.py @@ -1,9 +1,3 @@ -import pytest - -import array_analyzer.load as load -import os - - def test_nonetype_integration(): pass diff --git a/tests/transform/point_registration_test.py b/tests/transform/point_registration_test.py index 0dd98b1..7d1426a 100644 --- a/tests/transform/point_registration_test.py +++ b/tests/transform/point_registration_test.py @@ -1,8 +1,8 @@ import numpy as np import pytest -import array_analyzer.extract.constants as constants -import array_analyzer.transform.point_registration as registration +import multiSero.array_analyzer.extract.constants as constants +import multiSero.array_analyzer.transform.point_registration as registration @pytest.fixture diff --git a/tests/utils/io_utils_test.py b/tests/utils/io_utils_test.py index 15a40ce..9e42b5e 100644 --- a/tests/utils/io_utils_test.py +++ b/tests/utils/io_utils_test.py @@ -1,7 +1,7 @@ import numpy as np import os import pytest -import array_analyzer.utils.io_utils as io_utils +import multiSero.array_analyzer.utils.io_utils as io_utils def test_read_gray_im(image_dir): diff --git a/tests/utils/spot_regionprop_test.py b/tests/utils/spot_regionprop_test.py index 667f401..5e4abe1 100644 --- a/tests/utils/spot_regionprop_test.py +++ b/tests/utils/spot_regionprop_test.py @@ -1,7 +1,7 @@ import numpy as np import pytest -import array_analyzer.utils.spot_regionprop as regionprop +import multiSero.array_analyzer.utils.spot_regionprop as regionprop @pytest.fixture