diff --git a/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/PC_vs_computed_features.py b/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/PC_vs_computed_features.py index 0e7e0dc3..2428aa2c 100644 --- a/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/PC_vs_computed_features.py +++ b/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/PC_vs_computed_features.py @@ -4,158 +4,102 @@ """ # %% +from anndata import read_zarr from pathlib import Path import matplotlib.pyplot as plt import seaborn as sns -from compute_pca_features import compute_correlation_and_save_png, compute_features - -# %% for sensor features - -features_path = Path( - "/hpc/projects/comp.micro/infected_cell_imaging/Single_cell_phenotyping/ContrastiveLearning/trainng_logs/SEC61/rev6_NTXent_sensorPhase_infection/2chan_160patch_94ckpt_rev6_2.zarr" -) -data_path = Path( - "/hpc/projects/intracellular_dashboard/organelle_dynamics/2024_02_04_A549_DENV_ZIKV_timelapse/8-train-test-split/registered_test.zarr" -) -tracks_path = Path( - "/hpc/projects/intracellular_dashboard/organelle_dynamics/2024_02_04_A549_DENV_ZIKV_timelapse/8-train-test-split/track_test.zarr" -) - -source_channel = ["Phase3D", "RFP"] -seg_channel = ["Nuclei_prediction_labels"] -z_range = (28, 43) -fov_list = ["/A/3", "/B/3", "/B/4"] - -features_sensor = compute_features( - features_path, - data_path, - tracks_path, - source_channel, - seg_channel, - z_range, - fov_list, -) - -features_sensor.to_csv( - "/hpc/projects/comp.micro/infected_cell_imaging/Single_cell_phenotyping/ContrastiveLearning/Figure_panels/cell_division/features_allset_sensor.csv", - index=False, -) - -# features_sensor = pd.read_csv("/hpc/projects/comp.micro/infected_cell_imaging/Single_cell_phenotyping/ContrastiveLearning/Figure_panels/cell_division/features_allset_sensor.csv") - -# take a subset without the 768 features -feature_columns = [f"feature_{i + 1}" for i in range(768)] -features_subset_sensor = features_sensor.drop(columns=feature_columns) -correlation_sensor = compute_correlation_and_save_png( - features_subset_sensor, - "/hpc/projects/comp.micro/infected_cell_imaging/Single_cell_phenotyping/ContrastiveLearning/Figure_panels/cell_division/PC_vs_CF_2chan_pca_sensor_allset.svg", -) - -# %% plot PCA vs set of computed features for sensor features - -set_features = [ - "Fluor Radial Intensity Gradient", - "Phase Interquartile Range", - "Perimeter area ratio", - "Fluor Interquartile Range", - "Phase Entropy", - "Fluor Zernike Moment Mean", -] - -plt.figure(figsize=(10, 8)) -sns.heatmap( - correlation_sensor.loc[set_features, "PCA1":"PCA6"], - annot=True, - cmap="coolwarm", - fmt=".2f", - annot_kws={"size": 24}, - vmin=-1, - vmax=1, -) -plt.xlabel("Computed Features", fontsize=24) -plt.ylabel("PCA Features", fontsize=24) -plt.xticks(fontsize=24) # Increase x-axis tick labels -plt.yticks(fontsize=24) # Increase y-axis tick labels - -plt.savefig( - "/hpc/projects/comp.micro/infected_cell_imaging/Single_cell_phenotyping/ContrastiveLearning/Figure_panels/cell_division/PC_vs_CF_2chan_pca_allset_sensor_6features.svg" -) - -# plot the PCA1 vs PCA2 map for sensor features - -plt.figure(figsize=(10, 10)) -sns.scatterplot( - x="PCA1", - y="PCA2", - data=features_sensor, -) - - -# .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. .-.-. -# / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ / / \ \ -# '-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-'-' '-' - +import pandas as pd +from compute_pca_features import compute_correlation_and_save_png # %% for organelle features -features_path = Path( - "/hpc/projects/intracellular_dashboard/organelle_dynamics/2024_11_07_A549_SEC61_ZIKV_DENV/4-phenotyping/predictions/Soorya/timeAware_2chan_ntxent_192patch_91ckpt_rev7_GT.zarr" -) -data_path = Path( - "/hpc/projects/intracellular_dashboard/organelle_dynamics/2024_11_07_A549_SEC61_ZIKV_DENV/2-assemble/2024_11_07_A549_SEC61_ZIKV_DENV.zarr" -) -tracks_path = Path( - "/hpc/projects/intracellular_dashboard/organelle_dynamics/2024_11_07_A549_SEC61_ZIKV_DENV/1-preprocess/label-free/4-track-gt/2024_11_07_A549_SEC61_ZIKV_DENV_2_cropped.zarr" -) - -source_channel = ["Phase3D", "raw GFP EX488 EM525-45"] -seg_channel = ["nuclei_prediction_labels_labels"] -z_range = (16, 21) -normalizations = None -fov_list = ["/B/2/000000", "/B/3/000000", "/C/2/000000"] - -features_organelle = compute_features( - features_path, - data_path, - tracks_path, - source_channel, - seg_channel, - z_range, - fov_list, +def convert_to_dataframe(embeddings_dataset): + features_df = embeddings_dataset.obs + embedding_features = embeddings_dataset.X + embedding_features_df = pd.DataFrame(embedding_features, columns=[f"feature_{i+1}" for i in range(embedding_features.shape[1])], index=features_df.index) + features_df = pd.concat([features_df, embedding_features_df], axis=1) + return features_df + +def match_computed_features_to_embeddings( + embeddings_path: Path, + computed_features_path: Path, + wells: list, +): + embeddings_dataset = read_zarr(embeddings_path) + computed_features = pd.read_csv(computed_features_path) + + # Extract well from fov_name with better handling + computed_features_split = computed_features["fov_name"].str.split("/") + computed_features["well"] = computed_features_split.str[1].fillna("") + "/" + computed_features_split.str[2].fillna("") + + features_df = convert_to_dataframe(embeddings_dataset) + + # Extract well from fov_name with better handling + features_split = features_df["fov_name"].str.split("/") + features_df["well"] = features_split.str[0].fillna("") + "/" + features_split.str[1].fillna("") + + features_df_filtered = features_df[features_df["well"].isin(wells)].copy() + computed_features_df = computed_features[computed_features["well"].isin(wells)].copy() + + # Prepare columns for merge - strip '/' from fov_name in both dataframes + computed_features_df["fov_name_clean"] = computed_features_df["fov_name"].str.strip('/') + features_df_filtered["fov_name_clean"] = features_df_filtered["fov_name"].astype(str).str.strip('/') + + # Rename columns to avoid conflicts during merge + # Rename 't' in features_df_filtered to 'time_point' to match computed_features + features_df_filtered = features_df_filtered.rename(columns={"t": "time_point"}) + + # Ensure merge keys have compatible types - use float for track_id + computed_features_df["track_id"] = pd.to_numeric(computed_features_df["track_id"], errors='coerce') + features_df_filtered["track_id"] = pd.to_numeric(features_df_filtered["track_id"], errors='coerce') + computed_features_df["time_point"] = pd.to_numeric(computed_features_df["time_point"], errors='coerce') + features_df_filtered["time_point"] = pd.to_numeric(features_df_filtered["time_point"], errors='coerce') + + # This performs a vectorized join operation + merged_df = pd.merge( + computed_features_df, + features_df_filtered, + on=["fov_name_clean", "track_id", "time_point"], + how="inner", + suffixes=("_computed", "_embedding") + ) + + # Drop the temporary fov_name_clean column + merged_df = merged_df.drop(columns=["fov_name_computed"]) + + return merged_df + +embeddings_path = Path( + "/hpc/projects/intracellular_dashboard/organelle_dynamics/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV/4-phenotyping/predictions/organelle_160patch_104ckpt_ver3max.zarr" +) +computed_features_path = Path( + "/hpc/projects/intracellular_dashboard/organelle_dynamics/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV/4-phenotyping/predictions/quantify_remodeling/G3BP1/feature_list_G3BP1_2025_07_22_160patch.csv" +) + +wells = ["C/2"] +computed_features_df = match_computed_features_to_embeddings( +embeddings_path, +computed_features_path, +wells, ) # Save the features dataframe to a CSV file -features_organelle.to_csv( - "/hpc/projects/comp.micro/infected_cell_imaging/Single_cell_phenotyping/ContrastiveLearning/Figure_panels/cell_division/features_twoChan_organelle_multiwell.csv", +computed_features_df.to_csv( + "/hpc/projects/intracellular_dashboard/organelle_dynamics/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV/4-phenotyping/predictions/quantify_remodeling/G3BP1/features_organelle_wellC2_160patch.csv", index=False, ) + +# %% compute the correlation between PCA and computed features and plot + +computed_features_df = pd.read_csv("/hpc/projects/intracellular_dashboard/organelle_dynamics/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV/4-phenotyping/predictions/quantify_remodeling/G3BP1/features_organelle_wellC2_160patch.csv") correlation_organelle = compute_correlation_and_save_png( - features_organelle, - "/hpc/projects/comp.micro/infected_cell_imaging/Single_cell_phenotyping/ContrastiveLearning/Figure_panels/cell_division/PC_vs_CF_2chan_pca_organelle_multiwell.svg", + computed_features_df, + "/hpc/projects/intracellular_dashboard/organelle_dynamics/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV/4-phenotyping/predictions/quantify_remodeling/G3BP1/PC_vs_CF_organelle_wellC2_160patch.svg", ) # features_organelle = pd.read_csv("/hpc/projects/comp.micro/infected_cell_imaging/Single_cell_phenotyping/ContrastiveLearning/Figure_panels/cell_division/features_twoChan_organelle_multiwell_refinedPCA.csv") -# %% plot PCA vs set of computed features for organelle features - -plt.figure(figsize=(10, 8)) -sns.heatmap( - correlation_organelle.loc[set_features, "PCA1":"PCA6"], - annot=True, - cmap="coolwarm", - fmt=".2f", - annot_kws={"size": 24}, - vmin=-1, - vmax=1, -) -plt.xlabel("Computed Features", fontsize=24) -plt.ylabel("PCA Features", fontsize=24) -plt.xticks(fontsize=24) # Increase x-axis tick labels -plt.yticks(fontsize=24) # Increase y-axis tick labels -plt.savefig( - "/hpc/projects/comp.micro/infected_cell_imaging/Single_cell_phenotyping/ContrastiveLearning/Figure_panels/cell_division/PC_vs_CF_2chan_pca_setfeatures_organelle_6features.svg" -) # %% diff --git a/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/compute_image_features.py b/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/compute_image_features.py new file mode 100644 index 00000000..640f6e5a --- /dev/null +++ b/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/compute_image_features.py @@ -0,0 +1,171 @@ +# %% code for organelle and nuclear segmentation and feature extraction + +from iohub import open_ome_zarr +import numpy as np +# from iohub.ngff.utils import create_empty_plate +from skimage.filters import gabor +from skimage.feature import graycomatrix, graycoprops, local_binary_pattern, hog, canny +import pywt +from skimage.transform import resize +from skimage.util import img_as_float +from scipy.ndimage import convolve +import pandas as pd +from matplotlib import pyplot as plt +from skimage.measure import regionprops +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler +from sklearn.cluster import KMeans +import os +import seaborn as sns +from scipy.ndimage import label + +# %% + +input_path = '/hpc/projects/intracellular_dashboard/organelle_dynamics/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV/4-phenotyping/train-test/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV.zarr' +input_zarr = open_ome_zarr(input_path, mode='r+', layout='hcs') +in_chans = input_zarr.channel_names +Organelle_chan = 'raw GFP EX488 EM525-45' + +org_seg_path = '/hpc/projects/intracellular_dashboard/organelle_dynamics/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV/4-phenotyping/predictions/quantify_remodeling/G3BP1/segmentation_G3BP1_puncta.zarr' +seg_zarr = open_ome_zarr(org_seg_path, mode='r+', layout='hcs') +seg_chans = seg_zarr.channel_names +Organelle_seg = 'Organelle_mask' + +nucl_seg_path = '/hpc/projects/intracellular_dashboard/organelle_dynamics/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV/1-preprocess/label-free/3-track/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV_cropped.zarr' +nucl_seg_zarr = open_ome_zarr(nucl_seg_path, mode='r+', layout='hcs') +nucl_seg_chans = nucl_seg_zarr.channel_names +Nucl_seg = 'nuclei_prediction_labels_labels' + +# %% library of feature extraction methods + +def get_patch(data, cell_centroid, patch_size): + + x_centroid, y_centroid = cell_centroid + # ensure patch boundaries stay within data dimensions + x_start = max(0, x_centroid - patch_size//2) + x_end = min(data.shape[1], x_centroid + patch_size//2) + y_start = max(0, y_centroid - patch_size//2) + y_end = min(data.shape[0], y_centroid + patch_size//2) + + # get patch of patch_size centered on centroid + patch = data[int(y_start):int(y_end),int(x_start):int(x_end)] + return patch + +def compute_glycomatrix(data): + + # convert to uint8 for GLCM computation + norm_data = ((data-np.min(data))/(np.max(data)-np.min(data)) * 255).astype(np.uint8) + glcm = graycomatrix(norm_data, distances=[1, 2, 3], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4], levels=256, symmetric=True, normed=True) + contrast = np.mean(graycoprops(glcm, 'contrast')) + homogeneity = np.mean(graycoprops(glcm, 'homogeneity')) + energy = np.mean(graycoprops(glcm, 'energy')) + correlation = np.mean(graycoprops(glcm, 'correlation')) + return contrast, homogeneity, energy, correlation + +def compute_edge_density(data): + + # Convert to float for skimage.canny + norm_data = ((data-np.min(data))/(np.max(data)-np.min(data)) * 255).astype(np.uint8) + norm_data_float = img_as_float(norm_data) + + # Apply Canny edge detection + edges = canny(norm_data_float, sigma=1.0) # sigma controls the scale of detection + + # Edge density: ratio of edge pixels to total pixels + edge_density = np.sum(edges) / edges.size + + return edge_density + + +def compute_organelle_frac_int(org_data, org_mask): + org_cell_mask = org_mask + org_masked = org_data * org_mask + organelle_volume = np.sum(org_cell_mask) + organelle_intensity = np.mean(org_masked) + # change the mask to instance level segmentation mask using connected components + org_mask_labeled, num_organelles = label(org_cell_mask) + # average of size of each organelle + size_organelle = [] + for i in range(1, num_organelles + 1): + size_organelle.append(np.sum(org_mask_labeled == i)) + size_organelle = np.mean(size_organelle) + return organelle_volume, organelle_intensity, num_organelles, size_organelle + +# %% + +feature_list = pd.DataFrame() +patch_size = 192 + +for well_id, well_data in seg_zarr.wells(): +# well_id, well_data = next(seg_zarr.wells()) + well_name, well_no = well_id.split('/') + if well_name == 'C': + for pos_id, pos_data in well_data.positions(): + # pos_id, pos_data = next(well_data.positions()) + T, C, Z, Y, X = pos_data.data.shape + seg_data = pos_data.data.numpy() + org_seg_mask = seg_data[:,seg_chans.index(Organelle_seg)] + + # read the csv stored in each nucl seg zarr folder + file_name = 'tracks_' + well_name + '_' + well_no + '_' + pos_id + '.csv' + nucl_seg_csv = os.path.join(nucl_seg_path, well_id+'/'+pos_id+'/'+file_name) + nucl_seg_df = pd.read_csv(nucl_seg_csv) + + in_data = input_zarr[well_id+'/'+pos_id].data.numpy() + organelle_data = in_data[:,in_chans.index(Organelle_chan)] + + # Initialize an empty list to store values from each row of the csv + position_features = [] + for idx,row in nucl_seg_df.iterrows(): + row = nucl_seg_df.iloc[idx] + + if row['x'] > patch_size//2 and row['y'] > patch_size//2 and row['x'] < X - patch_size//2 and row['y'] < Y - patch_size//2: + cell_centroid = row['x'], row['y'] + + timepoint = row['t'] + organelle_patch = get_patch(organelle_data[int(timepoint),0], cell_centroid, patch_size) + org_seg_patch = get_patch(org_seg_mask[int(timepoint),0], cell_centroid, patch_size) + + # compute features from image patches around nuclear centroid + contrast, homogeneity, energy, correlation = compute_glycomatrix(organelle_patch) + edge_density = compute_edge_density(organelle_patch) + + # compute organelle morphology features using organelle segmentation + organelle_volume, organelle_intensity, no_organelles, size_organelle = compute_organelle_frac_int(organelle_patch, org_seg_patch) + + # Create a dictionary of features for this cell + cell_features = { + 'fov_name': '/'+well_id+'/'+pos_id, + 'track_id': row['track_id'], + 'time_point': timepoint, + 'x': row['x'], + 'y': row['y'], + 'contrast': contrast, + 'homogeneity': homogeneity, + 'energy': energy, + 'correlation': correlation, + 'edge_density': edge_density, + 'organelle_volume': organelle_volume, + 'organelle_intensity': organelle_intensity, + 'no_organelles': no_organelles, + 'size_organelle': size_organelle, + } + position_features.append(cell_features) + + # After processing all cells in this position, write to CSV + if position_features: + # Convert the list of dictionaries to a DataFrame + position_df = pd.DataFrame(position_features) + + # Define the output file path + output_file = '/hpc/projects/intracellular_dashboard/organelle_dynamics/2025_07_22_A549_SEC61_TOMM20_G3BP1_ZIKV/4-phenotyping/predictions/quantify_remodeling/G3BP1/feature_list_G3BP1_2025_07_22_192patch.csv' + + # Write to CSV - append if file exists, create new if it doesn't + position_df.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False) + + # Clear the list for the next position + position_features = [] + + print(f"Processed position {pos_id} in well {well_id}") + +# %% diff --git a/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/compute_pca_features.py b/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/compute_pca_features.py index eca2adaf..d630499d 100644 --- a/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/compute_pca_features.py +++ b/applications/contrastive_phenotyping/evaluation/pc_vs_computed_features/compute_pca_features.py @@ -338,27 +338,37 @@ def compute_correlation_and_save_png(features: pd.DataFrame, filename: str): # remove the rows with missing values features = features.dropna() - # sub_features = features[features["Time"] == 20] - feature_df_removed = features.drop( - columns=["fov_name", "track_id", "t", "id", "parent_track_id", "parent_id"] - ) + # get the feature values (get the values under headers that start with "feature_") + feature_values = features.filter(like="feature_") + + # compute the PCA features + pca = PCA(n_components=10) + pca_features = pca.fit_transform(feature_values.values) + correlation_df = pd.DataFrame(pca_features, columns=[f"PCA{i+1}" for i in range(pca_features.shape[1])], index=features.index) + + # get the computed features like 'contrast', 'homogeneity', 'energy', 'correlation', 'edge_density', 'organelle_volume', 'organelle_intensity', 'no_organelles', 'size_organelles' + image_features_df = features.filter(regex="contrast|homogeneity|energy|correlation|edge_density|organelle_volume|organelle_intensity|no_organelles|size_organelles").copy() + + # Remove duplicate columns if any exist + correlation_df = pd.concat([correlation_df, image_features_df], axis=1) + correlation_df = correlation_df.loc[:, ~correlation_df.columns.duplicated()] # Compute correlation between PCA features and computed features - correlation = feature_df_removed.corr(method="spearman") + correlation = correlation_df.corr(method="spearman") + + # Select only PCA features (rows) and computed features (columns) + pca_columns = [col for col in correlation.columns if col.startswith('PCA')] + computed_columns = [col for col in correlation.columns if not col.startswith('PCA')] + + # Remove duplicates from computed_columns list + computed_columns = list(dict.fromkeys(computed_columns)) + + # Select the subset: PCA features as rows, computed features as columns + correlation_selected = correlation.loc[pca_columns, computed_columns] # display PCA correlation as a heatmap - - plt.figure(figsize=(30, 10)) - sns.heatmap( - correlation.drop( - columns=["PCA1", "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", "PCA7", "PCA8"] - ).loc["PCA1":"PCA8", :], - annot=True, - cmap="coolwarm", - fmt=".2f", - annot_kws={"size": 18}, - cbar=False, - ) + plt.figure(figsize=(10, 15)) + sns.heatmap(abs(correlation_selected), annot=True, cmap="coolwarm", fmt=".2f", annot_kws={"size": 18}, cbar=False) plt.title("Correlation between PCA features and computed features", fontsize=12) plt.xlabel("Computed Features", fontsize=18) plt.ylabel("PCA Features", fontsize=18) @@ -368,12 +378,29 @@ def compute_correlation_and_save_png(features: pd.DataFrame, filename: str): # Adjust layout to prevent label cutoff plt.tight_layout() - plt.savefig( - filename, - dpi=300, - bbox_inches="tight", - pad_inches=0.5, # Add padding around the figure - ) + # plt.savefig( + # filename, + # dpi=300, + # bbox_inches="tight", + # pad_inches=0.5, # Add padding around the figure + # format="svg", + # ) plt.close() + # display the correaltion map with image feature order: edge_density>correlation>energy>homogeneity>contrast>no_organelles>organelle_volume>organelle_intensity + # Select columns (features) in the desired order + feature_order = ["edge_density", "correlation", "energy", "homogeneity", "contrast", "no_organelles", "organelle_volume", "organelle_intensity"] + # Filter to only include features that actually exist in the dataframe + feature_order_filtered = [f for f in feature_order if f in correlation_selected.columns] + correlation_selected_sorted = correlation_selected[feature_order_filtered] + + plt.figure(figsize=(10, 7)) + sns.heatmap(np.transpose(correlation_selected_sorted), annot=True, cmap="coolwarm", fmt=".2f", annot_kws={"size": 18}, cbar=False) + plt.title("Correlation between PCA features and computed features", fontsize=12) + plt.ylabel("Computed Features", fontsize=18) + plt.xlabel("PCA Features", fontsize=18) + plt.xticks(fontsize=18, rotation=45, ha="right") # Rotate labels and align them + plt.yticks(fontsize=18) + plt.savefig(filename, dpi=300, bbox_inches="tight", pad_inches=0.5,format="svg") + plt.close() return correlation