diff --git a/BoolODE/__init__.py b/BoolODE/__init__.py index 70f71a4..a2bc9fc 100644 --- a/BoolODE/__init__.py +++ b/BoolODE/__init__.py @@ -36,12 +36,14 @@ def __init__(self, dimred_jobs, slingshot_jobs, gensample_jobs, + compare_ss_jobs, geneexpression_jobs) -> None: self.dropout_jobs = dropout_jobs self.dimred_jobs = dimred_jobs self.slingshot_jobs = slingshot_jobs self.gensample_jobs = gensample_jobs + self.compare_ss_jobs = compare_ss_jobs self.geneexpression_jobs = geneexpression_jobs class BoolODE(object): @@ -147,6 +149,7 @@ def do_post_processing(self): if self.post_settings.dropout_jobs\ or self.post_settings.dimred_jobs\ or self.post_settings.geneexpression_jobs\ + or self.post_settings.compare_ss_jobs\ or self.post_settings.slingshot_jobs: doOtherAnalysis = True generatedPaths = {} @@ -203,7 +206,37 @@ def do_post_processing(self): po.genDropouts(settings) if num_invalid == len(alljobs): break - + # compare_ss_jobs (compare steady states jobs) + if self.post_settings.compare_ss_jobs is not None: + print("Starting to compare steady states...") + for ss_comparison_job in self.post_settings.compare_ss_jobs: + for jobid in alljobs: + for gsampPath in generatedPaths[jobid]: + # outputPath = self.jobs[jobid]['outputPath'] + settings = {} + settings['outputPath'] = ss_comparison_job.get('outputPath') + settings['perform_PyBoolNet'] = ss_comparison_job.get('perform_PyBoolNet', False) + # To use genSamples generated ExpressionData for + # this specific post-processing test as ExpressionData + # settings['expressionDataFileLocation'] = Path(gsampPath,\ + # 'ExpressionData.csv') + + # Use BoolODE output (the one the user sees) as ExpressionData + settings['expressionDataFileLocation'] = str( + self.jobs[jobid]['outprefix']) + "/ExpressionData.csv" + + data_file_name = "" + for jobid2, joby in enumerate(self.job_settings.jobs): + data_file_name = str(Path(self.global_settings.model_dir, joby.get('model_definition', ''))) + parent_directory_path = Path(self.global_settings.model_dir) + parent_directory_path = parent_directory_path.parent.absolute() + + # Will need to know what model is being analyzed if + # it is running PyBoolNet + + settings['modelPath'] = str(parent_directory_path) + "/" + data_file_name + po.compare_ss(settings) + if self.post_settings.dimred_jobs is not None: print("Starting dimesionality reduction using tSNE") for dimred_jobs in self.post_settings.dimred_jobs: @@ -342,7 +375,8 @@ def __parse_postproc_settings(input_settings_map) -> GlobalSettings: slingshot_jobs = input_settings_map.get('Slingshot', None) dimred_jobs = input_settings_map.get('DimRed', None) gensample_jobs = input_settings_map.get('GenSamples', None) + compare_ss_jobs = input_settings_map.get('CompareSteadyStates', None) geneexpression_jobs = input_settings_map.get('GeneExpression', None) return PostProcSettings(dropout_jobs, dimred_jobs, slingshot_jobs, gensample_jobs, - geneexpression_jobs) + compare_ss_jobs, geneexpression_jobs) diff --git a/BoolODE/compare_steady_states.py b/BoolODE/compare_steady_states.py new file mode 100644 index 0000000..75c6d2c --- /dev/null +++ b/BoolODE/compare_steady_states.py @@ -0,0 +1,241 @@ +""" + This script DBSCAN clustering + on a provided ExpressionData.CSV file to estimate + the number of steady states for a particular boolean model. 
+ + @Author: Neel Patel (neelspatel999@vt.edu) + @Author: Madeline Shaklee (mshaklee@umassd.edu) + @Author: Amelia Whitehead (ameliafw@vt.edu) +""" + +def compare_ss(expression_data_location, output_file_path, compare_with_pyboolnet, model_path): + """ + Main caller of any steady state comparison techniques. + + @Params: expression_data_location: the full file path of ExpressionData + output_file_path: the file path where output files should go + compare_with_pyboolnet: True to also compute the steady states with PyBoolNet, False otherwise + model_path: the path of the model definition file. Needed for PyBoolNet to analyze steady states + """ + import os + # Perform DBSCAN, a clustering technique to approximate the number of steady states. + n_clusters_ = run_dbscan_clustering(expression_data_location, output_file_path) + + # (Optional) Perform PyBoolNet to calculate the steady states and report the number of steady states. + pyboolnet_result = -1 + if compare_with_pyboolnet: + # Note: run_pyboolnet still has initial conditions section to be added + pyboolnet_result = run_pyboolnet(False, model_path) + + # Only do these print statements if PyBoolNet was run. + if pyboolnet_result != -1: + if n_clusters_ == pyboolnet_result: + if n_clusters_ == 1: + print("\nDBSCAN and PyBoolNet results match and found " + str(n_clusters_) + " steady state\n") + else: + print("\nDBSCAN and PyBoolNet results match and found " + str(n_clusters_) + " steady states\n") + else: + print("\nDBSCAN and PyBoolNet outputs did not match. \nDBSCAN: " + str(n_clusters_) + ", PyBoolNet: " + str( + pyboolnet_result) + "\n") + output_file_path = os.path.join(output_file_path, "steady_state_comparison.txt") + outputfile = open(output_file_path, "w") + outputfile.write("DBSCAN Number of Steady State Estimation: " + str(n_clusters_) + "\n") + if pyboolnet_result != -1: + outputfile.write("PyBoolNet Number of Steady States: " + str(pyboolnet_result) + "\n") + outputfile.write("\n") + if n_clusters_ == pyboolnet_result: + if n_clusters_ == 1: + outputfile.write("DBSCAN and PyBoolNet outputs match and found " + str(n_clusters_) + " steady state") + else: + outputfile.write("DBSCAN and PyBoolNet outputs match and found " + str(n_clusters_) + " steady states") + else: + outputfile.write("DBSCAN and PyBoolNet outputs did not match.") + else: + outputfile.write("PyBoolNet was not run because it was not requested") + outputfile.close() + print("Steady state comparison results were written to steady_state_comparison.txt\n") + + + +def run_dbscan_clustering(expression_data_location, output_file_path): + """ + Performs DBSCAN on the ExpressionData file and generates a DBSCAN_ClusterIDs.csv file, which specifies + which cell belongs to what cluster, according to DBSCAN. All noise points are grouped together into + a separate cluster for visualization purposes.
+ + @Precondition: The number of samples (cells) needs to be greater than two + times the number of genes + + @Note: DBSCAN does not work well with datasets of varying density + + @Params: expression_data_location: the full file path of ExpressionData + output_file_path: the file path where output files should go + + @Citations: https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py + https://medium.com/@tarammullin/dbscan-parameter-estimation-ff8330e3a3bd + """ + import os + from sklearn.cluster import DBSCAN + from sklearn.neighbors import NearestNeighbors + from kneed import KneeLocator + import numpy as np + import pandas + + # ExpressionData is a matrix, where rows are genes and columns are cells + expression_df = pandas.read_csv(expression_data_location, sep=',', index_col=0) + + # Performs transpose for later clustering on the samples. + # We cluster the cell columns of ExpressionData + expression_data_transpose = np.array(expression_df.T.to_numpy().tolist()) + # ExpressionData is a matrix, where rows are genes and columns are cells + # expression_df = pandas.read_csv(expression_data_location, sep=',', index_col=0) + + # Performs transpose for later clustering on the samples. + # We cluster the cell columns of ExpressionData + # expression_data_transpose = np.array(data_frame.tolist()) + + # DBSCAN takes two parameters, MinSamples and Epsilon + # MinSamples is described now, and Epsilon will be described later + + # The MinSamples parameter is the fewest number of samples (cells) + # DBSCAN will put in a cluster. We assign MinSamples + # to two times the number of genes in ExpressionData + min_samples_num = len(expression_df.index) * 2 + + ############# Below is from Medium.com reference ############# + + # Compute nearest neighbors and distances + neighbors = NearestNeighbors(n_neighbors=min_samples_num) + + neighbors_fit = neighbors.fit(expression_data_transpose) + distances, indices = neighbors_fit.kneighbors(expression_data_transpose) + + # Sort distances in ascending order + distances = np.sort(distances, axis=0) + + ############# End of Medium.com reference ############# + + # Find the average k-distances then plot to find the knee, + # which is the point of maximum curvature + # y_values = compute_average_distances(distances) + ############################################################# + denominator_of_average = len(distances[0]) + # Will be of length number of samples (cells) in the + # ExpressionData, once full + + average_distances = [] + for distances_element in distances: + distance_sum = 0 + for distance_value in distances_element: + distance_sum = distance_sum + distance_value + average_distance = distance_sum / denominator_of_average + average_distances.append(average_distance) + y_values = average_distances + ############################################ + x_values = list(range(1, len(y_values) + 1)) + knee_locator = KneeLocator(x_values, y_values, S=1.0, curve='convex', direction='increasing') + maximum_curvature_position = round(knee_locator.knee, 20) + + # The Epsilon parameter of DBSCAN represents the upper bound (exclusive) + # distance between two points to be clustered together. 
+ epsilon = y_values[maximum_curvature_position - 1] + + ############# Below is from the scikit-learn reference ############# + + # Performs DBSCAN with the calculated parameters + db = DBSCAN(eps=epsilon, min_samples=min_samples_num).fit(expression_data_transpose) + + clusters_identifiers = db.fit_predict(expression_data_transpose) + core_samples_mask = np.zeros_like(db.labels_, dtype=bool) + core_samples_mask[db.core_sample_indices_] = True + labels = db.labels_ + + # Number of clusters in labels, ignoring noise if present. + n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) + n_noise_ = list(labels).count(-1) + + print("DBSCAN result:") + print('Estimated number of clusters: %d' % n_clusters_) + print('Estimated number of noise points: %d \n' % n_noise_) + + ############# End of scikit-learn reference ############# + + # Adding the cluster labels to the dataframe + expression_df.loc['cluster_labels', :] = clusters_identifiers + + # Write a new ClusterID file according to DBSCAN's clustering + cluster_values = expression_df.loc['cluster_labels'].tolist() + cluster_values = [int(number) for number in cluster_values] + cell_names = expression_df.columns.tolist() + dictionary = {'': cell_names, 'cl': cluster_values} + cluster_df = pandas.DataFrame(dictionary) + output_file_path = os.path.join(output_file_path, 'DBSCAN_ClusterIDs.csv') + cluster_df.to_csv(output_file_path, index=False) + + print("DBSCAN analysis generated a DBSCAN_ClusterIDs.csv file.") + return n_clusters_ + + +# Runs PyBoolNet +def run_pyboolnet(ics_present, model_path): + """ + Performs PyBoolNet to get the steady states. Information on PyBoolNet can be + found at https://pyboolnet.readthedocs.io/en/master/ + + @Params: ics_present: True if initial conditions are present, False otherwise + model_path: States the path of the model. Needed for PyBoolNet to analyze steady states + """ + #### Still has initial conditions section to be added + print("Starting PyBoolNet...") + + import PyBoolNet + import pandas as pd + import os + + input_file = open(model_path, "r") + temp_bnet_file = open(model_path + "_temporary_bnet.txt", "x") + for line in input_file: + # If it is not the first line + if "".join(line.split()).lower() != "generule": + # Convert the format and remove tabs. + py_formatted_line = line.replace('\t', ',', 1) + py_formatted_line = py_formatted_line.replace("and", "&") + py_formatted_line = py_formatted_line.replace("or", "|") + py_formatted_line = py_formatted_line.replace("not", "!") + py_formatted_line = py_formatted_line.replace('\t', ' ') + temp_bnet_file.write(py_formatted_line) + temp_bnet_file.write('\n') + + input_file.close() + temp_bnet_file.close() + + primes = PyBoolNet.FileExchange.bnet2primes(model_path + "_temporary_bnet.txt") + steady_states = PyBoolNet.AspSolver.steady_states(primes) + df = pd.DataFrame(steady_states) + + print("PyBoolNet found " + str(len(df)) + " steady state/s total") + + # Delete the temporary file. + os.remove(model_path + "_temporary_bnet.txt") + + # Initial conditions and reachability from a given initial condition is TBD + # We currently always set this to always false. 
Still need to work on this section + unreachable_steady_states = 0 + if ics_present: + current_goal_state = "" + ic = "1--00" + list_of_steady_states = df.values.tolist() + # Find the number of unreachable_steady_states + for steady_state in list_of_steady_states: + current_goal_state = ','.join(str(v) for v in steady_state) + current_goal_state = current_goal_state.replace(',', '') + current_goal_state = current_goal_state.strip() + path = PyBoolNet.StateTransitionGraphs.best_first_reachability(primes, InitialSpace=ic, + GoalSpace=current_goal_state) + if path: + # for x in path: + # print(x) + print("A path was found for steady state: " + current_goal_state) + else: + print("No path was found for steady state: " + current_goal_state) + + return len(df) - unreachable_steady_states diff --git a/BoolODE/post_processing.py b/BoolODE/post_processing.py index 7f61bb8..a163000 100644 --- a/BoolODE/post_processing.py +++ b/BoolODE/post_processing.py @@ -6,6 +6,8 @@ from sklearn.cluster import KMeans from sklearn.manifold import TSNE import matplotlib.pyplot as plt +from BoolODE import compare_steady_states + def genSamples(opts): """ @@ -14,7 +16,7 @@ def genSamples(opts): """ numclusters = opts['nClusters'] num_simulations = opts['num_cells'] - + if numclusters > 1: clusterdf = pd.read_csv(opts['outPrefix'] + '/ClusterIds.csv', index_col=0) @@ -22,8 +24,8 @@ def genSamples(opts): if opts['sample_size'] > num_simulations: print('sample_size should be less than num of experiments') sample_size = num_simulations - - df = pd.read_csv(opts['outPrefix'] + '/simulations/E0.csv',index_col=0) + + df = pd.read_csv(opts['outPrefix'] + '/simulations/E0.csv', index_col=0) maxtime = len(df.columns) generatedPaths = [] @@ -32,43 +34,43 @@ def genSamples(opts): # Beeline/inputs/DYN-LI-500-1/... 
outfpath = opts['outPrefix'] + '/' + opts['name'] + '-' + str(sample_size) + '-' + str(did) generatedPaths.append(outfpath) - + if not os.path.exists(outfpath): print(outfpath, "does not exist, creating it...") os.makedirs(outfpath) # Create cell ids simids = np.random.choice(range(num_simulations), size=sample_size, replace=False) - fids = ['E'+ str(sid) + '.csv' for sid in simids] - timepoints = np.random.choice(range(1,maxtime), size=sample_size) + fids = ['E' + str(sid) + '.csv' for sid in simids] + timepoints = np.random.choice(range(1, maxtime), size=sample_size) min_t = min(timepoints) max_t = max(timepoints) - pts = [(t - min_t)/(max_t - min_t) for t in timepoints] - cellids = ['E' + str(sid) + '_' + str(t) for sid, t in zip(simids, timepoints)] + pts = [(t - min_t) / (max_t - min_t) for t in timepoints] + cellids = ['E' + str(sid) + '_' + str(t) for sid, t in zip(simids, timepoints)] # Read simulations from input dataset #psetid # to build a sample sample = [] for fid, cid in tqdm(zip(fids, cellids)): - df = pd.read_csv( opts['outPrefix'] + '/simulations/' + fid, index_col=0) + df = pd.read_csv(opts['outPrefix'] + '/simulations/' + fid, index_col=0) df.sort_index(inplace=True) sample.append(df[cid].to_frame()) - sampledf = pd.concat(sample,axis=1) + sampledf = pd.concat(sample, axis=1) sampledf.to_csv(outfpath + '/ExpressionData.csv') ## Read refNetwork.csv refdf = pd.read_csv(opts['outPrefix'] + '/refNetwork.csv') - refdf.to_csv(outfpath + '/refNetwork.csv',index=False) + refdf.to_csv(outfpath + '/refNetwork.csv', index=False) if numclusters == 1: ptdf = pd.DataFrame(np.array(pts), - index=pd.Index(cellids),columns = ['PseudoTime']) + index=pd.Index(cellids), columns=['PseudoTime']) else: subsetcluster = clusterdf.loc[[cid.split('_')[0] for cid in cellids]] ptdf = pd.DataFrame(index=pd.Index(cellids), - columns = ['PseudoTime' + str(1+i) for i in range(numclusters)]) + columns=['PseudoTime' + str(1 + i) for i in range(numclusters)]) for (cid, row), pt in zip(ptdf.iterrows(), pts): - ptdf.loc[cid]['PseudoTime' +\ - str(int(subsetcluster.loc[cid.split('_')[0]]['cl']) + 1)] = round(pt,5) - ptdf.to_csv(outfpath + '/PseudoTime.csv',na_rep='NA') + ptdf.loc[cid]['PseudoTime' + \ + str(int(subsetcluster.loc[cid.split('_')[0]]['cl']) + 1)] = round(pt, 5) + ptdf.to_csv(outfpath + '/PseudoTime.csv', na_rep='NA') return generatedPaths - + def genDropouts(opts): """Induce drop out in the simulated scRNAseq datasets. 
@@ -86,30 +88,30 @@ def genDropouts(opts): refDF = pd.read_csv(opts['refNet'], index_col=0) ## Generate output path for the dropout datasets - path = opts['outPrefix'] #+str(ncells) - path += '-' + str(int(100*dropoutCutoffs))+ '-' + str(opts['drop_prob']) + path = opts['outPrefix'] # +str(ncells) + path += '-' + str(int(100 * dropoutCutoffs)) + '-' + str(opts['drop_prob']) if not os.path.exists(path): os.makedirs(path) - + # Sample cells DropOutDF = expDF.copy() - #DropOutDF = DropOutDF.sample(n=ncells, axis='columns') - #cellIDs = DropOutDF.columns - + # DropOutDF = DropOutDF.sample(n=ncells, axis='columns') + # cellIDs = DropOutDF.columns + # copy over PT and refNetwork files refDF.to_csv(path + '/refNetwork.csv') - PTDF.to_csv(path+'/PseudoTime.csv') - + PTDF.to_csv(path + '/PseudoTime.csv') + # Drop-out genes if they are less than the # percentile value @ "dc" with 50% chance if dropoutCutoffs != 0: - quantileExp = expDF.quantile(q = dropoutCutoffs, axis = 'columns') + quantileExp = expDF.quantile(q=dropoutCutoffs, axis='columns') for idx, row in tqdm(expDF.iterrows()): for col in expDF.columns: if row[col] < quantileExp.loc[idx]: cointoss = np.random.random() if cointoss < opts['drop_prob']: - DropOutDF.loc[idx,col] = 0.0 + DropOutDF.loc[idx, col] = 0.0 DropOutDF.to_csv(path + '/ExpressionData.csv') @@ -118,22 +120,23 @@ def doDimRed(opts): """ Carry out dimensionality reduction """ - ExpDF = pd.read_csv(opts['expr'],index_col=0, header = 0) - ptDF = pd.read_csv(opts['pseudo'],index_col=0, header = 0) + ExpDF = pd.read_csv(opts['expr'], index_col=0, header=0) + ptDF = pd.read_csv(opts['pseudo'], index_col=0, header=0) perplexity = opts['perplexity'] print(perplexity) print("Computing TSNE...") - DimRedRes = TSNE(n_components = 2, perplexity = perplexity).fit_transform(ExpDF.T) - - DimRedDF = pd.DataFrame(DimRedRes,columns=['dim1','dim2'], + DimRedRes = TSNE(n_components=2, perplexity=perplexity).fit_transform(ExpDF.T) + + DimRedDF = pd.DataFrame(DimRedRes, columns=['dim1', 'dim2'], index=pd.Index(list(ExpDF.columns))) - DimRedDF.loc[:,'pt'] = ptDF.min(axis='columns') - DimRedDF.to_csv(str(opts['expr'].parent) + '/tsne' + str(perplexity)+'.tsv', sep='\t') + DimRedDF.loc[:, 'pt'] = ptDF.min(axis='columns') + DimRedDF.to_csv(str(opts['expr'].parent) + '/tsne' + str(perplexity) + '.tsv', sep='\t') plt.figure() plt.scatter(DimRedDF.dim1, DimRedDF.dim2, c=DimRedDF.pt) - plt.savefig(str(opts['expr'].parent)+'/tsne' + str(perplexity) + '.png') + plt.savefig(str(opts['expr'].parent) + '/tsne' + str(perplexity) + '.png') plt.close() - + + def computeSSPT(opts): ''' *Author: Aditya Pratapa* @@ -146,20 +149,20 @@ def computeSSPT(opts): E.g., Bifurcating: k=3 (1 initial and 2 terminal) E.g., Trifurcating: k=4 (1 initial and 3 terminal) ''' - ExpDF = pd.read_csv(opts['expr'],index_col=0, header = 0) - ptDF = pd.read_csv(opts['pseudo'],index_col=0, header = 0) + ExpDF = pd.read_csv(opts['expr'], index_col=0, header=0) + ptDF = pd.read_csv(opts['pseudo'], index_col=0, header=0) nClust = opts['nClusters'] outPath = opts['outPrefix'] perplexity = opts['perplexity'] noEnd = opts['noEnd'] - + if nClust == 1: # Return simulation time as PseduoTime - ptDF.loc[ExpDF.columns].to_csv(outPath+"/PseudoTime.csv", columns =['Time']) - + ptDF.loc[ExpDF.columns].to_csv(outPath + "/PseudoTime.csv", columns=['Time']) + else: ### Compute PseudoTime ordering using slingshot - + # Step-1: Compute dimensionality reduction. This is done in doDimRed(). 
# Currently only does TSNE # TODO: Add PCA @@ -168,13 +171,12 @@ def computeSSPT(opts): if not Path(settings['expr'].parent / 'tsne.tsv').is_file(): # First compute tSNE if this hasn't been done doDimRed(opts) - DimRedDF = pd.read_csv(settings['expr'].parent / 'tsne.tsv',sep='\t') - + DimRedDF = pd.read_csv(settings['expr'].parent / 'tsne.tsv', sep='\t') + # Step-3: Compute kMeans clustering - DimRedDF.loc[:,'cl'] = KMeans(n_clusters = nClust).fit(ExpDF.T).labels_ + DimRedDF.loc[:, 'cl'] = KMeans(n_clusters=nClust).fit(ExpDF.T).labels_ # Identify starting cluster - # Step-4: Identify cells corresponding to initial and final states # Cells in initial states are identified from cluster with smallest # mean experimental time @@ -182,35 +184,35 @@ def computeSSPT(opts): startClust = DimRedDF.groupby('cl').mean()['pt'].idxmin() endClust = ','.join([str(ix) for ix in DimRedDF.groupby('cl').mean()['pt'].index if ix != startClust]) startClust = str(startClust) - + # Step-5: Create a temporary directory and write files necessary to run slingshot - os.makedirs(outPath, exist_ok = True) + os.makedirs(outPath, exist_ok=True) - DimRedDF.to_csv(outPath + '/rd.tsv', columns = ['dim1','dim2'],sep='\t') - DimRedDF.to_csv(outPath + '/cl.tsv', columns = ['cl'],sep='\t') + DimRedDF.to_csv(outPath + '/rd.tsv', columns=['dim1', 'dim2'], sep='\t') + DimRedDF.to_csv(outPath + '/cl.tsv', columns=['cl'], sep='\t') if noEnd: - cmdToRun= " ".join(["docker run --rm -v", str(Path.cwd()) + "/" +outPath +"/:/data/temp", - "slingshot:base /bin/sh -c \"Rscript data/run_slingshot.R", - "--input=/data/temp/rd.tsv --input-type=matrix", - "--cluster-labels=/data/temp/cl.tsv", - "--start-clus="+startClust+'\"']) + cmdToRun = " ".join(["docker run --rm -v", str(Path.cwd()) + "/" + outPath + "/:/data/temp", + "slingshot:base /bin/sh -c \"Rscript data/run_slingshot.R", + "--input=/data/temp/rd.tsv --input-type=matrix", + "--cluster-labels=/data/temp/cl.tsv", + "--start-clus=" + startClust + '\"']) else: - cmdToRun= " ".join(["docker run --rm -v", str(Path.cwd())+ "/" +outPath +"/:/data/temp", - "slingshot:base /bin/sh -c \"Rscript data/run_slingshot.R", - "--input=/data/temp/rd.tsv --input-type=matrix", - "--cluster-labels=/data/temp/cl.tsv", - "--start-clus="+startClust, "--end-clus="+endClust+'\"']) + cmdToRun = " ".join(["docker run --rm -v", str(Path.cwd()) + "/" + outPath + "/:/data/temp", + "slingshot:base /bin/sh -c \"Rscript data/run_slingshot.R", + "--input=/data/temp/rd.tsv --input-type=matrix", + "--cluster-labels=/data/temp/cl.tsv", + "--start-clus=" + startClust, "--end-clus=" + endClust + '\"']) print(cmdToRun) os.system(cmdToRun) # Do this only for the first file - tn = pd.read_csv(outPath+"/rd.tsv", - header = 0, index_col =None, sep='\t') + tn = pd.read_csv(outPath + "/rd.tsv", + header=0, index_col=None, sep='\t') - cl = pd.read_csv(outPath+"/cl.tsv", - header = 0, index_col =0, sep='\t') + cl = pd.read_csv(outPath + "/cl.tsv", + header=0, index_col=0, sep='\t') - curveFile = open(outPath+"/curves.csv","r") + curveFile = open(outPath + "/curves.csv", "r") curveLst = [] lneCnt = 0 for line in curveFile: @@ -218,60 +220,59 @@ def computeSSPT(opts): lneCnt += 1 tn['cl'] = cl.values - tn.columns = ['CellID','dim1','dim2','kMeans'] + tn.columns = ['CellID', 'dim1', 'dim2', 'kMeans'] tn.index = tn.CellID f, axes = plt.subplots(2, 2, figsize=(7.5, 7.5)) # Plot slingshot pseudotime # and original clusters - detPT = pd.read_csv(outPath+"/SlingshotPT.csv", - header = 0, index_col = 0) + detPT = pd.read_csv(outPath + 
"/SlingshotPT.csv", + header=0, index_col=0) print() colNames = detPT.columns for colName in colNames: # Select cells belonging to each pseudotime trajectory index = detPT[colName].index[detPT[colName].notnull()] - tn.loc[index,colName] = detPT.loc[index,colName] + tn.loc[index, colName] = detPT.loc[index, colName] - - sns.scatterplot(x='dim1',y='dim2', - data = tn.loc[index], hue = colName, - palette = "viridis", - ax = axes[1][0]) + sns.scatterplot(x='dim1', y='dim2', + data=tn.loc[index], hue=colName, + palette="viridis", + ax=axes[1][0]) plt.legend([]) for line in range(0, lneCnt, 2): - sns.lineplot(x= curveLst[line+1],y=curveLst[line], - color = "k", ax = axes[1][0]) + sns.lineplot(x=curveLst[line + 1], y=curveLst[line], + color="k", ax=axes[1][0]) - sns.scatterplot(x='dim1',y='dim2', - data = tn, hue = 'kMeans', - palette = "Set1", - ax = axes[1][1]) + sns.scatterplot(x='dim1', y='dim2', + data=tn, hue='kMeans', + palette="Set1", + ax=axes[1][1]) # Plot deterministic pseduotime # and original clusters - detPT = pd.read_csv(outPath+"/PseudoTime.csv", - header = 0, index_col = 0) + detPT = pd.read_csv(outPath + "/PseudoTime.csv", + header=0, index_col=0) colNames = detPT.columns for idx in range(len(colNames)): # Select cells belonging to each pseudotime trajectory colName = colNames[idx] index = detPT[colName].index[detPT[colName].notnull()] - tn.loc[index,'Original'] = int(idx) + tn.loc[index, 'Original'] = int(idx) tn['ExpTime'] = detPT.min(axis='columns') - sns.scatterplot(x='dim1',y='dim2', - data = tn, hue = 'ExpTime', - palette = "viridis", - ax = axes[0][0]) + sns.scatterplot(x='dim1', y='dim2', + data=tn, hue='ExpTime', + palette="viridis", + ax=axes[0][0]) - sns.scatterplot(x='dim1',y='dim2', - data = tn, hue = 'Original', - palette = "Set1", - ax = axes[0][1]) + sns.scatterplot(x='dim1', y='dim2', + data=tn, hue='Original', + palette="Set1", + ax=axes[0][1]) axes[0][0].get_legend().remove() axes[0][0].title.set_text('Experiment Time') @@ -286,8 +287,22 @@ def computeSSPT(opts): f.tight_layout() - tn.to_csv(outPath+"/Updated_rd.tsv", + tn.to_csv(outPath + "/Updated_rd.tsv", sep='\t') - plt.savefig(outPath+"/SlingshotOutput.png") - os.system("rm -rf temp/") - + plt.savefig(outPath + "/SlingshotOutput.png") + os.system("rm -rf temp/") + + +def compare_ss(opts): + """ + Performs steady state estimation on ExpressionData + with compare_steady_states.py, which + currently performs DBSCAN to determine steady states and can be + optionally compared with the steady states calculated by PyBoolNet as + specified by opts['perform_PyBoolNet'] + """ + expressionDataFileLocation = opts['expressionDataFileLocation'] + outputPath = opts['outputPath'] + perform_pyboolnet = opts['perform_PyBoolNet'] + modelPath = opts['modelPath'] + compare_steady_states.compare_ss(expressionDataFileLocation, outputPath, perform_pyboolnet, modelPath) diff --git a/README.md b/README.md index b7b200d..4c05446 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,19 @@ Git Repo for converting Boolean models to ODE models and performing stochastic s Find the documentation for BoolODE at [https://murali-group.github.io/Beeline/BoolODE.html](https://murali-group.github.io/Beeline/BoolODE.html). +## Installation + +1) Clone the repository to get a copy of all BoolODE files on your personal machine. 
+In the directory where you want the BoolODE files, use the following command: +`git clone https://github.com/Murali-group/BoolODE.git` + +2) Install the required software packages with the following command: +`pip install -r requirements.txt` + +3) If you want to perform the optional post_processing of comparing DBSCAN with PyBoolNet (perform_PyBoolNet: True), follow +the instructions to install the requirements for PyBoolNet here: +`https://pyboolnet.readthedocs.io/` + ## Usage `python boolode.py --config path/to/config.yaml` @@ -85,3 +98,86 @@ More more details, please see the supplementary material and Online Methods of t If you use BoolODE in your research, please cite: Aditya Pratapa, Amogh Jalihal, Jeffrey Law, Aditya Bharadwaj, and T M Murali. Benchmarking algorithms for gene regulatory network inference from single-cell transcriptomic data, Nature Methods, (2020). https://doi.org/10.1038/s41592-019-0690-6 + +# Optional clustering techniques + +## Perform_Clustering + +Performs clustering on an ExpressionData file with various clustering and visualization options. +Clustering is used to predict the number of steady states in the Boolean model expressed in the ExpressionData file. A brief explanation of the DBSCAN clustering technique is provided in the DBSCAN Description section below. + +## Usage + +## Perform DBSCAN and create a DBSCAN_ClusterIDs.csv file + +`python perform_clustering.py -f path/to/ExpressionData.csv -d` + +## Perform k-means and create an elbow_visualization.png file for clusters 2 to 11 + +`python perform_clustering.py -f path/to/ExpressionData.csv -e` + +## Perform k-means silhouette and create a silhouette_visualization.png file for clusters 2 to 5 +(Note: The default, without -u (upper bound) specified, is clusters 2 to 11) + +`python perform_clustering.py -f path/to/ExpressionData.csv -s -u 5` + +Note: any combination of -s, -e, and -d can be specified, for example: + +`python perform_clustering.py -f path/to/ExpressionData.csv -d -e -s` + + +## Inputs + +-f: The ExpressionData file path +Required +The ExpressionData file is expected to be in the same format as generated by BoolODE + +-d: DBSCAN clustering is requested +Not required + +-e: k-means elbow is requested +Not required + +-s: k-means silhouette is requested +Not required + +-u: upper bound for silhouette plots, which must be greater than 2 +Not required + + +## Outputs + +-d: DBSCAN clustering is requested +The DBSCAN clustering outputs a DBSCAN_ClusterIDs.csv file, where each cell has its own cluster assigned by DBSCAN. Note: noise points found by DBSCAN are grouped into a single cluster for visualization purposes. + +-e: k-means elbow is requested +The k-means elbow generates an elbow_visualization.png file with the elbow identified. + +-s: k-means silhouette is requested +The k-means silhouette generates a silhouette_visualization.png file for cluster counts from 2 up to the number specified by -u. If no upper bound is specified by -u, the default upper bound is 11. + +Note: All outputs are generated in the same directory as the provided ExpressionData.csv. + + +## OUTPUT NOTES + +## DBSCAN + +There is more work that needs to be done with DBSCAN, such as testing with lower simulation times and differing cell numbers. Additionally, DBSCAN is not suitable for datasets with varying density, and that is an analysis metric we want to include in the future (analyzing the density of the dataset and determining when DBSCAN is successful and what density works with particular models).
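As a quick illustration (this snippet is not part of the changeset, and the file path is a placeholder), the DBSCAN_ClusterIDs.csv described above can be read back with pandas to recover the estimated number of steady states; the 'cl' column is the cluster id written by perform_clustering.py, with noise (if any) appearing as the -1 cluster:

```python
# Count the clusters recorded in a DBSCAN_ClusterIDs.csv produced by perform_clustering.py.
import pandas as pd

ids = pd.read_csv("path/to/DBSCAN_ClusterIDs.csv")   # columns: cell name, 'cl'
print(ids['cl'].nunique(), "clusters estimated by DBSCAN (noise, if present, counts as one)")
```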
+ + ## ELBOW + +The elbow method has one additional metric that can be used if the visualizer call in k_means_elbow() is changed from metric='calinski_harabasz' to metric='distortion'. We have set it to calinski_harabasz because that has yielded closer results to the expected value on average for the particular models that we tested. + +## SILHOUETTE + +The silhouette_visualization.png file needs to be visually analyzed to determine the appropriate number of clusters. The correct estimated cluster number, according to Silhouette, corresponds to the plot that has the fewest negative coefficient values and the most uniform cluster thickness. + + +## Overview + +The perform_clustering script is designed to statistically analyze ExpressionData files generated by BoolODE, clustering the data to predict the number of steady states that exist in the Boolean model. It currently supports k-means elbow, k-means silhouette, and DBSCAN. + +## DBSCAN Description + +DBSCAN stands for Density-Based Spatial Clustering of Applications with Noise. It is useful for handling the data in ExpressionData files because of how it handles noise. DBSCAN works by looking at a point and a specified number of its neighbors at a certain distance to determine if they are similar enough to be grouped together in a cluster. In the end, DBSCAN outputs the estimated number of clusters that it found from the data. DBSCAN is used to estimate the number of steady states from the simulated gene expression data in an ExpressionData file. diff --git a/boolode.py b/boolode.py index 7e41cd4..bb305d5 100644 --- a/boolode.py +++ b/boolode.py @@ -1,4 +1,3 @@ -import yaml import argparse import BoolODE as bo diff --git a/config-files/beeling-inputs-boolean.yaml b/config-files/beeline-inputs-boolean.yaml similarity index 100% rename from config-files/beeling-inputs-boolean.yaml rename to config-files/beeline-inputs-boolean.yaml diff --git a/config-files/config.yaml b/config-files/config.yaml index ad85686..6baeca8 100644 --- a/config-files/config.yaml +++ b/config-files/config.yaml @@ -2,67 +2,72 @@ global_settings: model_dir: "data" output_dir: "Synthetic-H" do_simulations: True - do_post_processing: False - modeltype: 'heaviside' -# modeltype: 'hill' + do_post_processing: True + modeltype: 'hill' +# modeltype: 'heaviside' jobs: - - name: "dyn-LL-1" - model_definition: "dyn-linear-long.txt" - model_initial_conditions: "dyn-linear-long_ics.txt" - simulation_time: 9 - num_cells: 300 - do_parallel: True - sample_cells: False - # - name: "dyn-LI-1" - # model_definition: "dyn-linear.txt" - # model_initial_conditions: "dyn-linear_ics.txt" - # simulation_time: 7 - # num_cells: 200 - # do_parallel: True - # sample_cells: False + - name: dyn-LL-1 + model_definition: "dyn-linear-long.txt" + model_initial_conditions: "dyn-linear-long_ics.txt" + simulation_time: 9 + num_cells: 300 + do_parallel: False + sample_cells: False - # - name: "dyn-BF-1" - # model_definition: "dyn-bifurcating.txt" - # model_initial_conditions: "dyn-bifurcating_ics.txt" - # simulation_time: 5 - # num_cells: 500 - # do_parallel: True - # sample_cells: False - # nClusters: 2 +# - name: "dyn-LI-1" +# model_definition: "dyn-linear.txt" +# model_initial_conditions: "dyn-linear_ics.txt" +# simulation_time: 7 +# num_cells: 200 +# do_parallel: False +# sample_cells: False - # - name: "dyn-TF-1" - # model_definition: "dyn-trifurcating.txt" - # model_initial_conditions: "dyn-trifurcating_ics.txt" - #
simulation_time: 8 - # num_cells: 500 - # do_parallel: True - # sample_cells: False - # nClusters: 3 +# - name: "dyn-BF-1" +# model_definition: "dyn-bifurcating.txt" +# model_initial_conditions: "dyn-bifurcating_ics.txt" +# simulation_time: 5 +# num_cells: 500 +# do_parallel: False +# sample_cells: False +# nClusters: 2 + +# - name: "dyn-TF-1" +# model_definition: "dyn-trifurcating.txt" +# model_initial_conditions: "dyn-trifurcating_ics.txt" +# simulation_time: 8 +# num_cells: 500 +# do_parallel: False +# sample_cells: False +# nClusters: 3 # - name: "dyn-CY-1" # model_definition: "dyn-cycle.txt" # model_initial_conditions: "dyn-cycle_ics.txt" # simulation_time: 8 # num_cells: 500 - # do_parallel: True + # do_parallel: False # sample_cells: False # nClusters: 1 post_processing: - Dropouts: - - dropout: False - nCells: 1000 - - # - dropout: False - # nCells: 100 - # drop_cutoff: 0.5 - # drop_prob: 0.5 - - # - dropout: False - # nCells: 100 - # drop_cutoff: 0.7 - # drop_prob: 0.7 +# Dropouts: +# - dropout: False +# nCells: 1000 +# +# - dropout: False +# nCells: 100 +# drop_cutoff: 0.5 +# drop_prob: 0.5 +# +# - dropout: False +# nCells: 100 +# drop_cutoff: 0.7 +# drop_prob: 0.7 + + CompareSteadyStates: + - outputPath: /Users/ameliawhitehead/Desktop/boolbool/BoolODE/config-files + perform_PyBoolNet: True - # Slingshot: - # - perplexity: 400 +# Slingshot: +# - perplexity: 400 diff --git a/config-files/example-config.yaml b/config-files/example-config.yaml index 5fc7da7..b545e4c 100644 --- a/config-files/example-config.yaml +++ b/config-files/example-config.yaml @@ -170,3 +170,14 @@ post_processing: ## docker on your machine. # Slingshot: # - perplexity: 200 + + + # Performs steady state comparison using DBSCAN on BoolODE ExpressionData output + # outputPath is where the user would like output files will generate + # For steady state calculation from PyBoolNet, set perform_PyBoolNet to True, + # the default is false. + # Note: If you set perform_PyBoolNet to true, you need to follow the + # PyBoolNet installation guide provided in README.MD + # CompareSteadyStates: + # - outputPath: /Users/userName/Desktop/BoolODE/data/ssComparisonOutputFolder + # perform_PyBoolNet: True diff --git a/config-files/example_gen_config_input.txt b/config-files/example_gen_config_input.txt new file mode 100644 index 0000000..eece70c --- /dev/null +++ b/config-files/example_gen_config_input.txt @@ -0,0 +1,8 @@ +base_directory_path: path_where_you_want_the_config_files +name: "dyn-LL-1" +model_definition: "dyn-linear-long.txt" +model_initial_conditions: "dyn-linear-long_ics.txt" +simulation_times: 10, 12, 14, 16 +num_cells: 100, 200, 300 +CompareSteadyStates: True +perform_PyBoolNet: True diff --git a/config-files/generate_config_files.py b/config-files/generate_config_files.py new file mode 100644 index 0000000..ae38ad3 --- /dev/null +++ b/config-files/generate_config_files.py @@ -0,0 +1,158 @@ +""" + This script writes config files based on + a given range of cells and range of time values for a given model. + It creates a directory for each cell and time combination. + + Invoke this file as follows: + python generate_config_files.py path_of_input_file.txt + The example_gen_config_input.txt file is the format expected. 
+ + @Author: Neel Patel (neelspatel999@vt.edu) + @Author: Amelia Whitehead (ameliafw@vt.edu) +""" + +import sys +import os + +base_directory_path = "" +model_name = "" +model_definition = "" +model_initial_conditions = "" +cells_list = [] +time_list = [] +CompareSteadyStates = False +perform_PyBoolNet = False + +# NOTE: Right now, we are only considering CompareSteadyStates +# by itself or along with perform_PyBoolNet as +# post_processing requests. We do not look for any other +# post_processing requests at the moment. +do_post_processing = False +def main(): + """ + Main caller function of the generate_config_files.py + """ + + for input_file in sys.argv[1:]: + file = open(input_file, "r") + lines = file.readlines() + + # Get information from input file. # + ########################################### + for line in lines: + line = line.strip() + colon_index = line.find(":") + if "base_directory_path:" in line: + global base_directory_path + base_directory_path += line[colon_index+1:].strip() + elif "name:" in line: + global model_name + model_name += line[colon_index+1:].strip() + elif "model_definition:" in line: + global model_definition + model_definition += line[colon_index+1:].strip() + elif "model_initial_conditions:" in line: + global model_initial_conditions + model_initial_conditions += line[colon_index+1:].strip() + elif "simulation_times:" in line: + times = line[colon_index+1:] + # Split the string up by commas + global time_list + time_list = times.split(", ") + elif "num_cells:" in line: + cells = line[colon_index+1:] + # Split the string up by commas + global cells_list + cells_list = cells.split(", ") + elif "CompareSteadyStates:" in line: + if "True" in line: + global CompareSteadyStates + CompareSteadyStates = True + elif "perform_PyBoolNet:" in line: + if "True" in line: + global perform_PyBoolNet + perform_PyBoolNet = True + ########################################### + generate_requested_config_files() + +def generate_requested_config_files(): + """ + Creates a directory named the model name provided. + Then creates subdirectories for each cell and time combination. + Writes a config file in each directory created. + """ + + # Create the directory where all the files + # will be located + base_path = os.path.join(base_directory_path, model_name.strip('"')) + os.mkdir(base_path) + for cell_number in cells_list: + # Create a directory for that cell number + cell_number = cell_number.strip() + cell_name = "Cells=" + cell_name += cell_number + cell_specific_path = os.path.join(base_path, cell_name) + os.mkdir(cell_specific_path) + for time_amount in time_list: + time_amount = time_amount.strip() + time_point = "Time=" + time_point += time_amount + time_specific_path = os.path.join(cell_specific_path, time_point) + os.mkdir(time_specific_path) + + ## Write all the information in a config file. 
+ config_file_path = os.path.join(time_specific_path, "config.yaml") + config_file = open(config_file_path, "w") + config_file.write("global_settings:\n") + config_file.write(" model_dir: data\n") + info_to_write1 = " output_dir: " + info_to_write1 += time_specific_path + info_to_write1 += "\n" + config_file.write(info_to_write1) + config_file.write(" do_simulations: True\n") + if CompareSteadyStates: + config_file.write(" do_post_processing: True\n") + else: + config_file.write(" do_post_processing: False\n") + config_file.write(" modeltype: 'hill'\n") + config_file.write("jobs:\n") + info_to_write = " - name: " + info_to_write += model_name + info_to_write += "\n" + config_file.write(info_to_write) + info_to_write = " model_definition: " + info_to_write += model_definition + info_to_write += "\n" + config_file.write(info_to_write) + if model_initial_conditions != "" and model_initial_conditions != "None": + info_to_write = " model_initial_conditions: " + info_to_write += model_initial_conditions + info_to_write += "\n" + config_file.write(info_to_write) + info_to_write = " simulation_time: " + info_to_write += time_amount + info_to_write += "\n" + config_file.write(info_to_write) + info_to_write = " num_cells: " + info_to_write += cell_number + info_to_write += "\n" + config_file.write(info_to_write) + config_file.write(" do_parallel: False\n") + config_file.write(" sample_cells: False\n") + config_file.write("post_processing:\n") + if CompareSteadyStates: + config_file.write(" CompareSteadyStates:\n") + info_to_write = " - outputPath: " + info_to_write += time_specific_path + info_to_write += "\n" + config_file.write(info_to_write) + if perform_PyBoolNet: + config_file.write(" perform_PyBoolNet: True\n") + else: + config_file.write(" perform_PyBoolNet: False\n") + config_file.close() + +if __name__ == "__main__": + """ + Calls the main method + """ + main() diff --git a/find_and_run_configs.py b/find_and_run_configs.py new file mode 100644 index 0000000..e25dd33 --- /dev/null +++ b/find_and_run_configs.py @@ -0,0 +1,49 @@ +import argparse +import BoolODE as bo +import os + +def get_input_info(): + """ + Gets the root_directory path from the user. + """ + + parser = argparse.ArgumentParser( + description='Run BoolODE to generate synthetic scRNAseq data for all ' + 'config files located within the given directory and ' + 'its subdirectories.\n ' + 'Specify the main directory where all config files are ' + 'contained. This is to be used in conjunction with ' + 'generate_config_files.py.') + + parser.add_argument('--root_directory', default="None", + help='Provide the root directory path ' + 'created by generate_config_files.py. 
' + 'where its subdirectories contain config files.') + + return parser.parse_args().root_directory + + +def main(): + """ + Main caller function of the find_and_run_configs.py + """ + + root_directory = get_input_info() + if root_directory == "None": + print("Please provide a valid root directory path with --root_directory.") + else: + print(root_directory) + + for root, subdirectories, files in os.walk(root_directory): + for file in files: + if file == "config.yaml": + with open(os.path.join(root, file), 'r') as conf: + boolodejobs = bo.ConfigParser.parse(conf) + boolodejobs.execute_jobs() + print('Done with running all config files.') + +if __name__ == '__main__': + """ + Calls the main function + """ + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8c10eb6..2d7dcb9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ tqdm==4.31.1 seaborn==0.9.0 pandas==0.24.2 scikit-learn==0.21.3 +yellowbrick==1.3.post1 +kneed==0.7.0 \ No newline at end of file diff --git a/scripts/.DS_Store b/scripts/.DS_Store new file mode 100644 index 0000000..7351eae Binary files /dev/null and b/scripts/.DS_Store differ diff --git a/scripts/perform_clustering.py b/scripts/perform_clustering.py new file mode 100644 index 0000000..cfbd024 --- /dev/null +++ b/scripts/perform_clustering.py @@ -0,0 +1,252 @@ +""" + This script performs k-means and DBSCAN clustering + algorithms on a provided ExpressionData.CSV file to estimate + the number of steady states for a particular boolean model. + + @Author: Neel Patel (neelspatel999@vt.edu) + @Author: Madeline Shaklee (mshaklee@umassd.edu) + @Author: Amelia Whitehead (ameliafw@vt.edu) +""" +import numpy as np +import argparse +import pandas + +parser = argparse.ArgumentParser(description='Takes the ExpressionData.csv filepath ' + 'and various options to cluster and visualize the data.') + +parser.add_argument('-f', '--ExpressionDataFilePath', required=True, type=str, + help='Specify the path of the ExpressionData file.') +parser.add_argument('-e', '--Elbow', action='store_true', + help='Input -e if you want to perform a k-means elbow visualization.') +parser.add_argument('-s', '--Silhouette', action='store_true', + help='Input -s if you want to perform a k-means silhouette visualization.') +parser.add_argument('-d', '--DBSCAN', action='store_true', + help='Input -d if you want to perform DBSCAN clustering to obtain a DBSCAN_ClusterIDs.csv file.') +parser.add_argument('-u', '--SilhouetteUpperBound', type=int, + help='Input -u if you want to specify the upper bound (inclusive) for the number of clusters to consider ' + 'in silhouette plots. The default is 11. You must use this option along with -s, ' + 'and you must provide a number greater than 2. NOTE: We suggest starting' + ' with low numbers (less than 12) because the image ' + 'will increase in size as you increase the number of silhouettes to generate') + + +elbow_value = parser.parse_args().Elbow +silhouette_value = parser.parse_args().Silhouette +dbscan_value = parser.parse_args().DBSCAN +silhouette_upperbound = parser.parse_args().SilhouetteUpperBound +expression_data_location = parser.parse_args().ExpressionDataFilePath + +# Defining the output path to be the same path as +# ExpressionData for any output file generated +output_file_path = expression_data_location[0:expression_data_location.rindex('/') + 1] + +"""" + Iteratively performs k-means clustering, testing k values 2 ≤ k ≤ upper_bound, + and generates a silhouette plot of each. 
+ + @Param: upper_bound is the number of maximum number of clusters to + include in the silhouette visualization. This can be specified + by the user by -u or --SilhouetteUpperBound. The default + is 11. upper_bound must be k ≥ 3 + + @Precondition: The number of cells in the ExpressionData file + must be greater than upper_bound + + @Note: The package only allows for k ≥ 2 analyses + + @Citation: https://www.scikit-yb.org/en/latest/api/cluster/silhouette.html +""" +def k_means_silhouette(upper_bound): + from sklearn.cluster import KMeans + from yellowbrick.cluster import SilhouetteVisualizer + import matplotlib.pyplot as plt + + # ExpressionData is a matrix, where rows are genes and columns are cells + expression_df = pandas.read_csv(expression_data_location, sep=',', index_col=0) + + # Performs transpose for later clustering on the samples. + # Later, we cluster the cell columns of ExpressionData + expression_data_transpose = np.array(expression_df.T.to_numpy().tolist()) + + fig, axs = plt.subplots(upper_bound - 1, figsize=(15, upper_bound*5)) + plot_count = 0 + for cluster_number in range(2, upper_bound + 1): + k_means = KMeans(n_clusters=cluster_number) + + visualizer = SilhouetteVisualizer(k_means, axs[plot_count]) + visualizer.fit(expression_data_transpose) + axs[plot_count].set_title("Silhouette of " + str(cluster_number) + " Clusters") + axs[plot_count].set_xlabel('silhouette coefficient values') + axs[plot_count].set_ylabel('cluster label') + plot_count = plot_count + 1 + + visualizer.show(outpath=output_file_path + "silhouette_visualization.png") + print("Silhouette analyses generated a silhouette_visualization.png file.") + +"""" + Iteratively performs k-means clustering, testing k values 2 ≤ k ≤ 11, + and generates an elbow plot. + + @Precondition: Number of cells in the ExpressionData file + must be greater than 11 + + @Note: The package only allows for k ≥ 2 analyses + + @Citation: https://www.scikit-yb.org/en/latest/api/cluster/elbow.html +""" +def k_means_elbow(): + from sklearn.cluster import KMeans + from yellowbrick.cluster import KElbowVisualizer + + # ExpressionData is a matrix, where rows are genes and columns are cells + expression_df = pandas.read_csv(expression_data_location, sep=',', index_col=0) + + # Performs transpose for later clustering on the samples. + # We cluster the cell columns of ExpressionData + expression_data_transpose = np.array(expression_df.T.to_numpy().tolist()) + + model = KMeans() + + # KElbowVisualizer has 3 different metrics + # (distortion, silhouette, and calinski_harabasz) + # The calinski_harabasz score computes the ratio of + # dispersion between and within clusters. Distortion could also + # be used for this method. + visualizer = KElbowVisualizer(model, k=(2, 12), metric='calinski_harabasz', timings=False) + + visualizer.fit(expression_data_transpose) + visualizer.show(outpath=output_file_path + "elbow_visualization.png") + print("Elbow analysis generated an elbow_visualization.png file.") + +""" + Performs DBSCAN on the ExpressionData file and generates a DBSCAN_ClusterIDs.csv file, which specifies + which cell belongs to what cluster, according to DBSCAN. All noise points are grouped together into + a separate cluster for visualization purposes. 
+ + @Precondition: The number of samples (cells) needs to be greater than two + times the number of genes + + @Note: DBSCAN does not work well with datasets of varying density + + @Citations: https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py + https://medium.com/@tarammullin/dbscan-parameter-estimation-ff8330e3a3bd +""" +def dbscan_clustering(): + from sklearn.cluster import DBSCAN + from sklearn.neighbors import NearestNeighbors + from kneed import KneeLocator + + # ExpressionData is a matrix, where rows are genes and columns are cells + expression_df = pandas.read_csv(expression_data_location, sep=',', index_col=0) + + # Performs transpose for later clustering on the samples. + # We cluster the cell columns of ExpressionData + expression_data_transpose = np.array(expression_df.T.to_numpy().tolist()) + + + # DBSCAN takes two parameters, MinSamples and Epsilon + # MinSamples is described now, and Epsilon will be described later + + # The MinSamples parameter is the fewest number of samples (cells) + # DBSCAN will put in a cluster. We assign MinSamples + # to two times the number of genes in ExpressionData + min_samples_num = len(expression_df.index)*2 + + + ############# Below is from Medium.com reference ############# + + # Compute nearest neighbors and distances + neighbors = NearestNeighbors(n_neighbors=min_samples_num) + + neighbors_fit = neighbors.fit(expression_data_transpose) + distances, indices = neighbors_fit.kneighbors(expression_data_transpose) + + # Sort distances in ascending order + distances = np.sort(distances, axis=0) + + ############# End of Medium.com reference ############# + + + # Find the average k-distances then plot to find the knee, + # which is the point of maximum curvature + y_values = _compute_average_distances(distances) + x_values = list(range(1,len(y_values) + 1)) + knee_locator = KneeLocator(x_values, y_values, S=1.0, curve='convex', direction='increasing') + maximum_curvature_position = round(knee_locator.knee, 20) + + # The Epsilon parameter of DBSCAN represents the upper bound (exclusive) + # distance between two points to be clustered together. + epsilon = y_values[maximum_curvature_position - 1] + + + ############# Below is from the scikit-learn reference ############# + + # Performs DBSCAN with the calculated parameters + db = DBSCAN(eps=epsilon, min_samples=min_samples_num).fit(expression_data_transpose) + + clusters_identifiers = db.fit_predict(expression_data_transpose) + core_samples_mask = np.zeros_like(db.labels_, dtype=bool) + core_samples_mask[db.core_sample_indices_] = True + labels = db.labels_ + + # Number of clusters in labels, ignoring noise if present. 
+ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) + n_noise_ = list(labels).count(-1) + + print("DBSCAN result:") + print('Estimated number of clusters: %d' % n_clusters_) + print('Estimated number of noise points: %d \n' % n_noise_) + + ############# End of scikit-learn reference ############# + + # Adding the cluster labels to the dataframe + expression_df.loc['cluster_labels', :] = clusters_identifiers + + # Write a new ClusterID file according to DBSCAN's clustering + cluster_values = expression_df.loc['cluster_labels'].tolist() + cluster_values = [int(number) for number in cluster_values] + cell_names = expression_df.columns.tolist() + dictionary = {'':cell_names, 'cl':cluster_values} + cluster_df = pandas.DataFrame(dictionary) + cluster_df.to_csv(output_file_path + 'DBSCAN_ClusterIDs.csv', index=False) + + print("DBSCAN analysis generated a DBSCAN_ClusterIDs.csv file.") + +""" + Takes a list of distance lists computed via k-nearest neighbors, and + outputs a list where each element is the average distance of the distance + list occupied in the respective ordinal location of the input list + + @Param: A list of lists, where each element contains a + list of distances computed via k-nearest neighbor. + + @Precondition: The distance_array has length greater than 0 + + @Returns: A list of distances, where each element is the + average distance of the distance list occupied + in the respective ordinal location of the input list +""" +def _compute_average_distances(distance_array): + denominator_of_average = len(distance_array[0]) + # Will be of length number of samples (cells) in the + # ExpressionData, once full + average_distances = [] + for distances in distance_array: + distance_sum = 0 + for distance_value in distances: + distance_sum = distance_sum + distance_value + average_distance = distance_sum / denominator_of_average + average_distances.append(average_distance) + return average_distances + +if __name__ == '__main__': + if elbow_value: + k_means_elbow() + if silhouette_value: + if silhouette_upperbound: + k_means_silhouette(silhouette_upperbound) + else: + # The default value is 11 + k_means_silhouette(11) + if dbscan_value: + dbscan_clustering() diff --git a/tutorial_perform_clustering.pdf b/tutorial_perform_clustering.pdf new file mode 100644 index 0000000..df6b16a Binary files /dev/null and b/tutorial_perform_clustering.pdf differ
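For reference, the DBSCAN steady-state estimation used by both compare_steady_states.run_dbscan_clustering and perform_clustering.dbscan_clustering (MinSamples set to twice the number of genes, epsilon taken from the knee of the averaged k-nearest-neighbor distance curve) condenses to the sketch below. It is illustrative only: the function name is made up, it assumes the ExpressionData meets the stated precondition (more than twice as many cells as genes), and it assumes KneeLocator finds a knee.

```python
# Condensed sketch of the DBSCAN-based steady-state estimation described above.
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from kneed import KneeLocator

def estimate_steady_states(expression_csv):
    expr = pd.read_csv(expression_csv, index_col=0)      # rows are genes, columns are cells
    cells = expr.T.to_numpy()                             # cluster the cells

    min_samples = 2 * expr.shape[0]                       # MinSamples = 2 * number of genes
    distances, _ = NearestNeighbors(n_neighbors=min_samples).fit(cells).kneighbors(cells)
    avg_dist = np.sort(distances, axis=0).mean(axis=1)    # average k-distance curve
    knee = KneeLocator(list(range(1, len(avg_dist) + 1)), avg_dist,
                       S=1.0, curve='convex', direction='increasing').knee
    eps = avg_dist[knee - 1]                              # epsilon at the point of maximum curvature

    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(cells).labels_
    return len(set(labels)) - (1 if -1 in labels else 0)  # noise label (-1) is not a steady state
```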