10 changes: 7 additions & 3 deletions SGRN/CNNCRunner.py
@@ -1,5 +1,6 @@
import os
import pandas as pd
import pdb
from pathlib import Path
import numpy as np
import networkx as nx
@@ -27,7 +28,7 @@
import torch.utils.data as utils
from SGRN.GAEHelpers import *
import SGRN.GAERunner as GR

from SGRN.makeInputs import *



@@ -107,8 +108,10 @@ def run(RunnerObj, fID):

#exprDF = pd.read_csv(RunnerObj.inputDir.joinpath("normExp.csv"), header = 0, index_col =0)
# NOTE: CNNC requires non-standardized expression data as input
exprDF = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
header = 0, index_col = 0)
#exprDF = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
# header = 0, index_col = 0)
exprDF, expr_genes = preprocess_expr(RunnerObj)
exprDF.index = exprDF.index.str.upper()
posE = np.load(RunnerObj.inputDir.joinpath("posE.npy"))
negE = np.load(RunnerObj.inputDir.joinpath("negE.npy"))

@@ -187,6 +190,7 @@ def run(RunnerObj, fID):
# Compute features for negative examples because
# storing the pre-computed negatives is prohibitive

# KeyError: TBX4 (nEdge[1]) was not in exprDF's index
trNeg = np.zeros((len(train_negIdx), 32, 32))
for idx in tqdm(range(len(train_negIdx))):
edgeId = train_negIdx[idx]
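Taken together, these hunks change CNNCRunner.run to source expression data from the new scanpy pipeline instead of a raw CSV read. A minimal sketch of the resulting flow (an illustrative helper, not part of the PR), assuming preprocess_expr returns a genes-by-cells DataFrame plus the retained gene table:

import numpy as np
from SGRN.makeInputs import preprocess_expr

def load_cnnc_inputs(RunnerObj):
    # Illustrative only: consolidates the loading steps shown in the hunks above.
    exprDF, expr_genes = preprocess_expr(RunnerObj)  # scanpy-preprocessed expression
    exprDF.index = exprDF.index.str.upper()          # upper-case gene names for matching
    posE = np.load(RunnerObj.inputDir.joinpath("posE.npy"))  # positive edge ids
    negE = np.load(RunnerObj.inputDir.joinpath("negE.npy"))  # negative edge ids
    return exprDF, posE, negE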
1 change: 0 additions & 1 deletion SGRN/GAERunner.py
@@ -242,7 +242,6 @@ def val(pos_edge_index, neg_edge_index):
outDF.to_csv(output_path, index=False, mode='a', header=not os.path.exists(output_path))



def parseOutput(RunnerObj):
if RunnerObj.name == 'GAE':
algName = RunnerObj.params['encoder'] + '-' +RunnerObj.params['decoder']
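The to_csv call retained above uses a handy append idiom: write the header only when the file does not exist yet, so repeated runs accumulate rows in one CSV without duplicating the header. A standalone sketch (the file name and columns here are illustrative):

import os
import pandas as pd

def append_results(outDF: pd.DataFrame, output_path: str) -> None:
    # The header is written only on the first call for this path.
    outDF.to_csv(output_path, index=False, mode='a',
                 header=not os.path.exists(output_path))

append_results(pd.DataFrame({'run': [1], 'auprc': [0.42]}), 'outputs/GAE.csv')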
6 changes: 6 additions & 0 deletions SGRN/__init__.py
@@ -84,7 +84,13 @@ def __create_runners(self) -> Dict[int, List[Runner]]:
data['params'] = runner[1]
data['inputDir'] = Path.cwd().joinpath(self.input_settings.datadir.joinpath(dataset['name']))
data['exprData'] = dataset['exprData']
data['delim'] = dataset['delim']
data['trueEdges'] = dataset['trueEdges']
data['normalization'] = dataset['normalization']
data['min_genes'] = dataset['min_genes']
data['min_cells'] = dataset['min_cells']
data['top_expr_genes'] = dataset['top_expr_genes']
data['gtf_file'] = dataset['gtf_file']
data['kTrain'] = self.input_settings.kTrain
data['kTest'] = self.input_settings.kTest
data['kFold'] = self.input_settings.kFold
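One caveat: the new dataset keys are read with direct indexing, so existing config files that lack them will raise KeyError. A tolerant variant could fall back to defaults via dict.get; the default values below are illustrative assumptions, not taken from the PR:

# Sketch: defensive reads of the new dataset keys (defaults are illustrative).
data['delim'] = dataset.get('delim', ',')
data['normalization'] = dataset.get('normalization', '')
data['min_genes'] = dataset.get('min_genes', '200')
data['min_cells'] = dataset.get('min_cells', '3')
data['top_expr_genes'] = dataset.get('top_expr_genes', '5000')
data['gtf_file'] = dataset.get('gtf_file', None)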
112 changes: 109 additions & 3 deletions SGRN/makeInputs.py
@@ -1,8 +1,9 @@

import os
import pandas as pd
from pathlib import Path
import numpy as np
import pdb
import scanpy as sc
import networkx as nx
import random
import math
@@ -12,6 +13,106 @@
from tqdm import tqdm
from sklearn.model_selection import KFold

def exprdata_stats(adata):
    """ Print statistics of the single-cell RNA-seq dataset,
    including the number of cells and the number of genes.
    """
    print(f'Total number of cells: {adata.n_obs}')
    print(f'Total number of genes: {adata.n_vars}')

def read_in_chunks(fileobj, chunksize=1000000):
    """
    Read a file lazily in chunks of 1,000,000 bytes (1 MB).
    """
    while True:
        data = fileobj.read(chunksize)
        if not data:
            break
        yield data
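A minimal usage sketch for read_in_chunks (the file name is hypothetical), showing how a caller re-joins lines that straddle chunk boundaries, the same bookkeeping convert_transcripts_to_genes performs below:

# Sketch: iterate complete lines of a large file via read_in_chunks.
leftover = ''
with open('annotations.gtf') as f:  # hypothetical file
    for piece in read_in_chunks(f):
        piece = leftover + piece
        lines = piece.split('\n')
        leftover = lines.pop()  # the last element may be a partial line
        for line in lines:
            pass  # process each complete line here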

def convert_transcripts_to_genes(RunnerObj):
    """ Convert Ensembl transcript IDs to gene names using the GTF annotation.
    """
    info_df = []
    prev_line = ''
    prev_line2 = ''
    with open(os.path.join(RunnerObj.inputDir.joinpath(RunnerObj.gtf_file))) as f:
        for piece in read_in_chunks(f):
            piece = prev_line + piece
            for line in piece.split('\n'):
                if line == piece[piece.rfind('\n')+1:]:
                    prev_line = line  # carry the partial last line into the next chunk
                    break
                if 'gene_name' in line and 'transcript_id' in line:
                    info = line[line.rfind('\t')+1:]
                    tmp = {}
                    for item in info.split('\"; '):
                        res = item.split(' \"')
                        if len(res) == 2:
                            tmp[res[0]] = res[1]
                    info_df.append([tmp['gene_name'], tmp['transcript_id']])
    tran_gene = pd.DataFrame(info_df)
    tran_gene.drop_duplicates(keep='first', inplace=True)
    tran_gene.columns = ['gene_name', 'transcript']
    tran_gene.set_index('transcript', inplace=True)
    return tran_gene
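To make the attribute parsing above concrete, a worked example on a single GTF attribute field; the IDs are illustrative:

# Sketch: parse one GTF attribute string the way the loop above does.
info = ('gene_id "ENSMUSG00000000001"; transcript_id "ENSMUST00000000001"; '
        'gene_name "Gnai3"; gene_biotype "protein_coding";')
tmp = {}
for item in info.split('"; '):
    res = item.split(' "')
    if len(res) == 2:
        tmp[res[0]] = res[1]
print(tmp['gene_name'], tmp['transcript_id'])  # -> Gnai3 ENSMUST00000000001
# Note: the final attribute on a line keeps a trailing '";' under this scheme
# (here gene_biotype), so the fields of interest should not come last.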

def transcription_factor_percentage(RunnerObj, expdf):
    """ Report how many transcription factors appear in the
    single-cell RNA-seq dataset.
    """
    tf_path = '/home/kradja/supervised-grns/inputs/TFs'
    tf_file = 'mouse-tfs.csv'
    tf = pd.read_csv(os.path.join(tf_path, tf_file), sep=',')

    tf_expdf = expdf[expdf.index.isin(tf.TF)]
    print(f'Out of {len(expdf)} genes there are {len(tf_expdf)} transcription factors')
    pdb.set_trace()
    return tf_expdf


def preprocess_expr(RunnerObj):
    """
    Preprocess single-cell RNA-seq data using the settings on the RunnerObj.
    """
    ExpDF = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                        header = 0, index_col = 0, sep = RunnerObj.delim)
    adata = sc.read_csv(os.path.join(RunnerObj.inputDir.joinpath(RunnerObj.exprData))
                        , delimiter = RunnerObj.delim)
    adata = adata.transpose() # AnnData expects cells as rows and genes as columns
    exprdata_stats(adata)

    # Scanpy preprocessing: normalization, then filtering of cells and genes
    if RunnerObj.normalization == '':
        sc.pp.normalize_total(adata)
    sc.pp.filter_cells(adata, min_genes = int(RunnerObj.min_genes))
    sc.pp.filter_genes(adata, min_cells = int(RunnerObj.min_cells))
    exprdata_stats(adata)
    flavor = 'seurat'
    if flavor == 'seurat' or flavor == 'cell_ranger':
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata, flavor = flavor)
    if flavor == 'seurat_v3':
        sc.pp.highly_variable_genes(adata, flavor = flavor)

    # Take the top `top_expr_genes` highly variable genes
    high_var = adata.var[adata.var.highly_variable == True].sort_values('dispersions_norm', ascending=False)
    print(f'There are {len(high_var)} highly variable genes out of {len(adata.var)} genes')
    high_var = high_var[:int(RunnerObj.top_expr_genes)]
    adata_subset = adata[:, high_var.index]

    # Replace the transcript IDs in the index with gene names
    tran_gene = convert_transcripts_to_genes(RunnerObj)
    tt = adata_subset.var.index.to_series()
    ntt = tt.apply(lambda x: x[:x.rfind('.')]) # strip the transcript version suffix
    rr = ntt.map(tran_gene.to_dict()['gene_name']).fillna(ntt)
    adata_subset.var.index = rr

    # The gene Irf3 shows up multiple times in the DataFrame because multiple
    # transcripts map to it. How do we take duplicates into account?
    expdf = pd.DataFrame(adata_subset.X, index = adata_subset.obs.index, columns = adata_subset.var.index)
    high_var_expdf = expdf.iloc[:, expdf.columns.isin(high_var.index)]

    return expdf.T, adata_subset.var

def generateInputs(RunnerObj):
'''
If the folder/files under RunnerObj.datadir exist,
@@ -59,8 +160,13 @@ def generateInputs(RunnerObj):
onlyGeness = geneTFDict.item().get('Gene')
else:
print("Files not present, creating...")
ExpDF = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
header = 0, index_col = 0)
ExpDF, expr_genes = preprocess_expr(RunnerObj)
ExpDF.index = ExpDF.index.str.upper()

# Percentage of TFs
tf_expdf = transcription_factor_percentage(RunnerObj, ExpDF)

# The raw read_csv above was replaced by a method that preprocesses exprData with scanpy
GeneralChIP = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.trueEdges))
# convert strings to upper case, just in case.
GeneralChIP.Gene1 = GeneralChIP.Gene1.str.upper()
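The Irf3 comment in preprocess_expr flags an open question: several transcripts can map to one gene name, leaving duplicate columns in the expression matrix. One possible resolution, not part of this PR, is to aggregate duplicate gene columns; summing is shown here, and mean or max are alternatives:

import pandas as pd

# Sketch: collapse duplicate gene columns by summing their expression.
def collapse_duplicate_genes(expdf: pd.DataFrame) -> pd.DataFrame:
    # expdf: cells x genes, possibly with repeated gene-name columns.
    # Transpose so genes are the index, sum rows sharing a name, transpose back.
    return expdf.T.groupby(level=0).sum().T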
8 changes: 7 additions & 1 deletion SGRN/runner.py
@@ -48,6 +48,12 @@ def __init__(self,
self.inputDir = params['inputDir']
self.params = params['params']
self.exprData = params['exprData']
self.delim = params['delim']
self.normalization = params['normalization']
self.min_genes = params['min_genes']
self.min_cells = params['min_cells']
self.top_expr_genes = params['top_expr_genes']
self.gtf_file= params['gtf_file']
self.trueEdges = params['trueEdges']
self.kTrain = params['kTrain']
self.kTest = params['kTest']
@@ -66,4 +72,4 @@ def run(self):


def parseOutput(self):
OutputParser[self.name](self)
OutputParser[self.name](self)
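For reference, parseOutput dispatches through the OutputParser lookup table keyed by algorithm name. A minimal sketch of the pattern; the table contents below are assumed for illustration, the real registry lives elsewhere in SGRN:

# Sketch of the dispatch pattern behind OutputParser[self.name](self).
def parse_gae(runner):
    print(f'parsing GAE output for {runner}')  # illustrative stand-in

OutputParser = {'GAE': parse_gae}

class DemoRunner:
    name = 'GAE'
    def parseOutput(self):
        OutputParser[self.name](self)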
59 changes: 35 additions & 24 deletions config/config.yaml
@@ -16,10 +16,21 @@ input_settings:
# trueEdges: Name of the reference network file in the
# edge list format. Required.
datasets:
- name: "mESC"
exprData: "mESC_BEELINE.csv"
- name: "mESC_node_5000"
exprData: "GSE98664_tpm_sailfish_GENCODEvM9_RamDA_cells.txt"
delim: "\t"
trueEdges: "mouse-net.csv"

# Preprocessing
normalization: "TPM" #TPM,CPM, None
# Minimum number of genes a cell has to have; removing cells
min_genes: "200"
# Minimum number of cells a gene must be expressed in; removing genes
min_cells: "3"
top_expr_genes: "5000"
# Transcription Factors and Gene Name
gtf_file: "Mus_musculus.GRCm39.104.gtf"


# Ratio of Negatives:Positives in Training.
kTrain: 1
# Ratio of Negatives:Positives in Testing.
@@ -30,13 +41,13 @@
# Number of times to run the k-fold CV;
# a random seed is generated internally for each run and
# stored as part of the input and output names
nTimes: 1
nTimes: 6 # 10
# If randSeed is specified, it ignores nTimes!!!
randSeed: [774]
#randSeed: [774]  # commented out so that nTimes takes effect
# kFold CV type, 'Edge' or 'Node'
# Edge: Edge CV
# Node: TF holdout CV
CVType: 'Edge'
CVType: 'Node'

# Denotes a list of algorithms to run. Each has the following parameters:
# name: Name of the algorithm. Must be recognized by the pipeline, see
@@ -60,28 +71,28 @@
reconnect_disconnected_nodes: [False]

# CNNC
- name: "CNNC"
params:
# Set log to False for most datasets!!!
# It should be True for datasets from the CNNC paper
# i.e., if expression isn't log transformed already
log: [False]
epochs: [1000]
should_run: [True]
#- name: "CNNC"
# params:
# # Set log to False for most datasets!!!
# # It should be True for datasets from the CNNC paper
# # i.e., if expression isn't log transformed already
# log: [False]
# epochs: [1000]
# should_run: [True]

# MLP
- name: "MLP"
params:
epochs: [1000]
should_run: [True]
# SVM
- name: "SVM"
params:
maxIter: [2500]
should_run: [True]
#- name: "MLP"
# params:
# epochs: [1000]
# should_run: [True]
# # SVM
#- name: "SVM"
# params:
# maxIter: [2500]
# should_run: [True]

# Output Settings: initialize base output folder names
output_settings:
# Base output directory
output_dir: "outputs"
output_prefix: "mESC"
output_prefix: "mESC_node_5000"