10 changes: 7 additions & 3 deletions SGRN/CNNCRunner.py
@@ -1,5 +1,6 @@
import os
import pandas as pd
import pdb
from pathlib import Path
import numpy as np
import networkx as nx
@@ -27,7 +28,7 @@
import torch.utils.data as utils
from SGRN.GAEHelpers import *
import SGRN.GAERunner as GR

from SGRN.makeInputs import *



@@ -107,8 +108,10 @@ def run(RunnerObj, fID):

#exprDF = pd.read_csv(RunnerObj.inputDir.joinpath("normExp.csv"), header = 0, index_col =0)
# NOTE: CNNC requires non-standardized expression data as input
exprDF = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
header = 0, index_col = 0)
#exprDF = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
# header = 0, index_col = 0)
exprDF, expr_genes = preprocess_expr(RunnerObj)
exprDF.index = exprDF.index.str.upper()
posE = np.load(RunnerObj.inputDir.joinpath("posE.npy"))
negE = np.load(RunnerObj.inputDir.joinpath("negE.npy"))

@@ -187,6 +190,7 @@ def run(RunnerObj, fID):
# Compute features for negative examples because
# storing the pre-computed negatives is prohibitive

# KeyError: TBX4 (nEdge[1]) was not in exprDF's index
trNeg = np.zeros((len(train_negIdx), 32, 32))
for idx in tqdm(range(len(train_negIdx))):
edgeId = train_negIdx[idx]
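Taken together, these hunks change CNNCRunner.run to source expression data from the new scanpy pipeline instead of a raw CSV read. A minimal sketch of the resulting flow (an illustrative helper, not part of the PR), assuming preprocess_expr returns a genes-by-cells DataFrame plus the retained gene table:

import numpy as np
from SGRN.makeInputs import preprocess_expr

def load_cnnc_inputs(RunnerObj):
    # Illustrative only: consolidates the loading steps shown in the hunks above.
    exprDF, expr_genes = preprocess_expr(RunnerObj)  # scanpy-preprocessed expression
    exprDF.index = exprDF.index.str.upper()          # upper-case gene names for matching
    posE = np.load(RunnerObj.inputDir.joinpath("posE.npy"))  # positive edge ids
    negE = np.load(RunnerObj.inputDir.joinpath("negE.npy"))  # negative edge ids
    return exprDF, posE, negE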
1 change: 0 additions & 1 deletion SGRN/GAERunner.py
@@ -242,7 +242,6 @@ def val(pos_edge_index, neg_edge_index):
outDF.to_csv(output_path, index=False, mode='a', header=not os.path.exists(output_path))



def parseOutput(RunnerObj):
if RunnerObj.name == 'GAE':
algName = RunnerObj.params['encoder'] + '-' +RunnerObj.params['decoder']
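The to_csv call retained above uses a handy append idiom: write the header only when the file does not exist yet, so repeated runs accumulate rows in one CSV without duplicating the header. A standalone sketch (the file name and columns here are illustrative):

import os
import pandas as pd

def append_results(outDF: pd.DataFrame, output_path: str) -> None:
    # The header is written only on the first call for this path.
    outDF.to_csv(output_path, index=False, mode='a',
                 header=not os.path.exists(output_path))

append_results(pd.DataFrame({'run': [1], 'auprc': [0.42]}), 'outputs/GAE.csv')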
6 changes: 6 additions & 0 deletions SGRN/__init__.py
@@ -84,7 +84,13 @@ def __create_runners(self) -> Dict[int, List[Runner]]:
data['params'] = runner[1]
data['inputDir'] = Path.cwd().joinpath(self.input_settings.datadir.joinpath(dataset['name']))
data['exprData'] = dataset['exprData']
data['delim'] = dataset['delim']
data['trueEdges'] = dataset['trueEdges']
data['normalization'] = dataset['normalization']
data['min_genes'] = dataset['min_genes']
data['min_cells'] = dataset['min_cells']
data['top_expr_genes'] = dataset['top_expr_genes']
data['gtf_file'] = dataset['gtf_file']
data['kTrain'] = self.input_settings.kTrain
data['kTest'] = self.input_settings.kTest
data['kFold'] = self.input_settings.kFold
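One caveat: the new dataset keys are read with direct indexing, so existing config files that lack them will raise KeyError. A tolerant variant could fall back to defaults via dict.get; the default values below are illustrative assumptions, not taken from the PR:

# Sketch: defensive reads of the new dataset keys (defaults are illustrative).
data['delim'] = dataset.get('delim', ',')
data['normalization'] = dataset.get('normalization', '')
data['min_genes'] = dataset.get('min_genes', '200')
data['min_cells'] = dataset.get('min_cells', '3')
data['top_expr_genes'] = dataset.get('top_expr_genes', '5000')
data['gtf_file'] = dataset.get('gtf_file', None)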
112 changes: 109 additions & 3 deletions SGRN/makeInputs.py
@@ -1,8 +1,9 @@

import os
import pandas as pd
from pathlib import Path
import numpy as np
import pdb
import scanpy as sc
import networkx as nx
import random
import math
@@ -12,6 +13,106 @@
from tqdm import tqdm
from sklearn.model_selection import KFold

def exprdata_stats(adata):
    """ Print statistics of the single-cell RNA-seq dataset,
    including the number of cells and the number of genes.
    """
    print(f'Total number of cells: {adata.n_obs}')
    print(f'Total number of genes: {adata.n_vars}')

def read_in_chunks(fileobj, chunksize=1000000):
    """
    Read a file lazily in chunks of 1,000,000 bytes (1 MB).
    """
    while True:
        data = fileobj.read(chunksize)
        if not data:
            break
        yield data
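A minimal usage sketch for read_in_chunks (the file name is hypothetical), showing how a caller re-joins lines that straddle chunk boundaries, the same bookkeeping convert_transcripts_to_genes performs below:

# Sketch: iterate complete lines of a large file via read_in_chunks.
leftover = ''
with open('annotations.gtf') as f:  # hypothetical file
    for piece in read_in_chunks(f):
        piece = leftover + piece
        lines = piece.split('\n')
        leftover = lines.pop()  # the last element may be a partial line
        for line in lines:
            pass  # process each complete line here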

def convert_transcripts_to_genes(RunnerObj):
    """ Convert Ensembl transcript IDs to gene names using the GTF annotation.
    """
    info_df = []
    prev_line = ''
    prev_line2 = ''
    with open(os.path.join(RunnerObj.inputDir.joinpath(RunnerObj.gtf_file))) as f:
        for piece in read_in_chunks(f):
            piece = prev_line + piece
            for line in piece.split('\n'):
                if line == piece[piece.rfind('\n')+1:]:
                    prev_line = line  # carry the partial last line into the next chunk
                    break
                if 'gene_name' in line and 'transcript_id' in line:
                    info = line[line.rfind('\t')+1:]
                    tmp = {}
                    for item in info.split('\"; '):
                        res = item.split(' \"')
                        if len(res) == 2:
                            tmp[res[0]] = res[1]
                    info_df.append([tmp['gene_name'], tmp['transcript_id']])
    tran_gene = pd.DataFrame(info_df)
    tran_gene.drop_duplicates(keep='first', inplace=True)
    tran_gene.columns = ['gene_name', 'transcript']
    tran_gene.set_index('transcript', inplace=True)
    return tran_gene
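To make the attribute parsing above concrete, a worked example on a single GTF attribute field; the IDs are illustrative:

# Sketch: parse one GTF attribute string the way the loop above does.
info = ('gene_id "ENSMUSG00000000001"; transcript_id "ENSMUST00000000001"; '
        'gene_name "Gnai3"; gene_biotype "protein_coding";')
tmp = {}
for item in info.split('"; '):
    res = item.split(' "')
    if len(res) == 2:
        tmp[res[0]] = res[1]
print(tmp['gene_name'], tmp['transcript_id'])  # -> Gnai3 ENSMUST00000000001
# Note: the final attribute on a line keeps a trailing '";' under this scheme
# (here gene_biotype), so the fields of interest should not come last.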

def transcription_factor_percentage(RunnerObj, expdf):
    """ Report how many transcription factors appear in the
    single-cell RNA-seq dataset.
    """
    tf_path = '/home/kradja/supervised-grns/inputs/TFs'
    tf_file = 'mouse-tfs.csv'
    tf = pd.read_csv(os.path.join(tf_path, tf_file), sep=',')

    tf_expdf = expdf[expdf.index.isin(tf.TF)]
    print(f'Out of {len(expdf)} genes there are {len(tf_expdf)} transcription factors')
    pdb.set_trace()
    return tf_expdf


def preprocess_expr(RunnerObj):
    """
    Preprocess single-cell RNA-seq data using the settings on the RunnerObj.
    """
    ExpDF = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                        header = 0, index_col = 0, sep = RunnerObj.delim)
    adata = sc.read_csv(os.path.join(RunnerObj.inputDir.joinpath(RunnerObj.exprData))
                        , delimiter = RunnerObj.delim)
    adata = adata.transpose() # AnnData expects cells as rows and genes as columns
    exprdata_stats(adata)

    # Scanpy preprocessing: normalization, then filtering of cells and genes
    if RunnerObj.normalization == '':
        sc.pp.normalize_total(adata)
    sc.pp.filter_cells(adata, min_genes = int(RunnerObj.min_genes))
    sc.pp.filter_genes(adata, min_cells = int(RunnerObj.min_cells))
    exprdata_stats(adata)
    flavor = 'seurat'
    if flavor == 'seurat' or flavor == 'cell_ranger':
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata, flavor = flavor)
    if flavor == 'seurat_v3':
        sc.pp.highly_variable_genes(adata, flavor = flavor)

    # Take the top `top_expr_genes` highly variable genes
    high_var = adata.var[adata.var.highly_variable == True].sort_values('dispersions_norm', ascending=False)
    print(f'There are {len(high_var)} highly variable genes out of {len(adata.var)} genes')
    high_var = high_var[:int(RunnerObj.top_expr_genes)]
    adata_subset = adata[:, high_var.index]

    # Replace the transcript IDs in the index with gene names
    tran_gene = convert_transcripts_to_genes(RunnerObj)
    tt = adata_subset.var.index.to_series()
    ntt = tt.apply(lambda x: x[:x.rfind('.')]) # strip the transcript version suffix
    rr = ntt.map(tran_gene.to_dict()['gene_name']).fillna(ntt)
    adata_subset.var.index = rr

    # The gene Irf3 shows up multiple times in the DataFrame because multiple
    # transcripts map to it. How do we take duplicates into account?
    expdf = pd.DataFrame(adata_subset.X, index = adata_subset.obs.index, columns = adata_subset.var.index)
    high_var_expdf = expdf.iloc[:, expdf.columns.isin(high_var.index)]

    return expdf.T, adata_subset.var

def generateInputs(RunnerObj):
'''
If the folder/files under RunnerObj.datadir exist,
@@ -59,8 +160,13 @@ def generateInputs(RunnerObj):
onlyGeness = geneTFDict.item().get('Gene')
else:
print("Files not present, creating...")
ExpDF = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
header = 0, index_col = 0)
ExpDF, expr_genes = preprocess_expr(RunnerObj)
ExpDF.index = ExpDF.index.str.upper()

# Percentage of TFs
tf_expdf = transcription_factor_percentage(RunnerObj, ExpDF)

# The raw read_csv above was replaced by a method that preprocesses exprData with scanpy
GeneralChIP = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.trueEdges))
# convert strings to upper case, just in case.
GeneralChIP.Gene1 = GeneralChIP.Gene1.str.upper()
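The Irf3 comment in preprocess_expr flags an open question: several transcripts can map to one gene name, leaving duplicate columns in the expression matrix. One possible resolution, not part of this PR, is to aggregate duplicate gene columns; summing is shown here, and mean or max are alternatives:

import pandas as pd

# Sketch: collapse duplicate gene columns by summing their expression.
def collapse_duplicate_genes(expdf: pd.DataFrame) -> pd.DataFrame:
    # expdf: cells x genes, possibly with repeated gene-name columns.
    # Transpose so genes are the index, sum rows sharing a name, transpose back.
    return expdf.T.groupby(level=0).sum().T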
8 changes: 7 additions & 1 deletion SGRN/runner.py
@@ -48,6 +48,12 @@ def __init__(self,
self.inputDir = params['inputDir']
self.params = params['params']
self.exprData = params['exprData']
self.delim = params['delim']
self.normalization = params['normalization']
self.min_genes = params['min_genes']
self.min_cells = params['min_cells']
self.top_expr_genes = params['top_expr_genes']
self.gtf_file= params['gtf_file']
self.trueEdges = params['trueEdges']
self.kTrain = params['kTrain']
self.kTest = params['kTest']
@@ -66,4 +72,4 @@ def run(self):


def parseOutput(self):
OutputParser[self.name](self)
OutputParser[self.name](self)
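For reference, parseOutput dispatches through the OutputParser lookup table keyed by algorithm name. A minimal sketch of the pattern; the table contents below are assumed for illustration, the real registry lives elsewhere in SGRN:

# Sketch of the dispatch pattern behind OutputParser[self.name](self).
def parse_gae(runner):
    print(f'parsing GAE output for {runner}')  # illustrative stand-in

OutputParser = {'GAE': parse_gae}

class DemoRunner:
    name = 'GAE'
    def parseOutput(self):
        OutputParser[self.name](self)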
59 changes: 35 additions & 24 deletions config/config.yaml
@@ -16,10 +16,21 @@ input_settings:
# trueEdges: Name of the reference network file in the
# edge list format. Required.
datasets:
- name: "mESC"
exprData: "mESC_BEELINE.csv"
- name: "mESC_node_5000"
exprData: "GSE98664_tpm_sailfish_GENCODEvM9_RamDA_cells.txt"
delim: "\t"
trueEdges: "mouse-net.csv"

# Preprocessing
normalization: "TPM" #TPM,CPM, None
# Minimum number of genes a cell has to have; removing cells
min_genes: "200"
# Minimum number of cells a gene must be expressed in; removing genes
min_cells: "3"
top_expr_genes: "5000"
# Transcription Factors and Gene Name
gtf_file: "Mus_musculus.GRCm39.104.gtf"


# Ratio of Negatives:Positives in Training.
kTrain: 1
# Ratio of Negatives:Positives in Testing.
@@ -30,13 +41,13 @@
# Number of times to run the k-fold CV;
# a random seed is generated internally for each run and
# stored as part of the input and output names
nTimes: 1
nTimes: 6 # 10
# If randSeed is specified, it ignores nTimes!!!
randSeed: [774]
#randSeed: [774]  # commented out so that nTimes takes effect
# kFold CV type, 'Edge' or 'Node'
# Edge: Edge CV
# Node: TF holdout CV
CVType: 'Edge'
CVType: 'Node'

# Denotes a list of algorithms to run. Each has the following parameters:
# name: Name of the algorithm. Must be recognized by the pipeline, see
@@ -60,28 +71,28 @@
reconnect_disconnected_nodes: [False]

# CNNC
- name: "CNNC"
params:
# Set log to False for most datasets!!!
# It should be True for datasets from the CNNC paper
# i.e., if expression isn't log transformed already
log: [False]
epochs: [1000]
should_run: [True]
#- name: "CNNC"
# params:
# # Set log to False for most datasets!!!
# # It should be True for datasets from the CNNC paper
# # i.e., if expression isn't log transformed already
# log: [False]
# epochs: [1000]
# should_run: [True]

# MLP
- name: "MLP"
params:
epochs: [1000]
should_run: [True]
# SVM
- name: "SVM"
params:
maxIter: [2500]
should_run: [True]
#- name: "MLP"
# params:
# epochs: [1000]
# should_run: [True]
# # SVM
#- name: "SVM"
# params:
# maxIter: [2500]
# should_run: [True]

# Output Settings: initialize base output folder names
output_settings:
# Base output directory
output_dir: "outputs"
output_prefix: "mESC"
output_prefix: "mESC_node_5000"