Skip to content

Commit

Permalink
Merge pull request #1 from lldelisle/annData
Browse files Browse the repository at this point in the history
Release v1.1.0
  • Loading branch information
lldelisle committed Jun 4, 2021
2 parents e76790f + d05bfde commit 1572c32
Show file tree
Hide file tree
Showing 55 changed files with 3,166 additions and 35 deletions.
2 changes: 1 addition & 1 deletion baredSC/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.0.0'
__version__ = '1.1.0'
24 changes: 17 additions & 7 deletions baredSC/baredSC_1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from samsam import sam

# Local imports
from . common import get_data, permuted, get_Ax, get_prefix_suffix, \
from . common import get_data_txt, get_data_annData, permuted, get_Ax, get_prefix_suffix, \
plot_QC, get_bins_centers
from . oned import logprob, extract_from_npz, write_evidence, \
get_pdf, plots_from_pdf
Expand Down Expand Up @@ -150,11 +150,14 @@ def parse_arguments(args=None):
argpopt_plot = argp.add_argument_group('Optional arguments to get plots and text outputs')
argpopt_loge = argp.add_argument_group('Optional arguments to get logevidence')
# Get data:
argprequired.add_argument('--input', default=None, required=True,
help="Input table with one line per cell"
" columns with raw counts and one column"
" nCount_RNA with total number of UMI per cell"
" optionally other meta data to filter.")
group = argprequired.add_mutually_exclusive_group(required=True)
group.add_argument('--input', default=None,
help="Input table with one line per cell"
" columns with raw counts and one column"
" nCount_RNA with total number of UMI per cell"
" optionally other meta data to filter.")
group.add_argument('--inputAnnData', default=None,
help="Input annData (for example from Scanpy).")
argprequired.add_argument('--geneColName', default=None, required=True,
help="Name of the column with gene counts.")
argpopt_data.add_argument('--metadata1ColName', default=None,
Expand Down Expand Up @@ -286,7 +289,14 @@ def main(args=None):
# Load data
print("Get raw data")
start = time.time()
data = get_data(args.input,
if args.input is not None:
input = args.input
get_data = get_data_txt
else:
input = args.inputAnnData
get_data = get_data_annData

data = get_data(input,
args.metadata1ColName, args.metadata1Values,
args.metadata2ColName, args.metadata2Values,
args.metadata3ColName, args.metadata3Values,
Expand Down
24 changes: 17 additions & 7 deletions baredSC/baredSC_2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from samsam import sam

# Local imports
from . common import get_data, permuted, get_Ax, get_prefix_suffix, \
from . common import get_data_txt, get_data_annData, permuted, get_Ax, get_prefix_suffix, \
plot_QC, get_bins_centers
from . twod import logprob, extract_from_npz, write_evidence, \
get_pdf, plots_from_pdf
Expand Down Expand Up @@ -238,11 +238,14 @@ def parse_arguments(args=None):
argpopt_plot = argp.add_argument_group('Optional arguments to get plots and text outputs')
argpopt_loge = argp.add_argument_group('Optional arguments to get logevidence')
# To get the data
argprequired.add_argument('--input', default=None, required=True,
help="Input table with one line per cell"
" columns with raw counts and one column"
" nCount_RNA with total number of UMI per cell"
" optionaly other meta data to filter.")
group = argprequired.add_mutually_exclusive_group(required=True)
group.add_argument('--input', default=None,
help="Input table with one line per cell"
" columns with raw counts and one column"
" nCount_RNA with total number of UMI per cell"
" optionally other meta data to filter.")
group.add_argument('--inputAnnData', default=None,
help="Input annData (for example from Scanpy).")
argprequired.add_argument('--geneXColName', default=None, required=True,
help="Name of the column with gene counts for gene in x.")
argprequired.add_argument('--geneYColName', default=None, required=True,
Expand Down Expand Up @@ -428,7 +431,14 @@ def main(args=None):
# Load data
print("Get raw data")
start = time.time()
data = get_data(args.input,
if args.input is not None:
input = args.input
get_data = get_data_txt
else:
input = args.inputAnnData
get_data = get_data_annData

data = get_data(input,
args.metadata1ColName, args.metadata1Values,
args.metadata2ColName, args.metadata2Values,
args.metadata3ColName, args.metadata3Values,
Expand Down
24 changes: 17 additions & 7 deletions baredSC/combineMultipleModels_1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from tempfile import NamedTemporaryFile

# Local imports
from . common import get_data, get_bins_centers
from . common import get_data_txt, get_data_annData, get_bins_centers
from . oned import logprob, extract_from_npz, write_evidence, \
get_pdf, plots_from_pdf
from baredSC._version import __version__
Expand Down Expand Up @@ -86,11 +86,14 @@ def parse_arguments(args=None):
argpopt_plot = argp.add_argument_group('Optional arguments to customize plots and text outputs')
argpopt_loge = argp.add_argument_group('Optional arguments to evaluate logevidence')
# Get data:
argprequired.add_argument('--input', default=None, required=True,
help="Input table with one line per cell"
" columns with raw counts and one column"
" nCount_RNA with total number of UMI per cell"
" optionally other meta data to filter.")
group = argprequired.add_mutually_exclusive_group(required=True)
group.add_argument('--input', default=None,
help="Input table with one line per cell"
" columns with raw counts and one column"
" nCount_RNA with total number of UMI per cell"
" optionally other meta data to filter.")
group.add_argument('--inputAnnData', default=None,
help="Input annData (for example from Scanpy).")
argprequired.add_argument('--geneColName', default=None, required=True,
help="Name of the column with gene counts.")
argpopt_data.add_argument('--metadata1ColName', default=None,
Expand Down Expand Up @@ -198,7 +201,14 @@ def main(args=None):
# Load data
print("Get raw data")
start = time.time()
data = get_data(args.input,
if args.input is not None:
input = args.input
get_data = get_data_txt
else:
input = args.inputAnnData
get_data = get_data_annData

data = get_data(input,
args.metadata1ColName, args.metadata1Values,
args.metadata2ColName, args.metadata2Values,
args.metadata3ColName, args.metadata3Values,
Expand Down
24 changes: 17 additions & 7 deletions baredSC/combineMultipleModels_2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from tempfile import NamedTemporaryFile

# Local imports
from . common import get_data, get_bins_centers, get_Neff
from . common import get_data_txt, get_data_annData, get_bins_centers, get_Neff
from . twod import logprob, extract_from_npz, write_evidence, \
get_pdf, plots_from_pdf
from baredSC._version import __version__
Expand Down Expand Up @@ -124,11 +124,14 @@ def parse_arguments(args=None):
argpopt_plot = argp.add_argument_group('Optional arguments to customize plots and text outputs')
argpopt_loge = argp.add_argument_group('Optional arguments to evaluate logevidence')
# Get data:
argprequired.add_argument('--input', default=None, required=True,
help="Input table with one line per cell"
" columns with raw counts and one column"
" nCount_RNA with total number of UMI per cell"
" optionally other meta data to filter.")
group = argprequired.add_mutually_exclusive_group(required=True)
group.add_argument('--input', default=None,
help="Input table with one line per cell"
" columns with raw counts and one column"
" nCount_RNA with total number of UMI per cell"
" optionally other meta data to filter.")
group.add_argument('--inputAnnData', default=None,
help="Input annData (for example from Scanpy).")
argprequired.add_argument('--geneXColName', default=None, required=True,
help="Name of the column with gene counts for gene in x.")
argprequired.add_argument('--geneYColName', default=None, required=True,
Expand Down Expand Up @@ -295,7 +298,14 @@ def main(args=None):
# Load data
print("Get raw data")
start = time.time()
data = get_data(args.input,
if args.input is not None:
input = args.input
get_data = get_data_txt
else:
input = args.inputAnnData
get_data = get_data_annData

data = get_data(input,
args.metadata1ColName, args.metadata1Values,
args.metadata2ColName, args.metadata2Values,
args.metadata3ColName, args.metadata3Values,
Expand Down
113 changes: 109 additions & 4 deletions baredSC/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import poisson
from scipy.sparse import issparse
import anndata
from samsam import acf

mpl.use('agg')


def get_data(input_file, metadata1_cn, metadata1_v,
metadata2_cn, metadata2_v,
metadata3_cn, metadata3_v,
genes):
def get_data_txt(input_file, metadata1_cn, metadata1_v,
metadata2_cn, metadata2_v,
metadata3_cn, metadata3_v,
genes):
"""
Create a dataframe from input file
keep only rows matching metadata values
Expand All @@ -34,6 +36,109 @@ def get_data(input_file, metadata1_cn, metadata1_v,
return(data[genes + ['nCount_RNA']])


def get_raw_mat_from_annData(adata, used_raw=False, round_threshold=0.01):
"""
Retrieves the raw counts matrix from annData.
We expect that if log and norm were applied
it was applied first Norm and then Log
"""
add_message = ""
if used_raw:
add_message = "in .raw"
# Transform sparse to array
if issparse(adata.X):
X = adata.X.toarray()
else:
X = np.copy(adata.X)
# Check if the data (X) are all positive (not scaled):
if not np.all(X >= 0):
raise Exception(f"Negative values found {add_message}. Cannot go back to raw counts.")
# Check if they are already raw_counts:
if np.array_equal(X, X.astype(int)):
return(X)
# Check if a log was applied
if 'log1p' in adata.uns:
base = adata.uns['log1p']['base']
if base is not None:
X *= np.log(base)
X = np.expm1(X)
add_message += " delogged"
# Check if normalization was done:
# We assume that if all values
# are really close to integer no norm was performed.
if np.max(np.abs(X - np.round(X))) < round_threshold:
return(np.round(X))
# We first check that all values are there:
# Try to find the target_sum:
N_t = np.median(X.sum(1))
if np.max(np.abs(X.sum(1) - N_t)) > 0.1:
# Maybe it was logged but the lop1p in uns has not been exported
X = np.expm1(X)
add_message += " delogged"
if np.max(np.abs(X - np.round(X))) < round_threshold:
return(np.round(X))
N_t = np.median(X.sum(1))
if np.max(np.abs(X.sum(1) - N_t)) > 0.1:
raise Exception("The sum of expression for each cell"
f" {add_message} is not constant."
" Genes are probably missing.")

# We could also check the number of genes detected with:
# np.array_equal(np.sum(X > 0, axis=1), adata.obs['n_genes_by_counts'])
if 'total_counts' not in adata.obs.keys():
raise Exception(f"The expression {add_message}seems normalized"
" but 'total_counts' is not in obs."
"Cannot get raw values.")
# Denormalize:
X = X * adata.obs['total_counts'][:, None] / N_t
if np.max(np.abs(X - np.round(X))) < round_threshold:
return(np.round(X))
else:
raise Exception(f"Could not extract raw values of expression {add_message}.")


def get_data_annData(input_file, metadata1_cn, metadata1_v,
metadata2_cn, metadata2_v,
metadata3_cn, metadata3_v,
genes):
"""
Create a dataframe from input file
keep only rows matching metadata values
"""
# First load the annData:
adata = anndata.read(input_file)
raw_mat = None
used_raw = False
# If a raw exists we will use it:
if adata.raw is not None:
adata = adata.raw.to_adata()
used_raw = True
raw_mat = get_raw_mat_from_annData(adata, used_raw)
data = pd.DataFrame(data=raw_mat,
index=adata.obs_names,
columns=adata.var_names)
data = pd.concat([data, adata.obs], axis=1)
if 'total_counts' in data.keys():
data['nCount_RNA'] = data['total_counts']
else:
data['nCount_RNA'] = raw_mat.sum(1)

filters = {metadata1_cn: metadata1_v,
metadata2_cn: metadata2_v,
metadata3_cn: metadata3_v}
for col_name in filters:
if col_name is None:
continue
data = data[data[col_name].astype(str).isin(filters[col_name].split(','))]
print(f"You have {data.shape[0]} cells.")
if data.shape[0] == 0:
raise Exception("No cell with selected filters.")
for col_name in genes + ['nCount_RNA']:
if col_name not in data.columns:
raise Exception(f"{col_name} not in the data table.")
return(data[genes + ['nCount_RNA']])


# Permut p to get the permutation of p the closest to pref
def permuted(p, pref, wdist, permut, n_param_per_gauss):
amp = p[(n_param_per_gauss - 1)::n_param_per_gauss]
Expand Down
1 change: 1 addition & 0 deletions baredSC_dev_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies:
- pandas >=0.25.0
- scipy >=1.3.0
- corner >=2.0.0
- anndata >=0.7
- ipython # For the doc
- sphinx-argparse # For the doc
- twine # For the upload
Expand Down
1 change: 1 addition & 0 deletions baredSC_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dependencies:
- pandas >=0.25.0
- scipy >=1.3.0
- corner >=2.0.0
- anndata >=0.7
- pip
- pip:
- --extra-index-url https://obswww.unige.ch/~delisle
Expand Down
6 changes: 6 additions & 0 deletions docs/content/citation.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Citation
========

If you use baredSC in your analysis, you can cite the following paper:

Lucille Lopez-Delisle and Jean-Baptiste Delisle. baredSC: Bayesian Approach to Retrieve Expression Distribution of Single-Cell. bioRxiv 2021.05.26.445740; `doi:10.1101/2021.05.26.445740 <https://doi.org/10.1101/2021.05.26.445740>`_.
9 changes: 9 additions & 0 deletions docs/content/releases.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
Releases
========

1.1.0
-----

Improvements:
^^^^^^^^^^^^^

- Support annData as input with ``--inputAnnData``


1.0.0
-----

Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ baredSC is a tool that uses a Monte-Carlo Markov Chain to estimate a confidence
content/outputs.rst
content/tutorial_sim.rst
content/releases.rst
content/citation.rst

Indices and tables
==================
Expand Down
1 change: 1 addition & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ samsam
sphinx-argparse
sphinx-autorun
IPython
anndata
Binary file added example/annData_1d_1gauss.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added example/annData_1d_1gauss_convergence.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added example/annData_1d_1gauss_corner.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added example/annData_1d_1gauss_individuals.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions example/annData_1d_1gauss_logevid.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-151.19709986628135
Binary file added example/annData_1d_1gauss_means.txt.gz
Binary file not shown.
1 change: 1 addition & 0 deletions example/annData_1d_1gauss_neff.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1994.0513091885653
Binary file added example/annData_1d_1gauss_p.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 3 additions & 0 deletions example/annData_1d_1gauss_p.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
name low median high
mu0 -1.6792912752225149 0.2678167039707984 0.7393956920541539
scale0 0.7895420573518143 1.0515526423007606 1.749748203427554
Loading

0 comments on commit 1572c32

Please sign in to comment.