Merge pull request #1 from lldelisle/annData

Release v1.1.0
lldelisle · Jun 4, 2021 · 1572c32 · 1572c32
2 parents e76790f + d05bfde
commit 1572c32
Show file tree

Hide file tree

Showing 55 changed files with 3,166 additions and 35 deletions.
diff --git a/baredSC/_version.py b/baredSC/_version.py
@@ -1 +1 @@
-__version__ = '1.0.0'
+__version__ = '1.1.0'
diff --git a/baredSC/baredSC_1d.py b/baredSC/baredSC_1d.py
@@ -11,7 +11,7 @@
 from samsam import sam
 
 # Local imports
-from . common import get_data, permuted, get_Ax, get_prefix_suffix, \
+from . common import get_data_txt, get_data_annData, permuted, get_Ax, get_prefix_suffix, \
     plot_QC, get_bins_centers
 from . oned import logprob, extract_from_npz, write_evidence, \
     get_pdf, plots_from_pdf
@@ -150,11 +150,14 @@ def parse_arguments(args=None):
   argpopt_plot = argp.add_argument_group('Optional arguments to get plots and text outputs')
   argpopt_loge = argp.add_argument_group('Optional arguments to get logevidence')
   # Get data:
-  argprequired.add_argument('--input', default=None, required=True,
-                            help="Input table with one line per cell"
-                            " columns with raw counts and one column"
-                            " nCount_RNA with total number of UMI per cell"
-                            " optionally other meta data to filter.")
+  group = argprequired.add_mutually_exclusive_group(required=True)
+  group.add_argument('--input', default=None,
+                     help="Input table with one line per cell"
+                     " columns with raw counts and one column"
+                     " nCount_RNA with total number of UMI per cell"
+                     " optionally other meta data to filter.")
+  group.add_argument('--inputAnnData', default=None,
+                     help="Input annData (for example from Scanpy).")
   argprequired.add_argument('--geneColName', default=None, required=True,
                             help="Name of the column with gene counts.")
   argpopt_data.add_argument('--metadata1ColName', default=None,
@@ -286,7 +289,14 @@ def main(args=None):
   # Load data
   print("Get raw data")
   start = time.time()
-  data = get_data(args.input,
+  if args.input is not None:
+    input = args.input
+    get_data = get_data_txt
+  else:
+    input = args.inputAnnData
+    get_data = get_data_annData
+
+  data = get_data(input,
                   args.metadata1ColName, args.metadata1Values,
                   args.metadata2ColName, args.metadata2Values,
                   args.metadata3ColName, args.metadata3Values,

diff --git a/baredSC/baredSC_2d.py b/baredSC/baredSC_2d.py
@@ -12,7 +12,7 @@
 from samsam import sam
 
 # Local imports
-from . common import get_data, permuted, get_Ax, get_prefix_suffix, \
+from . common import get_data_txt, get_data_annData, permuted, get_Ax, get_prefix_suffix, \
     plot_QC, get_bins_centers
 from . twod import logprob, extract_from_npz, write_evidence, \
     get_pdf, plots_from_pdf
@@ -238,11 +238,14 @@ def parse_arguments(args=None):
   argpopt_plot = argp.add_argument_group('Optional arguments to get plots and text outputs')
   argpopt_loge = argp.add_argument_group('Optional arguments to get logevidence')
   # To get the data
-  argprequired.add_argument('--input', default=None, required=True,
-                            help="Input table with one line per cell"
-                            " columns with raw counts and one column"
-                            " nCount_RNA with total number of UMI per cell"
-                            " optionaly other meta data to filter.")
+  group = argprequired.add_mutually_exclusive_group(required=True)
+  group.add_argument('--input', default=None,
+                     help="Input table with one line per cell"
+                     " columns with raw counts and one column"
+                     " nCount_RNA with total number of UMI per cell"
+                     " optionally other meta data to filter.")
+  group.add_argument('--inputAnnData', default=None,
+                     help="Input annData (for example from Scanpy).")
   argprequired.add_argument('--geneXColName', default=None, required=True,
                             help="Name of the column with gene counts for gene in x.")
   argprequired.add_argument('--geneYColName', default=None, required=True,
@@ -428,7 +431,14 @@ def main(args=None):
   # Load data
   print("Get raw data")
   start = time.time()
-  data = get_data(args.input,
+  if args.input is not None:
+    input = args.input
+    get_data = get_data_txt
+  else:
+    input = args.inputAnnData
+    get_data = get_data_annData
+
+  data = get_data(input,
                   args.metadata1ColName, args.metadata1Values,
                   args.metadata2ColName, args.metadata2Values,
                   args.metadata3ColName, args.metadata3Values,

diff --git a/baredSC/combineMultipleModels_1d.py b/baredSC/combineMultipleModels_1d.py
@@ -7,7 +7,7 @@
 from tempfile import NamedTemporaryFile
 
 # Local imports
-from . common import get_data, get_bins_centers
+from . common import get_data_txt, get_data_annData, get_bins_centers
 from . oned import logprob, extract_from_npz, write_evidence, \
     get_pdf, plots_from_pdf
 from baredSC._version import __version__
@@ -86,11 +86,14 @@ def parse_arguments(args=None):
   argpopt_plot = argp.add_argument_group('Optional arguments to customize plots and text outputs')
   argpopt_loge = argp.add_argument_group('Optional arguments to evaluate logevidence')
   # Get data:
-  argprequired.add_argument('--input', default=None, required=True,
-                            help="Input table with one line per cell"
-                            " columns with raw counts and one column"
-                            " nCount_RNA with total number of UMI per cell"
-                            " optionally other meta data to filter.")
+  group = argprequired.add_mutually_exclusive_group(required=True)
+  group.add_argument('--input', default=None,
+                     help="Input table with one line per cell"
+                     " columns with raw counts and one column"
+                     " nCount_RNA with total number of UMI per cell"
+                     " optionally other meta data to filter.")
+  group.add_argument('--inputAnnData', default=None,
+                     help="Input annData (for example from Scanpy).")
   argprequired.add_argument('--geneColName', default=None, required=True,
                             help="Name of the column with gene counts.")
   argpopt_data.add_argument('--metadata1ColName', default=None,
@@ -198,7 +201,14 @@ def main(args=None):
   # Load data
   print("Get raw data")
   start = time.time()
-  data = get_data(args.input,
+  if args.input is not None:
+    input = args.input
+    get_data = get_data_txt
+  else:
+    input = args.inputAnnData
+    get_data = get_data_annData
+
+  data = get_data(input,
                   args.metadata1ColName, args.metadata1Values,
                   args.metadata2ColName, args.metadata2Values,
                   args.metadata3ColName, args.metadata3Values,

diff --git a/baredSC/combineMultipleModels_2d.py b/baredSC/combineMultipleModels_2d.py
@@ -7,7 +7,7 @@
 from tempfile import NamedTemporaryFile
 
 # Local imports
-from . common import get_data, get_bins_centers, get_Neff
+from . common import get_data_txt, get_data_annData, get_bins_centers, get_Neff
 from . twod import logprob, extract_from_npz, write_evidence, \
     get_pdf, plots_from_pdf
 from baredSC._version import __version__
@@ -124,11 +124,14 @@ def parse_arguments(args=None):
   argpopt_plot = argp.add_argument_group('Optional arguments to customize plots and text outputs')
   argpopt_loge = argp.add_argument_group('Optional arguments to evaluate logevidence')
   # Get data:
-  argprequired.add_argument('--input', default=None, required=True,
-                            help="Input table with one line per cell"
-                            " columns with raw counts and one column"
-                            " nCount_RNA with total number of UMI per cell"
-                            " optionally other meta data to filter.")
+  group = argprequired.add_mutually_exclusive_group(required=True)
+  group.add_argument('--input', default=None,
+                     help="Input table with one line per cell"
+                     " columns with raw counts and one column"
+                     " nCount_RNA with total number of UMI per cell"
+                     " optionally other meta data to filter.")
+  group.add_argument('--inputAnnData', default=None,
+                     help="Input annData (for example from Scanpy).")
   argprequired.add_argument('--geneXColName', default=None, required=True,
                             help="Name of the column with gene counts for gene in x.")
   argprequired.add_argument('--geneYColName', default=None, required=True,
@@ -295,7 +298,14 @@ def main(args=None):
   # Load data
   print("Get raw data")
   start = time.time()
-  data = get_data(args.input,
+  if args.input is not None:
+    input = args.input
+    get_data = get_data_txt
+  else:
+    input = args.inputAnnData
+    get_data = get_data_annData
+
+  data = get_data(input,
                   args.metadata1ColName, args.metadata1Values,
                   args.metadata2ColName, args.metadata2Values,
                   args.metadata3ColName, args.metadata3Values,

diff --git a/baredSC/common.py b/baredSC/common.py
@@ -4,15 +4,17 @@
 import matplotlib.pyplot as plt
 import pandas as pd
 from scipy.stats import poisson
+from scipy.sparse import issparse
+import anndata
 from samsam import acf
 
 mpl.use('agg')
 
 
-def get_data(input_file, metadata1_cn, metadata1_v,
-             metadata2_cn, metadata2_v,
-             metadata3_cn, metadata3_v,
-             genes):
+def get_data_txt(input_file, metadata1_cn, metadata1_v,
+                 metadata2_cn, metadata2_v,
+                 metadata3_cn, metadata3_v,
+                 genes):
   """
   Create a dataframe from input file
   keep only rows matching metadata values
@@ -34,6 +36,109 @@ def get_data(input_file, metadata1_cn, metadata1_v,
   return(data[genes + ['nCount_RNA']])
 
 
+def get_raw_mat_from_annData(adata, used_raw=False, round_threshold=0.01):
+  """
+  Retrieves the raw counts matrix from annData.
+  We expect that if log and norm were applied
+  it was applied first Norm and then Log
+  """
+  add_message = ""
+  if used_raw:
+    add_message = "in .raw"
+  # Transform sparse to array
+  if issparse(adata.X):
+    X = adata.X.toarray()
+  else:
+    X = np.copy(adata.X)
+  # Check if the data (X) are all positive (not scaled):
+  if not np.all(X >= 0):
+    raise Exception(f"Negative values found {add_message}. Cannot go back to raw counts.")
+  # Check if they are already raw_counts:
+  if np.array_equal(X, X.astype(int)):
+    return(X)
+  # Check if a log was applied
+  if 'log1p' in adata.uns:
+    base = adata.uns['log1p']['base']
+    if base is not None:
+      X *= np.log(base)
+    X = np.expm1(X)
+    add_message += " delogged"
+  # Check if normalization was done:
+  # We assume that if all values
+  # are really close to integer no norm was performed.
+  if np.max(np.abs(X - np.round(X))) < round_threshold:
+    return(np.round(X))
+  # We first check that all values are there:
+  # Try to find the target_sum:
+  N_t = np.median(X.sum(1))
+  if np.max(np.abs(X.sum(1) - N_t)) > 0.1:
+    # Maybe it was logged but the lop1p in uns has not been exported
+    X = np.expm1(X)
+    add_message += " delogged"
+    if np.max(np.abs(X - np.round(X))) < round_threshold:
+      return(np.round(X))
+    N_t = np.median(X.sum(1))
+    if np.max(np.abs(X.sum(1) - N_t)) > 0.1:
+      raise Exception("The sum of expression for each cell"
+                      f" {add_message} is not constant."
+                      " Genes are probably missing.")
+
+  # We could also check the number of genes detected with:
+  # np.array_equal(np.sum(X > 0, axis=1), adata.obs['n_genes_by_counts'])
+  if 'total_counts' not in adata.obs.keys():
+    raise Exception(f"The expression {add_message}seems normalized"
+                    " but 'total_counts' is not in obs."
+                    "Cannot get raw values.")
+  # Denormalize:
+  X = X * adata.obs['total_counts'][:, None] / N_t
+  if np.max(np.abs(X - np.round(X))) < round_threshold:
+    return(np.round(X))
+  else:
+    raise Exception(f"Could not extract raw values of expression {add_message}.")
+
+
+def get_data_annData(input_file, metadata1_cn, metadata1_v,
+                     metadata2_cn, metadata2_v,
+                     metadata3_cn, metadata3_v,
+                     genes):
+  """
+  Create a dataframe from input file
+  keep only rows matching metadata values
+  """
+  # First load the annData:
+  adata = anndata.read(input_file)
+  raw_mat = None
+  used_raw = False
+  # If a raw exists we will use it:
+  if adata.raw is not None:
+    adata = adata.raw.to_adata()
+    used_raw = True
+  raw_mat = get_raw_mat_from_annData(adata, used_raw)
+  data = pd.DataFrame(data=raw_mat,
+                      index=adata.obs_names,
+                      columns=adata.var_names)
+  data = pd.concat([data, adata.obs], axis=1)
+  if 'total_counts' in data.keys():
+    data['nCount_RNA'] = data['total_counts']
+  else:
+    data['nCount_RNA'] = raw_mat.sum(1)
+
+  filters = {metadata1_cn: metadata1_v,
+             metadata2_cn: metadata2_v,
+             metadata3_cn: metadata3_v}
+  for col_name in filters:
+    if col_name is None:
+      continue
+    data = data[data[col_name].astype(str).isin(filters[col_name].split(','))]
+  print(f"You have {data.shape[0]} cells.")
+  if data.shape[0] == 0:
+    raise Exception("No cell with selected filters.")
+  for col_name in genes + ['nCount_RNA']:
+    if col_name not in data.columns:
+      raise Exception(f"{col_name} not in the data table.")
+  return(data[genes + ['nCount_RNA']])
+
+
 # Permut p to get the permutation of p the closest to pref
 def permuted(p, pref, wdist, permut, n_param_per_gauss):
   amp = p[(n_param_per_gauss - 1)::n_param_per_gauss]

diff --git a/baredSC_dev_env.yml b/baredSC_dev_env.yml
@@ -13,6 +13,7 @@ dependencies:
    - pandas >=0.25.0
    - scipy >=1.3.0
    - corner >=2.0.0
+   - anndata >=0.7
    - ipython # For the doc
    - sphinx-argparse # For the doc
    - twine # For the upload

diff --git a/baredSC_env.yml b/baredSC_env.yml
@@ -9,6 +9,7 @@ dependencies:
    - pandas >=0.25.0
    - scipy >=1.3.0
    - corner >=2.0.0
+   - anndata >=0.7
    - pip
    - pip:
       - --extra-index-url https://obswww.unige.ch/~delisle

diff --git a/docs/content/citation.rst b/docs/content/citation.rst
@@ -0,0 +1,6 @@
+Citation
+========
+
+If you use baredSC in your analysis, you can cite the following paper:
+
+Lucille Lopez-Delisle and Jean-Baptiste Delisle. baredSC: Bayesian Approach to Retrieve Expression Distribution of Single-Cell. bioRxiv 2021.05.26.445740; `doi:10.1101/2021.05.26.445740 <https://doi.org/10.1101/2021.05.26.445740>`_.
diff --git a/docs/content/releases.rst b/docs/content/releases.rst
@@ -1,6 +1,15 @@
 Releases
 ========
 
+1.1.0
+-----
+
+Improvements:
+^^^^^^^^^^^^^
+
+- Support annData as input with ``--inputAnnData``
+
+
 1.0.0
 -----
 

diff --git a/docs/index.rst b/docs/index.rst
@@ -20,6 +20,7 @@ baredSC is a tool that uses a Monte-Carlo Markov Chain to estimate a confidence
    content/outputs.rst
    content/tutorial_sim.rst
    content/releases.rst
+   content/citation.rst
 
 Indices and tables
 ==================

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -8,3 +8,4 @@ samsam
 sphinx-argparse
 sphinx-autorun
 IPython
+anndata
diff --git a/example/annData_1d_1gauss.png b/example/annData_1d_1gauss.png
diff --git a/example/annData_1d_1gauss_convergence.png b/example/annData_1d_1gauss_convergence.png
diff --git a/example/annData_1d_1gauss_corner.png b/example/annData_1d_1gauss_corner.png
diff --git a/example/annData_1d_1gauss_individuals.png b/example/annData_1d_1gauss_individuals.png
diff --git a/example/annData_1d_1gauss_logevid.txt b/example/annData_1d_1gauss_logevid.txt
@@ -0,0 +1 @@
+-151.19709986628135
diff --git a/example/annData_1d_1gauss_means.txt.gz b/example/annData_1d_1gauss_means.txt.gz
diff --git a/example/annData_1d_1gauss_neff.txt b/example/annData_1d_1gauss_neff.txt
@@ -0,0 +1 @@
+1994.0513091885653
diff --git a/example/annData_1d_1gauss_p.png b/example/annData_1d_1gauss_p.png
diff --git a/example/annData_1d_1gauss_p.txt b/example/annData_1d_1gauss_p.txt
@@ -0,0 +1,3 @@
+name	low	median	high
+mu0	-1.6792912752225149	0.2678167039707984	0.7393956920541539
+scale0	0.7895420573518143	1.0515526423007606	1.749748203427554