metadata AWS (#860)

Zethson · web-flow · commit 275f67950510 · 2025-10-03T13:49:04.000+02:00
Signed-off-by: Lukas Heumos <lukas.heumos@posteo.net>
diff --git a/pertpy/metadata/_cell_line.py b/pertpy/metadata/_cell_line.py
@@ -48,7 +48,7 @@ def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"
             depmap_cell_line_path = Path(settings.cachedir) / "depmap_23Q4_info.csv"
             if not Path(depmap_cell_line_path).exists():
                 _download(
-                    url="https://ndownloader.figshare.com/files/43746708",
+                    url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/depmap_23Q4_info.csv",
                     output_file_name="depmap_23Q4_info.csv",
                     output_path=settings.cachedir,
                     block_size=4096,
@@ -59,52 +59,24 @@ def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"
         else:
             # Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project
             # Source: https://www.cancerrxgene.org/celllines
-            cancerxgene_cell_line_path = Path(settings.cachedir) / "cell_line_cancer_project.csv"
             transformed_cancerxgene_cell_line_path = Path(settings.cachedir) / "cancerrxgene_info.csv"
-
-            if not Path(transformed_cancerxgene_cell_line_path).exists():
-                if not Path(cancerxgene_cell_line_path).exists():
-                    _download(
-                        url="https://www.cancerrxgene.org/api/celllines?list=all&sEcho=1&iColumns=7&sColumns=&"
-                        "iDisplayStart=0&iDisplayLength=25&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&"
-                        "mDataProp_4=4&mDataProp_5=5&mDataProp_6=6&sSearch=&bRegex=false&sSearch_0=&bRegex_0=false&"
-                        "bSearchable_0=true&sSearch_1=&bRegex_1=false&bSearchable_1=true&sSearch_2=&bRegex_2=false&"
-                        "bSearchable_2=true&sSearch_3=&bRegex_3=false&bSearchable_3=true&sSearch_4=&bRegex_4=false&"
-                        "bSearchable_4=true&sSearch_5=&bRegex_5=false&bSearchable_5=true&sSearch_6=&bRegex_6=false&"
-                        "bSearchable_6=true&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&"
-                        "bSortable_2=true&bSortable_3=true&bSortable_4=true&bSortable_5=true&bSortable_6=true&export=csv",
-                        output_file_name="cell_line_cancer_project.csv",
-                        output_path=settings.cachedir,
-                        block_size=4096,
-                        is_zip=False,
-                    )
-                self.cancerxgene = pd.read_csv(cancerxgene_cell_line_path)
-                self.cancerxgene.columns = self.cancerxgene.columns.str.strip()
-                self.cancerxgene["stripped_cell_line_name"] = (
-                    self.cancerxgene["Cell line Name"]
-                    .str.replace(r"\-|\.", "", regex=True)
-                    .str.upper()
-                    .astype("category")
+            if not transformed_cancerxgene_cell_line_path.exists():
+                _download(
+                    url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/cancerrxgene_info.csv",
+                    output_file_name="cancerrxgene_info.csv",
+                    output_path=settings.cachedir,
+                    block_size=4096,
+                    is_zip=False,
                 )
-                # pivot the data frame so that each cell line has only one row of metadata
-                index_col = set(self.cancerxgene.columns) - {
-                    "Datasets",
-                    "number of drugs",
-                }
-                self.cancerxgene = self.cancerxgene.pivot(index=index_col, columns="Datasets", values="number of drugs")
-                self.cancerxgene.columns.name = None
-                self.cancerxgene = self.cancerxgene.reset_index().rename(columns={"Cell line Name": "cell_line_name"})
-                self.cancerxgene.to_csv(transformed_cancerxgene_cell_line_path)
-            else:
-                self.cancerxgene = pd.read_csv(transformed_cancerxgene_cell_line_path, index_col=0)
+            self.cancerxgene = pd.read_csv(transformed_cancerxgene_cell_line_path, index_col=0)
 
     def _download_gene_annotation(self) -> None:
         # Download metadata for driver genes from DepMap.Sanger
         # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation)
         gene_annotation_file_path = Path(settings.cachedir) / "genes_info.csv"
         if not Path(gene_annotation_file_path).exists():
             _download(
-                url="https://cog.sanger.ac.uk/cmp/download/gene_identifiers_20191101.csv",
+                url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/genes_info.csv",
                 output_file_name="genes_info.csv",
                 output_path=settings.cachedir,
                 block_size=4096,
@@ -121,7 +93,7 @@ def _download_bulk_rna(self, cell_line_source: Literal["broad", "sanger"] = "bro
             bulk_rna_sanger_file_path = Path(settings.cachedir) / "rnaseq_sanger_info.csv"
             if not Path(bulk_rna_sanger_file_path).exists():
                 _download(
-                    url="https://figshare.com/ndownloader/files/42467103",
+                    url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/rnaseq_sanger_info.csv",
                     output_file_name="rnaseq_sanger_info.csv",
                     output_path=settings.cachedir,
                     block_size=4096,
@@ -134,7 +106,7 @@ def _download_bulk_rna(self, cell_line_source: Literal["broad", "sanger"] = "bro
             bulk_rna_broad_file_path = Path(settings.cachedir) / "rnaseq_depmap_info.csv"
             if not Path(bulk_rna_broad_file_path).exists():
                 _download(
-                    url="https://figshare.com/ndownloader/files/34989922",
+                    url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/rnaseq_depmap_info.csv",
                     output_file_name="rnaseq_depmap_info.csv",
                     output_path=settings.cachedir,
                     block_size=4096,
@@ -148,7 +120,7 @@ def _download_proteomics(self) -> None:
         proteomics_file_path = Path(settings.cachedir) / "proteomics_info.csv"
         if not Path(proteomics_file_path).exists():
             _download(
-                url="https://figshare.com/ndownloader/files/42468393",
+                url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/proteomics_info.csv",
                 output_file_name="proteomics_info.csv",
                 output_path=settings.cachedir,
                 block_size=4096,
@@ -164,7 +136,7 @@ def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
             drug_response_gdsc1_file_path = Path(settings.cachedir) / "gdsc1_info.csv"
             if not Path(drug_response_gdsc1_file_path).exists():
                 _download(
-                    url="https://figshare.com/ndownloader/files/43757235",
+                    url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/gdsc1_info.csv",
                     output_file_name="gdsc1_info.csv",
                     output_path=settings.cachedir,
                     block_size=4096,
@@ -175,7 +147,7 @@ def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
             drug_response_gdsc2_file_path = Path(settings.cachedir) / "gdsc2_info.csv"
             if not Path(drug_response_gdsc2_file_path).exists():
                 _download(
-                    url="https://figshare.com/ndownloader/files/43757232",
+                    url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/gdsc2_info.csv",
                     output_file_name="gdsc2_info.csv",
                     output_path=settings.cachedir,
                     block_size=4096,
@@ -189,7 +161,7 @@ def _download_prism(self) -> None:
         drug_response_prism_file_path = Path(settings.cachedir) / "prism_info.csv"
         if not Path(drug_response_prism_file_path).exists():
             _download(
-                url="https://figshare.com/ndownloader/files/20237739",
+                url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/prism_info.csv",
                 output_file_name="prism_info.csv",
                 output_path=settings.cachedir,
                 block_size=4096,
@@ -253,7 +225,7 @@ def annotate(
                 query_id = "stripped_cell_line_name"
                 logger.error(
                     "`stripped_cell_line_name` is used as reference and query identifier to annotate cell line metadata from Cancerrxgene. "
-                    "Ensure that stripped cell line names are available in 'adata.obs.' or use the DepMap as `cell_line_source` to annotate the cell line first."
+                    "Ensure that stripped cell line names are available in 'adata.obs.' or use the DepMap as `cell_line_source` first."
                 )
             if self.cancerxgene is None:
                 self._download_cell_line(cell_line_source="Cancerrxgene")
diff --git a/pertpy/metadata/_drug.py b/pertpy/metadata/_drug.py
@@ -27,7 +27,7 @@ def _download_drug_annotation(
         chembl_path = Path(settings.cachedir) / "chembl.json"
         if not Path(chembl_path).exists():
             _download(
-                url="https://figshare.com/ndownloader/files/43871718",
+                url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/chembl.json",
                 output_file_name="chembl.json",
                 output_path=settings.cachedir,
                 block_size=4096,
@@ -41,7 +41,7 @@ def _download_drug_annotation(
         dgidb_path = Path(settings.cachedir) / "dgidb.tsv"
         if not Path(dgidb_path).exists():
             _download(
-                url="https://www.dgidb.org/data/latest/interactions.tsv",
+                url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/dgidb.tsv",
                 output_file_name="dgidb.tsv",
                 output_path=settings.cachedir,
                 block_size=4096,
@@ -54,7 +54,7 @@ def _download_drug_annotation(
         pharmgkb_path = Path(settings.cachedir) / "pharmgkb.tsv"
         if not Path(pharmgkb_path).exists():
             _download(
-                url="https://api.pharmgkb.org/v1/download/file/data/relationships.zip",
+                url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/pharmgkb.zip",
                 output_file_name="pharmgkb.zip",
                 output_path=settings.cachedir,
                 block_size=4096,
diff --git a/pertpy/metadata/_moa.py b/pertpy/metadata/_moa.py
@@ -26,7 +26,7 @@ def _download_clue(self) -> None:
         clue_path = Path(settings.cachedir) / "repurposing_drugs_20200324.txt"
         if not Path(clue_path).exists():
             _download(
-                url="https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20200324.txt",
+                url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/repurposing_drugs_20200324.txt",
                 output_file_name="repurposing_drugs_20200324.txt",
                 output_path=settings.cachedir,
                 block_size=4096,
diff --git a/pertpy/tools/_differential_gene_expression/_simple_tests.py b/pertpy/tools/_differential_gene_expression/_simple_tests.py
@@ -191,8 +191,8 @@ class PermutationTest(SimpleComparisonBase):
     The permutation test relies on another test statistic (e.g. t-statistic or your own) to obtain a p-value through
     random permutations of the data and repeated generation of the test statistic.
 
-    For paired tests, each paired observation is permuted together and distributed randomly between the two groups. For
-    unpaired tests, all observations are permuted independently.
+    For paired tests, each paired observation is permuted together and distributed randomly between the two groups.
+    For unpaired tests, all observations are permuted independently.
 
     The null hypothesis for the unpaired test is that all observations come from the same underlying distribution and
     have been randomly assigned to one of the samples.
@@ -224,20 +224,21 @@ def compare_groups(
             adata: Data with observations to compare.
             column: Column in `adata.obs` that contains the groups to compare.
             baseline: Reference group.
-            groups_to_compare: Groups to compare against the baseline. If None, all other groups
-                are compared.
-            paired_by: Column in `adata.obs` to use for pairing. If None, an unpaired test is performed.
+            groups_to_compare: Groups to compare against the baseline.
+                If None, all other groups are compared.
+            paired_by: Column in `adata.obs` to use for pairing.
+                If None, an unpaired test is performed.
             mask: Mask to apply to the data.
             layer: Layer to use for the comparison.
             n_permutations: Number of permutations to perform.
             test_statistic: A callable that takes two arrays (x0, x1) and returns a float statistic.
                 Defaults to log2 fold change with pseudocount: log2(mean(x1) + 1e-8) - log2(mean(x0) + 1e-8).
                 The callable should have signature: test_statistic(x0, x1) -> float.
             fit_kwargs: Unused argument for compatibility with the `MethodBase` interface, do not specify.
-            test_kwargs: Additional kwargs passed to the permutation test function (not the test statistic). The
-                permutation test function is `scipy.stats.permutation_test`, so please refer to its documentation for
-                available options. Note that `test_statistic` and `n_permutations` are set by this function and should
-                not be provided here.
+            test_kwargs: Additional kwargs passed to the permutation test function (not the test statistic).
+                The permutation test function is `scipy.stats.permutation_test`.
+                We refer to its documentation for available options.
+                Note that `test_statistic` and `n_permutations` are set by this function and should not be provided here.
 
         Examples:
             >>> # Difference in means (log fold change)
@@ -284,8 +285,9 @@ def _test(
             x0: Array with baseline values.
             x1: Array with values to compare.
             paired: Whether to perform a paired test.
-            test_statistic: A callable that takes two arrays (x0, x1) and returns a float statistic. Please refer to
-                the examples below for usage. The callable should have signature: test_statistic(x0, x1) -> float.
+            test_statistic: A callable that takes two arrays (x0, x1) and returns a float statistic.
+                Please refer to the examples below for usage.
+                The callable should have signature: test_statistic(x0, x1) -> float.
             n_permutations: Number of permutations to perform.
             **kwargs: Additional kwargs passed to scipy.stats.permutation_test.