Skip to content

Commit 275f679

Browse files
authored
metadata AWS (#860)
Signed-off-by: Lukas Heumos <[email protected]>
1 parent ef25311 commit 275f679

File tree

4 files changed: 34 additions and 60 deletions.

pertpy/metadata/_cell_line.py

Lines changed: 17 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"
4848
depmap_cell_line_path = Path(settings.cachedir) / "depmap_23Q4_info.csv"
4949
if not Path(depmap_cell_line_path).exists():
5050
_download(
51-
url="https://ndownloader.figshare.com/files/43746708",
51+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/depmap_23Q4_info.csv",
5252
output_file_name="depmap_23Q4_info.csv",
5353
output_path=settings.cachedir,
5454
block_size=4096,
@@ -59,52 +59,24 @@ def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"
5959
else:
6060
# Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project
6161
# Source: https://www.cancerrxgene.org/celllines
62-
cancerxgene_cell_line_path = Path(settings.cachedir) / "cell_line_cancer_project.csv"
6362
transformed_cancerxgene_cell_line_path = Path(settings.cachedir) / "cancerrxgene_info.csv"
64-
65-
if not Path(transformed_cancerxgene_cell_line_path).exists():
66-
if not Path(cancerxgene_cell_line_path).exists():
67-
_download(
68-
url="https://www.cancerrxgene.org/api/celllines?list=all&sEcho=1&iColumns=7&sColumns=&"
69-
"iDisplayStart=0&iDisplayLength=25&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&"
70-
"mDataProp_4=4&mDataProp_5=5&mDataProp_6=6&sSearch=&bRegex=false&sSearch_0=&bRegex_0=false&"
71-
"bSearchable_0=true&sSearch_1=&bRegex_1=false&bSearchable_1=true&sSearch_2=&bRegex_2=false&"
72-
"bSearchable_2=true&sSearch_3=&bRegex_3=false&bSearchable_3=true&sSearch_4=&bRegex_4=false&"
73-
"bSearchable_4=true&sSearch_5=&bRegex_5=false&bSearchable_5=true&sSearch_6=&bRegex_6=false&"
74-
"bSearchable_6=true&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&"
75-
"bSortable_2=true&bSortable_3=true&bSortable_4=true&bSortable_5=true&bSortable_6=true&export=csv",
76-
output_file_name="cell_line_cancer_project.csv",
77-
output_path=settings.cachedir,
78-
block_size=4096,
79-
is_zip=False,
80-
)
81-
self.cancerxgene = pd.read_csv(cancerxgene_cell_line_path)
82-
self.cancerxgene.columns = self.cancerxgene.columns.str.strip()
83-
self.cancerxgene["stripped_cell_line_name"] = (
84-
self.cancerxgene["Cell line Name"]
85-
.str.replace(r"\-|\.", "", regex=True)
86-
.str.upper()
87-
.astype("category")
63+
if not transformed_cancerxgene_cell_line_path.exists():
64+
_download(
65+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/cancerrxgene_info.csv",
66+
output_file_name="cancerrxgene_info.csv",
67+
output_path=settings.cachedir,
68+
block_size=4096,
69+
is_zip=False,
8870
)
89-
# pivot the data frame so that each cell line has only one row of metadata
90-
index_col = set(self.cancerxgene.columns) - {
91-
"Datasets",
92-
"number of drugs",
93-
}
94-
self.cancerxgene = self.cancerxgene.pivot(index=index_col, columns="Datasets", values="number of drugs")
95-
self.cancerxgene.columns.name = None
96-
self.cancerxgene = self.cancerxgene.reset_index().rename(columns={"Cell line Name": "cell_line_name"})
97-
self.cancerxgene.to_csv(transformed_cancerxgene_cell_line_path)
98-
else:
99-
self.cancerxgene = pd.read_csv(transformed_cancerxgene_cell_line_path, index_col=0)
71+
self.cancerxgene = pd.read_csv(transformed_cancerxgene_cell_line_path, index_col=0)
10072

10173
def _download_gene_annotation(self) -> None:
10274
# Download metadata for driver genes from DepMap.Sanger
10375
# Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation)
10476
gene_annotation_file_path = Path(settings.cachedir) / "genes_info.csv"
10577
if not Path(gene_annotation_file_path).exists():
10678
_download(
107-
url="https://cog.sanger.ac.uk/cmp/download/gene_identifiers_20191101.csv",
79+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/genes_info.csv",
10880
output_file_name="genes_info.csv",
10981
output_path=settings.cachedir,
11082
block_size=4096,
@@ -121,7 +93,7 @@ def _download_bulk_rna(self, cell_line_source: Literal["broad", "sanger"] = "bro
12193
bulk_rna_sanger_file_path = Path(settings.cachedir) / "rnaseq_sanger_info.csv"
12294
if not Path(bulk_rna_sanger_file_path).exists():
12395
_download(
124-
url="https://figshare.com/ndownloader/files/42467103",
96+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/rnaseq_sanger_info.csv",
12597
output_file_name="rnaseq_sanger_info.csv",
12698
output_path=settings.cachedir,
12799
block_size=4096,
@@ -134,7 +106,7 @@ def _download_bulk_rna(self, cell_line_source: Literal["broad", "sanger"] = "bro
134106
bulk_rna_broad_file_path = Path(settings.cachedir) / "rnaseq_depmap_info.csv"
135107
if not Path(bulk_rna_broad_file_path).exists():
136108
_download(
137-
url="https://figshare.com/ndownloader/files/34989922",
109+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/rnaseq_depmap_info.csv",
138110
output_file_name="rnaseq_depmap_info.csv",
139111
output_path=settings.cachedir,
140112
block_size=4096,
@@ -148,7 +120,7 @@ def _download_proteomics(self) -> None:
148120
proteomics_file_path = Path(settings.cachedir) / "proteomics_info.csv"
149121
if not Path(proteomics_file_path).exists():
150122
_download(
151-
url="https://figshare.com/ndownloader/files/42468393",
123+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/proteomics_info.csv",
152124
output_file_name="proteomics_info.csv",
153125
output_path=settings.cachedir,
154126
block_size=4096,
@@ -164,7 +136,7 @@ def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
164136
drug_response_gdsc1_file_path = Path(settings.cachedir) / "gdsc1_info.csv"
165137
if not Path(drug_response_gdsc1_file_path).exists():
166138
_download(
167-
url="https://figshare.com/ndownloader/files/43757235",
139+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/gdsc1_info.csv",
168140
output_file_name="gdsc1_info.csv",
169141
output_path=settings.cachedir,
170142
block_size=4096,
@@ -175,7 +147,7 @@ def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
175147
drug_response_gdsc2_file_path = Path(settings.cachedir) / "gdsc2_info.csv"
176148
if not Path(drug_response_gdsc2_file_path).exists():
177149
_download(
178-
url="https://figshare.com/ndownloader/files/43757232",
150+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/gdsc2_info.csv",
179151
output_file_name="gdsc2_info.csv",
180152
output_path=settings.cachedir,
181153
block_size=4096,
@@ -189,7 +161,7 @@ def _download_prism(self) -> None:
189161
drug_response_prism_file_path = Path(settings.cachedir) / "prism_info.csv"
190162
if not Path(drug_response_prism_file_path).exists():
191163
_download(
192-
url="https://figshare.com/ndownloader/files/20237739",
164+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/prism_info.csv",
193165
output_file_name="prism_info.csv",
194166
output_path=settings.cachedir,
195167
block_size=4096,
@@ -253,7 +225,7 @@ def annotate(
253225
query_id = "stripped_cell_line_name"
254226
logger.error(
255227
"`stripped_cell_line_name` is used as reference and query identifier to annotate cell line metadata from Cancerrxgene. "
256-
"Ensure that stripped cell line names are available in 'adata.obs.' or use the DepMap as `cell_line_source` to annotate the cell line first."
228+
"Ensure that stripped cell line names are available in 'adata.obs.' or use the DepMap as `cell_line_source` first."
257229
)
258230
if self.cancerxgene is None:
259231
self._download_cell_line(cell_line_source="Cancerrxgene")

pertpy/metadata/_drug.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ def _download_drug_annotation(
2727
chembl_path = Path(settings.cachedir) / "chembl.json"
2828
if not Path(chembl_path).exists():
2929
_download(
30-
url="https://figshare.com/ndownloader/files/43871718",
30+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/chembl.json",
3131
output_file_name="chembl.json",
3232
output_path=settings.cachedir,
3333
block_size=4096,
@@ -41,7 +41,7 @@ def _download_drug_annotation(
4141
dgidb_path = Path(settings.cachedir) / "dgidb.tsv"
4242
if not Path(dgidb_path).exists():
4343
_download(
44-
url="https://www.dgidb.org/data/latest/interactions.tsv",
44+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/dgidb.tsv",
4545
output_file_name="dgidb.tsv",
4646
output_path=settings.cachedir,
4747
block_size=4096,
@@ -54,7 +54,7 @@ def _download_drug_annotation(
5454
pharmgkb_path = Path(settings.cachedir) / "pharmgkb.tsv"
5555
if not Path(pharmgkb_path).exists():
5656
_download(
57-
url="https://api.pharmgkb.org/v1/download/file/data/relationships.zip",
57+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/pharmgkb.zip",
5858
output_file_name="pharmgkb.zip",
5959
output_path=settings.cachedir,
6060
block_size=4096,

pertpy/metadata/_moa.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ def _download_clue(self) -> None:
2626
clue_path = Path(settings.cachedir) / "repurposing_drugs_20200324.txt"
2727
if not Path(clue_path).exists():
2828
_download(
29-
url="https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20200324.txt",
29+
url="https://scverse-exampledata.s3.eu-west-1.amazonaws.com/pertpy/repurposing_drugs_20200324.txt",
3030
output_file_name="repurposing_drugs_20200324.txt",
3131
output_path=settings.cachedir,
3232
block_size=4096,

pertpy/tools/_differential_gene_expression/_simple_tests.py

Lines changed: 13 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -191,8 +191,8 @@ class PermutationTest(SimpleComparisonBase):
191191
The permutation test relies on another test statistic (e.g. t-statistic or your own) to obtain a p-value through
192192
random permutations of the data and repeated generation of the test statistic.
193193
194-
For paired tests, each paired observation is permuted together and distributed randomly between the two groups. For
195-
unpaired tests, all observations are permuted independently.
194+
For paired tests, each paired observation is permuted together and distributed randomly between the two groups.
195+
For unpaired tests, all observations are permuted independently.
196196
197197
The null hypothesis for the unpaired test is that all observations come from the same underlying distribution and
198198
have been randomly assigned to one of the samples.
@@ -224,20 +224,21 @@ def compare_groups(
224224
adata: Data with observations to compare.
225225
column: Column in `adata.obs` that contains the groups to compare.
226226
baseline: Reference group.
227-
groups_to_compare: Groups to compare against the baseline. If None, all other groups
228-
are compared.
229-
paired_by: Column in `adata.obs` to use for pairing. If None, an unpaired test is performed.
227+
groups_to_compare: Groups to compare against the baseline.
228+
If None, all other groups are compared.
229+
paired_by: Column in `adata.obs` to use for pairing.
230+
If None, an unpaired test is performed.
230231
mask: Mask to apply to the data.
231232
layer: Layer to use for the comparison.
232233
n_permutations: Number of permutations to perform.
233234
test_statistic: A callable that takes two arrays (x0, x1) and returns a float statistic.
234235
Defaults to log2 fold change with pseudocount: log2(mean(x1) + 1e-8) - log2(mean(x0) + 1e-8).
235236
The callable should have signature: test_statistic(x0, x1) -> float.
236237
fit_kwargs: Unused argument for compatibility with the `MethodBase` interface, do not specify.
237-
test_kwargs: Additional kwargs passed to the permutation test function (not the test statistic). The
238-
permutation test function is `scipy.stats.permutation_test`, so please refer to its documentation for
239-
available options. Note that `test_statistic` and `n_permutations` are set by this function and should
240-
not be provided here.
238+
test_kwargs: Additional kwargs passed to the permutation test function (not the test statistic).
239+
The permutation test function is `scipy.stats.permutation_test`.
240+
We refer to its documentation for available options.
241+
Note that `test_statistic` and `n_permutations` are set by this function and should not be provided here.
241242
242243
Examples:
243244
>>> # Difference in means (log fold change)
@@ -284,8 +285,9 @@ def _test(
284285
x0: Array with baseline values.
285286
x1: Array with values to compare.
286287
paired: Whether to perform a paired test.
287-
test_statistic: A callable that takes two arrays (x0, x1) and returns a float statistic. Please refer to
288-
the examples below for usage. The callable should have signature: test_statistic(x0, x1) -> float.
288+
test_statistic: A callable that takes two arrays (x0, x1) and returns a float statistic.
289+
Please refer to the examples below for usage.
290+
The callable should have signature: test_statistic(x0, x1) -> float.
289291
n_permutations: Number of permutations to perform.
290292
**kwargs: Additional kwargs passed to scipy.stats.permutation_test.
291293

0 commit comments

Comments
 (0)