Skip to content

Commit 88b251c

Browse files
committed
making SRA looper optional
1 parent 924b558 commit 88b251c

6 files changed

+71
-51
lines changed

MANIFEST.in

+1
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ include README.md
33
include docs/img/geofetch_logo.svg
44
include geofetch/config_template.yaml
55
include geofetch/config_processed_template.yaml
6+
include geofetch/looper_sra_convert.yaml

geofetch/cli.py

+6
Original file line numberDiff line numberDiff line change
@@ -276,5 +276,11 @@ def _parse_cmdl(cmdl):
276276
help="Use just the keys defined in this module when writing out metadata.",
277277
)
278278

279+
raw_group.add_argument(
280+
"--add-convert-modifier",
281+
action="store_true",
282+
help="Add looper SRA convert modifier to config file.",
283+
)
284+
279285
logmuse.add_logging_options(parser)
280286
return parser.parse_args(cmdl)

geofetch/config_template.yaml

+1-44
Original file line numberDiff line numberDiff line change
@@ -10,51 +10,8 @@ sample_modifiers:
1010
# Project metadata:
1111
{additional_columns}
1212
# End of project metadata
13-
SRR_files: SRA
1413
{pipeline_samples}
1514

16-
# Adding additional information to the project
17-
derive:
18-
attributes: [read1, read2, SRR_files]
19-
sources:
20-
SRA: "${SRABAM}/{SRR}.bam"
21-
FQ: "${SRAFQ}/{SRR}.fastq.gz"
22-
FQ1: "${SRAFQ}/{SRR}_1.fastq.gz"
23-
FQ2: "${SRAFQ}/{SRR}_2.fastq.gz"
24-
imply:
25-
- if:
26-
organism: "Mus musculus"
27-
then:
28-
genome: mm10
29-
- if:
30-
organism: "Homo sapiens"
31-
then:
32-
genome: hg38
33-
- if:
34-
read_type: "PAIRED"
35-
then:
36-
read1: FQ1
37-
read2: FQ2
38-
- if:
39-
read_type: "SINGLE"
40-
then:
41-
read1: FQ1
42-
43-
project_modifiers:
44-
amend:
45-
sra_convert:
46-
looper:
47-
results_subdir: sra_convert_results
48-
sample_modifiers:
49-
append:
50-
SRR_files: SRA
51-
pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml
52-
derive:
53-
attributes: [read1, read2, SRR_files]
54-
sources:
55-
SRA: "${SRARAW}/{SRR}.sra"
56-
FQ: "${SRAFQ}/{SRR}.fastq.gz"
57-
FQ1: "${SRAFQ}/{SRR}_1.fastq.gz"
58-
FQ2: "${SRAFQ}/{SRR}_2.fastq.gz"
15+
{sra_convert}
5916

6017
{pipeline_project}

geofetch/const.py

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646

4747
CONFIG_PROCESSED_TEMPLATE_NAME = "config_processed_template.yaml"
4848
CONFIG_RAW_TEMPLATE_NAME = "config_template.yaml"
49+
CONFIG_SRA_TEMPLATE = "looper_sra_convert.yaml"
4950

5051
# const for Finder:
5152
RETMAX = 10000000 # may need to be increased at some point

geofetch/geofetch.py

+17-7
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ def __init__(
8585
discard_soft: bool = False,
8686
add_dotfile: bool = False,
8787
disable_progressbar: bool = False,
88+
add_convert_modifier: bool = False,
8889
opts=None,
8990
**kwargs,
9091
):
@@ -143,6 +144,7 @@ def __init__(
143144
:param bam_conversion: Optional: set True to convert bam files [Works with raw data]
144145
:param picard_path: Specify a path to the picard jar, if you want to convert fastq to bam
145146
[Default: $PICARD:" + safe_echo("PICARD") + "] [Works with raw data]
147+
:param add_convert_modifier: Add looper SRA convert modifier to config file.
146148
147149
:param skip: Skip some accessions. [Default: no skip].
148150
:param opts: opts object [Optional]
@@ -244,7 +246,7 @@ def __init__(
244246
self.discard_soft = discard_soft
245247
self.add_dotfile = add_dotfile
246248
self.disable_progressbar = disable_progressbar
247-
249+
self.add_convert_modifier = add_convert_modifier
248250
self._LOGGER.info(f"Metadata folder: {self.metadata_expanded}")
249251

250252
# Some sanity checks before proceeding
@@ -638,15 +640,15 @@ def _download_raw_data(self, run_name: str) -> NoReturn:
638640
# converting sra to bam using sam-dump
639641
# TODO: sam-dump has a built-in prefetch. I don't have to do
640642
# any of this stuff... This also solves the bad sam-dump issues.
641-
self._sra_bam_conversion1(bam_file, run_name)
643+
self._sra_to_bam_conversion_sam_dump(bam_file, run_name)
642644

643645
# checking if bam_file converted correctly, if not --> use fastq-dump
644646
st = os.stat(bam_file)
645647
if st.st_size < 100:
646648
self._LOGGER.warning(
647649
"Bam conversion failed with sam-dump. Trying fastq-dump..."
648650
)
649-
self._sra_bam_conversion2(bam_file, run_name, self.picard_path)
651+
self._sra_to_bam_conversion_fastq_damp(bam_file, run_name, self.picard_path)
650652

651653
except FileNotFoundError as err:
652654
self._LOGGER.info(
@@ -1145,9 +1147,15 @@ def _create_config_raw(self, proj_meta, proj_root_sample, subanot_path_yaml):
11451147
]
11461148
modifiers_str = "\n ".join(d for d in meta_list_str)
11471149
# Write project config file
1150+
geofetchdir = os.path.dirname(__file__)
11481151
if not self.config_template:
1149-
geofetchdir = os.path.dirname(__file__)
11501152
self.config_template = os.path.join(geofetchdir, CONFIG_RAW_TEMPLATE_NAME)
1153+
if self.add_convert_modifier:
1154+
sra_convert_path = os.path.join(geofetchdir, CONFIG_SRA_TEMPLATE)
1155+
with open(sra_convert_path, "r") as template_file:
1156+
sra_convert_template = template_file.read()
1157+
else:
1158+
sra_convert_template = ""
11511159
with open(self.config_template, "r") as template_file:
11521160
template = template_file.read()
11531161
template_values = {
@@ -1157,13 +1165,15 @@ def _create_config_raw(self, proj_meta, proj_root_sample, subanot_path_yaml):
11571165
"pipeline_samples": self.file_pipeline_samples,
11581166
"pipeline_project": self.file_pipeline_project,
11591167
"additional_columns": modifiers_str,
1168+
"sra_convert": sra_convert_template,
11601169
}
11611170
for k, v in template_values.items():
11621171
placeholder = "{" + str(k) + "}"
11631172
template = template.replace(placeholder, str(v))
11641173
return template
11651174

1166-
def _check_sample_name_standard(self, metadata_dict: dict) -> dict:
1175+
@staticmethod
1176+
def _check_sample_name_standard(metadata_dict: dict) -> dict:
11671177
"""
11681178
Standardizing sample name and checking if it exists
11691179
(This function is used for raw data)
@@ -1291,7 +1301,7 @@ def _download_SRA_file(self, run_name: str):
12911301
)
12921302
time.sleep(t * 2)
12931303

1294-
def _sra_bam_conversion1(self, bam_file: str, run_name: str) -> NoReturn:
1304+
def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> NoReturn:
12951305
"""
12961306
Convert an SRA file to a BAM file using the SRA Toolkit "sam-dump" tool
12971307
:param str bam_file: path to BAM file that has to be created
@@ -1315,7 +1325,7 @@ def _sra_bam_conversion1(self, bam_file: str, run_name: str) -> NoReturn:
13151325
self._LOGGER.info(f"Conversion command: {cmd}")
13161326
run_subprocess(cmd, shell=True)
13171327

1318-
def _sra_bam_conversion2(
1328+
def _sra_to_bam_conversion_fastq_damp(
13191329
self, bam_file: str, run_name: str, picard_path: str = None
13201330
) -> NoReturn:
13211331
"""

geofetch/looper_sra_convert.yaml

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Adding sra convert looper pipeline
2+
SRR_files: SRA
3+
4+
derive:
5+
attributes: [read1, read2, SRR_files]
6+
sources:
7+
SRA: "${SRABAM}/{SRR}.bam"
8+
FQ: "${SRAFQ}/{SRR}.fastq.gz"
9+
FQ1: "${SRAFQ}/{SRR}_1.fastq.gz"
10+
FQ2: "${SRAFQ}/{SRR}_2.fastq.gz"
11+
imply:
12+
- if:
13+
organism: "Mus musculus"
14+
then:
15+
genome: mm10
16+
- if:
17+
organism: "Homo sapiens"
18+
then:
19+
genome: hg38
20+
- if:
21+
read_type: "PAIRED"
22+
then:
23+
read1: FQ1
24+
read2: FQ2
25+
- if:
26+
read_type: "SINGLE"
27+
then:
28+
read1: FQ1
29+
30+
project_modifiers:
31+
amend:
32+
sra_convert:
33+
looper:
34+
results_subdir: sra_convert_results
35+
sample_modifiers:
36+
append:
37+
SRR_files: SRA
38+
pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml
39+
derive:
40+
attributes: [read1, read2, SRR_files]
41+
sources:
42+
SRA: "${SRARAW}/{SRR}.sra"
43+
FQ: "${SRAFQ}/{SRR}.fastq.gz"
44+
FQ1: "${SRAFQ}/{SRR}_1.fastq.gz"
45+
FQ2: "${SRAFQ}/{SRR}_2.fastq.gz"

0 commit comments

Comments
 (0)