Skip to content

Commit 068571e

Browse files
authored
feat!: add input assembly parameter for free text genomic / gnomad vcf queries (#619)
Closes #490. Initial work from #491. Technically a breaking change, due to the cleanup work I did.
1 parent 068337d commit 068571e

17 files changed

+414
-364
lines changed

src/variation/main.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from collections.abc import AsyncGenerator
77
from contextlib import asynccontextmanager
88
from enum import Enum
9-
from typing import Annotated
9+
from typing import Annotated, Literal
1010
from urllib.parse import unquote
1111

1212
import pkg_resources
@@ -40,6 +40,7 @@
4040
TranslateIdentifierService,
4141
)
4242
from variation.schemas.service_schema import (
43+
ClinVarAssembly,
4344
FeatureOverlapService,
4445
ToCdnaService,
4546
ToGenomicService,
@@ -176,6 +177,12 @@ async def normalize(
176177
description="The copy change for HGVS duplications and deletions represented as Copy Number Change Variation.",
177178
),
178179
] = None,
180+
input_assembly: Annotated[
181+
Literal[ClinVarAssembly.GRCH37] | Literal[ClinVarAssembly.GRCH38] | None,
182+
Query(
183+
description="Assembly used for `q`. Only used when `q` is using genomic free text or gnomad vcf format",
184+
),
185+
] = None,
179186
) -> NormalizeService:
180187
"""Normalize and translate a HGVS, gnomAD VCF or Free Text description on GRCh37
181188
or GRCh38 assembly to a single VRS Variation. Performs fully-justified allele
@@ -190,11 +197,14 @@ async def normalize(
190197
:param copy_change: The copy change for HGVS duplications and deletions represented
191198
as Copy Number Change Variation. If not set, will use default `copy_change` for
192199
query.
200+
:param input_assembly: Assembly used for `q`. Only used when `q` is using genomic
201+
free text or gnomad vcf format
193202
:return: NormalizeService for variation
194203
"""
195204
return await query_handler.normalize_handler.normalize(
196205
unquote(q),
197206
hgvs_dup_del_mode=hgvs_dup_del_mode,
207+
input_assembly=input_assembly,
198208
baseline_copies=baseline_copies,
199209
copy_change=copy_change,
200210
)

src/variation/normalize.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Module for Variation Normalization."""
22

33
import datetime
4+
from typing import Literal
45
from urllib.parse import unquote
56

67
from cool_seq_tool.handlers import SeqRepoAccess
@@ -16,6 +17,7 @@
1617
NormalizeService,
1718
ServiceMeta,
1819
)
20+
from variation.schemas.service_schema import ClinVarAssembly
1921
from variation.schemas.token_response_schema import GnomadVcfToken, Token
2022
from variation.schemas.translation_response_schema import (
2123
AC_PRIORITY_LABELS,
@@ -175,14 +177,21 @@ async def normalize(
175177
self,
176178
q: str,
177179
hgvs_dup_del_mode: HGVSDupDelModeOption | None = HGVSDupDelModeOption.DEFAULT,
180+
input_assembly: Literal[ClinVarAssembly.GRCH37, ClinVarAssembly.GRCH38]
181+
| None = None,
178182
baseline_copies: int | None = None,
179183
copy_change: models.CopyChange | None = None,
180184
) -> NormalizeService:
181-
"""Normalize a given variation.
185+
"""Normalize and translate a HGVS, gnomAD VCF or Free Text description on GRCh37
186+
or GRCh38 assembly to a VRS variation. Performs fully-justified allele
187+
normalization. Will liftover to GRCh38 (if necessary) and align to a priority
188+
transcript. Will make inferences about the query.
182189
183190
:param q: HGVS, gnomAD VCF or Free Text description on GRCh37 or GRCh38 assembly
184191
:param hgvs_dup_del_mode: This parameter determines how to interpret HGVS
185192
dup/del expressions in VRS.
193+
:param input_assembly: Assembly used for `q`. Only used when `q` is using
194+
genomic free text or gnomad vcf format
186195
:param baseline_copies: Baseline copies for HGVS duplications and deletions
187196
:param copy_change: The copy change for HGVS duplications and deletions
188197
represented as Copy Number Change Variation.
@@ -226,7 +235,9 @@ async def normalize(
226235
return NormalizeService(**params)
227236

228237
# Get validation summary for classification
229-
validation_summary = await self.validator.perform(classification)
238+
validation_summary = await self.validator.perform(
239+
classification, input_assembly=input_assembly
240+
)
230241
if not validation_summary:
231242
update_warnings_for_no_resp(label, validation_summary.warnings)
232243
params["warnings"] = warnings

src/variation/translators/translator.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
VrsSeqLocAcStatus,
2020
)
2121
from variation.schemas.validation_response_schema import ValidationResult
22-
from variation.validators.genomic_base import GenomicBase
2322
from variation.vrs_representation import VRSRepresentation
2423

2524

@@ -44,7 +43,6 @@ def __init__(
4443
"""
4544
self.seqrepo_access = seqrepo_access
4645
self.uta = uta
47-
self.genomic_base = GenomicBase(self.seqrepo_access, self.uta)
4846
self.mane_transcript = mane_transcript
4947
self.vrs = vrs
5048
self.hgvs_dup_del_mode = hgvs_dup_del_mode

src/variation/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,8 @@ def get_refget_accession(
208208
else:
209209
if not ids:
210210
errors.append(f"Unable to find ga4gh sequence identifiers for: {alias}")
211-
212-
refget_accession = ids[0].split("ga4gh:")[-1]
211+
else:
212+
refget_accession = ids[0].split("ga4gh:")[-1]
213213
return refget_accession
214214

215215

src/variation/validate.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
"""Module for Validation."""
22

3+
from typing import Literal
4+
35
from cool_seq_tool.handlers import SeqRepoAccess
46
from cool_seq_tool.mappers import LiftOver
57
from cool_seq_tool.sources import TranscriptMappings, UtaDatabase
68
from gene.query import QueryHandler as GeneQueryHandler
79

810
from variation.schemas.classification_response_schema import Classification
11+
from variation.schemas.service_schema import ClinVarAssembly
912
from variation.schemas.validation_response_schema import ValidationSummary
1013
from variation.validators import (
1114
Amplification,
@@ -29,7 +32,7 @@
2932
ProteinStopGain,
3033
ProteinSubstitution,
3134
)
32-
from variation.validators.validator import Validator
35+
from variation.validators.validator import GenomicValidator, Validator
3336

3437

3538
class Validate:
@@ -76,11 +79,18 @@ def __init__(
7679
Amplification(*params),
7780
]
7881

79-
async def perform(self, classification: Classification) -> ValidationSummary:
82+
async def perform(
83+
self,
84+
classification: Classification,
85+
input_assembly: Literal[ClinVarAssembly.GRCH37, ClinVarAssembly.GRCH38]
86+
| None = None,
87+
) -> ValidationSummary:
8088
"""Get validation summary containing invalid and valid results for a
8189
classification
8290
8391
:param classification: A classification for a list of tokens
92+
:param input_assembly: Assembly used for `q`. Only used when `q` is using
93+
genomic free text or gnomad vcf format
8494
:return: Validation summary for classification containing valid and invalid
8595
results
8696
"""
@@ -94,7 +104,15 @@ async def perform(self, classification: Classification) -> ValidationSummary:
94104
if validator.validates_classification_type(
95105
classification.classification_type
96106
):
97-
validation_results = await validator.validate(classification)
107+
if isinstance(validator, GenomicValidator):
108+
validation_results = await validator.validate(
109+
classification, input_assembly=input_assembly
110+
)
111+
else:
112+
validation_results = await validator.validate(
113+
classification,
114+
)
115+
98116
for validation_result in validation_results:
99117
if validation_result.is_valid:
100118
found_valid_result = True

src/variation/validators/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from .cdna_insertion import CdnaInsertion
77
from .cdna_reference_agree import CdnaReferenceAgree
88
from .cdna_substitution import CdnaSubstitution
9-
from .genomic_base import GenomicBase
109
from .genomic_deletion import GenomicDeletion
1110
from .genomic_deletion_ambiguous import GenomicDeletionAmbiguous
1211
from .genomic_delins import GenomicDelIns
@@ -29,7 +28,6 @@
2928
"CdnaInsertion",
3029
"CdnaReferenceAgree",
3130
"CdnaSubstitution",
32-
"GenomicBase",
3331
"GenomicDelIns",
3432
"GenomicDeletion",
3533
"GenomicDeletionAmbiguous",

src/variation/validators/genomic_base.py

Lines changed: 0 additions & 65 deletions
This file was deleted.

src/variation/validators/genomic_deletion.py

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
"""The module for Genomic Deletion Validation."""
22

33
from variation.schemas.classification_response_schema import (
4-
Classification,
54
ClassificationType,
65
GenomicDeletionClassification,
76
Nomenclature,
87
)
98
from variation.schemas.validation_response_schema import ValidationResult
10-
from variation.validators.validator import Validator
9+
from variation.validators.validator import GenomicValidator
1110

1211

13-
class GenomicDeletion(Validator):
12+
class GenomicDeletion(GenomicValidator):
1413
"""The Genomic Deletion Validator class."""
1514

1615
async def get_valid_invalid_results(
@@ -108,21 +107,3 @@ def validates_classification_type(
108107
) -> bool:
109108
"""Return whether or not the classification type is genomic deletion"""
110109
return classification_type == ClassificationType.GENOMIC_DELETION
111-
112-
async def get_accessions(
113-
self, classification: Classification, errors: list
114-
) -> list[str]:
115-
"""Get accessions for a given classification.
116-
If `classification.nomenclature == Nomenclature.HGVS`, will return the accession
117-
in the HGVS expression.
118-
Else, will get all accessions associated to the gene
119-
120-
:param classification: The classification for list of tokens
121-
:param errors: List of errors
122-
:return: List of accessions
123-
"""
124-
if classification.nomenclature == Nomenclature.HGVS:
125-
accessions = [classification.ac]
126-
else:
127-
accessions = await self.get_genomic_accessions(classification, errors)
128-
return accessions

src/variation/validators/genomic_deletion_ambiguous.py

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,14 @@
22

33
from variation.schemas.classification_response_schema import (
44
AmbiguousType,
5-
Classification,
65
ClassificationType,
76
GenomicDeletionAmbiguousClassification,
8-
Nomenclature,
97
)
108
from variation.schemas.validation_response_schema import ValidationResult
11-
from variation.validators.validator import Validator
9+
from variation.validators.validator import GenomicValidator
1210

1311

14-
class GenomicDeletionAmbiguous(Validator):
12+
class GenomicDeletionAmbiguous(GenomicValidator):
1513
"""The Genomic Deletion Ambiguous Validator class."""
1614

1715
async def get_valid_invalid_results(
@@ -100,21 +98,3 @@ def validates_classification_type(
10098
ambiguous
10199
"""
102100
return classification_type == ClassificationType.GENOMIC_DELETION_AMBIGUOUS
103-
104-
async def get_accessions(
105-
self, classification: Classification, errors: list
106-
) -> list[str]:
107-
"""Get accessions for a given classification.
108-
If `classification.nomenclature == Nomenclature.HGVS`, will return the accession
109-
in the HGVS expression.
110-
Else, will get all accessions associated to the gene
111-
112-
:param classification: The classification for list of tokens
113-
:param errors: List of errors
114-
:return: List of accessions
115-
"""
116-
if classification.nomenclature == Nomenclature.HGVS:
117-
accessions = [classification.ac]
118-
else:
119-
accessions = await self.get_genomic_accessions(classification, errors)
120-
return accessions

src/variation/validators/genomic_delins.py

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
"""The module for Genomic DelIns Validation."""
22

33
from variation.schemas.classification_response_schema import (
4-
Classification,
54
ClassificationType,
65
GenomicDelInsClassification,
76
Nomenclature,
87
)
98
from variation.schemas.validation_response_schema import ValidationResult
10-
from variation.validators.validator import Validator
9+
from variation.validators.validator import GenomicValidator
1110

1211

13-
class GenomicDelIns(Validator):
12+
class GenomicDelIns(GenomicValidator):
1413
"""The Genomic DelIns Validator class."""
1514

1615
async def get_valid_invalid_results(
@@ -79,21 +78,3 @@ def validates_classification_type(
7978
) -> bool:
8079
"""Return whether or not the classification type is genomic delins"""
8180
return classification_type == ClassificationType.GENOMIC_DELINS
82-
83-
async def get_accessions(
84-
self, classification: Classification, errors: list
85-
) -> list[str]:
86-
"""Get accessions for a given classification.
87-
If `classification.nomenclature == Nomenclature.HGVS`, will return the accession
88-
in the HGVS expression.
89-
Else, will get all accessions associated to the gene
90-
91-
:param classification: The classification for list of tokens
92-
:param errors: List of errors
93-
:return: List of accessions
94-
"""
95-
if classification.nomenclature == Nomenclature.HGVS:
96-
accessions = [classification.ac]
97-
else:
98-
accessions = await self.get_genomic_accessions(classification, errors)
99-
return accessions

0 commit comments

Comments
 (0)