Skip to content

Commit b8aca4a

Browse files
authored
Merge pull request #89 from cancervariants/issue-84
Issue 84
2 parents 18fb631 + dfb5403 commit b8aca4a

File tree

91 files changed

+5231
-557
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

91 files changed

+5231
-557
lines changed

Pipfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ flake8-docstrings = "*"
1414
pre-commit = "*"
1515
variant-normalization = {editable = true, path = "."}
1616
pyyaml = "*"
17+
jupyter = "*"
18+
ipykernel = "*"
1719

1820
[packages]
1921
hgvs = "*"

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ Services and guidelines for normalizing variant terms
55
Variant Normalization relies on some local data caches which you will need to set up. It uses pipenv to manage its environment, which you will also need to install.
66

77
### Installation
8+
Variant Normalization relies on [seqrepo](https://github.com/biocommons/biocommons.seqrepo), which you must download yourself.
9+
810
From the _variant_ directory of the repository:
911
```
1012
pipenv sync
@@ -18,8 +20,6 @@ sudo mv $seqrepo_date_dir latest
1820
```
1921

2022
### Data
21-
Variant Normalization relies on [seqrepo](https://github.com/biocommons/biocommons.seqrepo). We are currently using version `2021-01-29`.
22-
2323
Variant Normalization uses [Ensembl BioMart](http://www.ensembl.org/biomart/martview) to retrieve `variant/data/transcript_mappings.tsv`. We currently use the `Human Genes (GRCh38.p13)` dataset with the following attributes: Gene stable ID, Gene stable ID version, Transcript stable ID, Transcript stable ID version, Protein stable ID, Protein stable ID version, RefSeq match transcript (MANE Select), Gene name.
2424

2525
![image](biomart.png)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""Module for testing Coding DNA DelIns Classifier."""
2+
import unittest
3+
from variant.classifiers import CodingDNADelInsClassifier
4+
from .classifier_base import ClassifierBase
5+
6+
7+
class TestCodingDNADelInsClassifier(ClassifierBase, unittest.TestCase):
8+
"""A class to test the Coding DNA DelIns Classifier."""
9+
10+
def classifier_instance(self):
11+
"""Return CodingDNADelInsClassifier instance."""
12+
return CodingDNADelInsClassifier()
13+
14+
def fixture_name(self):
15+
"""Return CodingDNADelInsClassifier fixture name."""
16+
return 'coding_dna_delins'
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""Module for testing Coding DNA Silent Mutation Classifier."""
2+
import unittest
3+
from variant.classifiers import CodingDNASilentMutationClassifier
4+
from .classifier_base import ClassifierBase
5+
6+
7+
class TestCodingDNASilentMutationClassifier(ClassifierBase, unittest.TestCase):
8+
"""A class to test the Coding DNA Silent Mutation Classifier."""
9+
10+
def classifier_instance(self):
11+
"""Return CodingDNASilentMutationClassifier instance."""
12+
return CodingDNASilentMutationClassifier()
13+
14+
def fixture_name(self):
15+
"""Return CodingDNASilentMutationClassifier fixture name."""
16+
return 'coding_dna_silent_mutation'
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""Module for testing Coding DNA Substitution Classifier."""
2+
import unittest
3+
from variant.classifiers import CodingDNASubstitutionClassifier
4+
from .classifier_base import ClassifierBase
5+
6+
7+
class TestCodingDNASubstitutionClassifier(ClassifierBase, unittest.TestCase):
8+
"""A class to test the Coding DNA Substitution Classifier."""
9+
10+
def classifier_instance(self):
11+
"""Return CodingDNASubstitutionClassifier instance."""
12+
return CodingDNASubstitutionClassifier()
13+
14+
def fixture_name(self):
15+
"""Return CodingDNASubstitutionClassifier fixture name."""
16+
return 'coding_dna_substitution'
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""Module for testing Genomic DelIns Classifier."""
2+
import unittest
3+
from variant.classifiers import GenomicDelInsClassifier
4+
from .classifier_base import ClassifierBase
5+
6+
7+
class TestGenomicDelInsClassifier(ClassifierBase, unittest.TestCase):
8+
"""A class to test the Genomic DelIns Classifier."""
9+
10+
def classifier_instance(self):
11+
"""Return GenomicDelInsClassifier instance."""
12+
return GenomicDelInsClassifier()
13+
14+
def fixture_name(self):
15+
"""Return GenomicDelInsClassifier fixture name."""
16+
return 'genomic_delins'
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""Module for testing Genomic Silent Mutation Classifier."""
2+
import unittest
3+
from variant.classifiers import GenomicSilentMutationClassifier
4+
from .classifier_base import ClassifierBase
5+
6+
7+
class TestGenomicSilentMutationClassifier(ClassifierBase, unittest.TestCase):
8+
"""A class to test the Genomic Silent Mutation Classifier."""
9+
10+
def classifier_instance(self):
11+
"""Return GenomicSilentMutationClassifier instance."""
12+
return GenomicSilentMutationClassifier()
13+
14+
def fixture_name(self):
15+
"""Return GenomicSilentMutationClassifier fixture name."""
16+
return 'genomic_silent_mutation'
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""Module for testing Genomic Substitution Classifier."""
2+
import unittest
3+
from variant.classifiers import GenomicSubstitutionClassifier
4+
from .classifier_base import ClassifierBase
5+
6+
7+
class TestGenomicSubstitutionClassifier(ClassifierBase, unittest.TestCase):
8+
"""A class to test the Genomic Substitution Classifier."""
9+
10+
def classifier_instance(self):
11+
"""Return GenomicSubstitutionClassifier instance."""
12+
return GenomicSubstitutionClassifier()
13+
14+
def fixture_name(self):
15+
"""Return GenomicSubstitutionClassifier fixture name."""
16+
return 'genomic_substitution'

tests/fixtures/classifiers.yml

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ fusion:
1717
- query: fused
1818
- query: fusio
1919

20-
2120
amino_acid_substitution:
2221
should_match:
2322
- query: BRAF V600E
@@ -63,3 +62,97 @@ silent_mutation:
6362
confidence: ConfidenceRating.INTERSECTION
6463
should_not_match:
6564
- query: Leu862==
65+
66+
coding_dna_substitution:
67+
should_match:
68+
- query: V170D (c.509T>A)
69+
confidence: ConfidenceRating.SUPERSET
70+
- query: NM_000551.3:c.292T>C
71+
confidence: ConfidenceRating.EXACT
72+
- query: NM_000551.3:c.292TC
73+
confidence: ConfidenceRating.INTERSECTION
74+
- query: foo Y98H (c.292T>C)
75+
confidence: ConfidenceRating.SUPERSET
76+
- query: BRAF V600E c.23T>A
77+
confidence: ConfidenceRating.EXACT
78+
- query: LRG_199t1:c.54G>H
79+
confidence: ConfidenceRating.EXACT
80+
should_not_match:
81+
- query: V170 (c.509F>A)
82+
- query: RX_:g.292TC
83+
84+
genomic_substitution:
85+
should_match:
86+
- query: V170D (g.509T>A)
87+
confidence: ConfidenceRating.SUPERSET
88+
- query: NC_000017.10:g.292T>C
89+
confidence: ConfidenceRating.EXACT
90+
- query: NC_000017.10:g.292TC
91+
confidence: ConfidenceRating.INTERSECTION
92+
- query: foo Y98H (g.292T>C)
93+
confidence: ConfidenceRating.SUPERSET
94+
- query: BRAF V600E g.23T>A
95+
confidence: ConfidenceRating.EXACT
96+
should_not_match:
97+
- query: V170 (g.509F>A)
98+
- query: RX_:c.292TC
99+
100+
coding_dna_silent_mutation:
101+
should_match:
102+
- query: NM_004006.2:c.123=
103+
confidence: ConfidenceRating.EXACT
104+
- query: foo VHL c.123=
105+
confidence: ConfidenceRating.SUPERSET
106+
should_not_match:
107+
- query: CODING_DNA_:c.123=
108+
- query: g.123=
109+
110+
genomic_silent_mutation:
111+
should_match:
112+
- query: NC_000017.10:g.123=
113+
confidence: ConfidenceRating.EXACT
114+
- query: foo VHL g.123=
115+
confidence: ConfidenceRating.SUPERSET
116+
should_not_match:
117+
- query: GENOMIC_:g.123=
118+
- query: c.123=
119+
120+
coding_dna_delins:
121+
should_match:
122+
- query: NM_005157.6:c.1423_1424delinsGT
123+
confidence: ConfidenceRating.EXACT
124+
- query: ENST00000277541.6:c.7330delinsACA
125+
confidence: ConfidenceRating.EXACT
126+
- query: NM_000797.3:c.812_829delins908_925
127+
confidence: ConfidenceRating.INTERSECTION
128+
- query: foo c.131_234delinsA
129+
confidence: ConfidenceRating.SUPERSET
130+
- query: foo NM_005157.6:c.1423_1424delinsGT
131+
confidence: ConfidenceRating.INTERSECTION
132+
- query: NM_000551.3:c.615delinsAA
133+
confidence: ConfidenceRating.EXACT
134+
- query: LRG_199t1:c.79_80delinsTT
135+
confidence: ConfidenceRating.EXACT
136+
- query: LRG_199:c.79_80delinsTT
137+
confidence: ConfidenceRating.EXACT
138+
should_not_match:
139+
- query: N_005157.6:g.1423_1424delinsGT
140+
- query: c.1423delinsX
141+
142+
genomic_delins:
143+
should_match:
144+
- query: NC_000017.10:g.1423_1424delinsGT
145+
confidence: ConfidenceRating.EXACT
146+
- query: NC_000017.10:g.7330delinsACA
147+
confidence: ConfidenceRating.EXACT
148+
- query: NC_000017.10:g.812_829delins908_925
149+
confidence: ConfidenceRating.INTERSECTION
150+
- query: foo g.131_234delinsA
151+
confidence: ConfidenceRating.SUPERSET
152+
- query: foo NC_000017.10:g.1423_1424delinsGT
153+
confidence: ConfidenceRating.INTERSECTION
154+
- query: NC_000003.12:g.10149938delinsAA
155+
confidence: ConfidenceRating.EXACT
156+
should_not_match:
157+
- query: N_000017.10:c.1423_1424delinsGT
158+
- query: g.1423delinsX

tests/fixtures/tokenizers.yml

Lines changed: 137 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,144 @@ silent_mutation:
8989
hgvs:
9090
should_match:
9191
- token: NC_000007.13:g.36561662C>T
92-
- token: LRG_199p1:p.Trp24Cys
9392
- token: NM_01234.5:c.22+1A>T
9493
- token: NP_000918.2:p.Ile1145=
9594
should_not_match:
9695
- token: NP004324.2
96+
- token: ERBB2:c.2326_2327delinsCT
97+
98+
coding_dna_substitution:
99+
should_match:
100+
- token: (c.292T>C)
101+
- token: c.292T>C
102+
- token: (c.233A>G)
103+
- token: c.509T>A
104+
- token: c.54G>H
105+
should_not_match:
106+
- token: (c.292T>C
107+
- token: g.292T>C
108+
- token: c.292T<C
109+
- token: c.292Z>C
110+
- token: c.j324T<C
111+
- token: 292T<C
112+
- token: c.509T>
113+
- token: c.509>A
114+
- token: c.T>A
115+
116+
genomic_substition:
117+
should_match:
118+
- token: (g.292T>C)
119+
- token: g.292T>C
120+
- token: (g.233A>G)
121+
- token: g.509T>A
122+
- token: g.54G>H
123+
should_not_match:
124+
- token: (g.292T>C
125+
- token: c.292T>C
126+
- token: g.292T<C
127+
- token: g.292Z>C
128+
- token: g.j324T<C
129+
- token: 292T<C
130+
- token: g.509T>
131+
- token: g.509>A
132+
- token: g.T>A
133+
134+
coding_dna_silent_mutation:
135+
should_match:
136+
- token: c.123=
137+
- token: (c.123=)
138+
should_not_match:
139+
- token: c.292T>C
140+
- token: g.292T>C
141+
- token: g.123
142+
- token: (c.123=
143+
- token: c.123=)
144+
- token: c.123
145+
- token: c.123==
146+
147+
genomic_silent_mutation:
148+
should_match:
149+
- token: g.123=
150+
- token: (g.123=)
151+
should_not_match:
152+
- token: c.292T>C
153+
- token: g.292T>C
154+
- token: c.123
155+
- token: (g.123=
156+
- token: g.123=)
157+
- token: g.123
158+
- token: g.123==
159+
160+
161+
coding_dna_delins:
162+
should_match:
163+
- token: c.32386323delinsGA
164+
- token: c.6775_6777delinsC
165+
- token: c.145_147delinsTGG
166+
- token: c.9002_9009delinsTTT
167+
- token: c.850_901delinsTTCCTCGATGCCTG
168+
# - token: c.42522624_42522669delins42536337_42536382
169+
# - token: c.812_829delins908_925
170+
- token: (c.301_302delinsGG)
171+
- token: c.615delinsAA
172+
should_not_match:
173+
- token: c.150_147delinsTGG
174+
- token: 32386323delinsGA
175+
- token: c.145_147delinsTGGS
176+
- token: c.145_147delTGG
177+
- token: g.32386323delinsGA
178+
- token: NM_000797.3:c.812_829delins908_
179+
- token: c.42522624_42522669delins_42536382
180+
- token: c.delinsGA
181+
- token: c.32386323delins
182+
- token: (c.301_302delinsGG
183+
- token: c.delins
184+
- token: delins
185+
- token: c._147delinsTGG
186+
- token: c.145_delinsTGG
187+
- token: c.delinsTGG
188+
- token: c.d_delinsTG
189+
190+
191+
genomic_delins:
192+
should_match:
193+
- token: g.32386323delinsGA
194+
- token: g.6775_6777delinsC
195+
- token: g.145_147delinsTGG
196+
- token: g.9002_9009delinsTTT
197+
- token: g.850_901delinsTTCCTCGATGCCTG
198+
# - token: g.42522624_42522669delins42536337_42536382
199+
# - token: g.812_829delins908_925
200+
- token: (g.301_302delinsGG)
201+
- token: g.10149938delinsAA
202+
should_not_match:
203+
- token: g.150_147delinsTGG
204+
- token: 32386323delinsGA
205+
- token: g.145_147delinsTGGS
206+
- token: g.145_147delTGG
207+
- token: c.32386323delinsGA
208+
- token: NM_000797.3:g.812_829delins908_
209+
- token: g.42522624_42522669delins_42536382
210+
- token: g.delinsGA
211+
- token: g.32386323delins
212+
- token: (g.301_302delinsGG
213+
- token: g.delins
214+
- token: delins
215+
- token: g._147delinsTGG
216+
- token: g.145_delinsTGG
217+
- token: g.delinsTGG
218+
- token: g.d_delinsTG
219+
220+
locus_reference_genomic:
221+
should_match:
222+
- token: LRG_199
223+
- token: LRG_199t1
224+
- token: LRG_199p1
225+
should_not_match:
226+
- token: LRG_199t1p1
227+
- token: LRG_199p1t1
228+
- token: LRG_
229+
- token: LRG_t1
230+
- token: LRG_p1
231+
- token: LRGt1
232+
- token: LRGp1

0 commit comments

Comments
 (0)