Skip to content

Commit

Permalink
Additional Fixes (#76)
Browse files Browse the repository at this point in the history
* Shift to using externally controlled mzpaf library

* Fix read seeking and byte the bullet for offset tracking

* schema moved
  • Loading branch information
mobiusklein committed Mar 8, 2024
1 parent 70a2330 commit 911198e
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 57 deletions.
1 change: 1 addition & 0 deletions examples/NIST/IARPA3_best_tissue_add_info.head.mzlib.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<mzSpecLib 1.0>
MS:1003186|library format version=1.0
MS:1003188|library name=IARPA3_best_tissue_add_info
MS:1003191|library URI=https://chemdata.nist.gov/dokuwiki/doku.php?id=peptidew:lib:human_skin_hair
MS:1001017|release date=Oct. 01, 2021
Expand Down
21 changes: 19 additions & 2 deletions implementations/python/mzlib/attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import (
Any, DefaultDict, Iterable,
Iterator, Optional, Tuple,
Iterator, Optional, Set, Tuple,
Union, List, Dict,
Generic, TypeVar, Type
)
Expand Down Expand Up @@ -147,6 +147,10 @@ def add_attribute(self, key: str, value, group_identifier: Optional[str] = None)
The attribute group identifier to use, if any. If not provided,
no group is assumed.
"""
if group_identifier is not None:
int_group_identifier = int(group_identifier)
if int_group_identifier <= self.group_counter:
self.group_counter = int_group_identifier + 1
items = Attribute(key, value, group_identifier)
self.attributes.append(items)
index = len(self.attributes) - 1
Expand Down Expand Up @@ -550,6 +554,9 @@ def add_attribute(self, key, value, group_identifier=None) -> Union[Any, List[An
"""
return self.attributes.add_attribute(key, value, group_identifier=group_identifier)

# Add a group of related attributes to this entity in one call by forwarding
# ``attributes`` to ``self.attributes.add_attribute_group``. Each element is
# annotated as either an ``Attribute`` or a ``(key, value)`` tuple. Nothing is
# returned here: the result of the delegated call is discarded.
def add_attribute_group(self, attributes: List[Union[Attribute, Tuple[str, Any]]]):
self.attributes.add_attribute_group(attributes)

# Forward to ``self.attributes.replace_attribute`` with the same ``key``,
# ``value`` and ``group_identifier``, and return whatever that call returns.
def replace_attribute(self, key, value, group_identifier=None):
return self.attributes.replace_attribute(key, value, group_identifier=group_identifier)

Expand Down Expand Up @@ -717,10 +724,19 @@ def __init__(self, attributes):

class AttributeSet(AttributedEntity):
name: str
_names_to_override: Set[str]

# Initialise the attribute set: hand ``attributes`` and any extra keyword
# arguments to the parent initialiser, record ``name``, then cache the result
# of ``_get_names_to_override()``. The cache is computed last because that
# helper iterates ``self.attributes`` (presumably populated by the parent
# initialiser -- confirm against ``AttributedEntity``).
def __init__(self, name: str, attributes: Iterable = None, **kwargs):
super().__init__(attributes, **kwargs)
self.name = name
self._names_to_override = self._get_names_to_override()

def _get_names_to_override(self):
keys = set()
for attr in self.attributes:
if attr.group_id is None:
keys.add(attr.key)
return keys

def member_of(self, target: Attributed) -> bool:
for attrib in self.attributes:
Expand All @@ -730,7 +746,8 @@ def member_of(self, target: Attributed) -> bool:
return False
return True

def apply(self, target: Attributed, ):
def apply(self, target: Attributed):

terms_to_remove: List[Tuple[str, Union[Attribute, List[Attribute]]]] = []
for key in self.attributes.keys():
if target.has_attribute(key):
Expand Down
14 changes: 12 additions & 2 deletions implementations/python/mzlib/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,11 @@ def __new__(mcs, name, parents, attrs):

file_extension = attrs.get("file_format")
if file_extension is not None:
new_type._file_extension_to_implementation[file_extension] = new_type
if isinstance(file_extension, list):
for ext in file_extension:
new_type._file_extension_to_implementation[ext] = new_type
else:
new_type._file_extension_to_implementation[file_extension] = new_type

format_name = attrs.get("format_name")
if format_name is not None:
Expand Down Expand Up @@ -126,6 +130,8 @@ def guess_from_filename(cls, filename: Union[str, Path, io.FileIO]) -> bool:
return False
if filename.endswith(".gz"):
filename = filename[:-3]
if isinstance(cls.file_format, list):
return any(filename.endswith(ext) for ext in cls.file_format)
return filename.endswith(cls.file_format)

@classmethod
Expand Down Expand Up @@ -334,7 +340,7 @@ def __getitem__(self, i) -> Union[Spectrum, List[Spectrum]]:
@classmethod
def has_index_preference(cls, filename: Union[str, Path, io.FileIO]) -> Type[IndexBase]:
"""
Does this backend prefer a particular index for this file?
Check if this backend prefers a particular index for this file.
The base implementation checks to see if there is a SQL index
for the filename provided, and if so, prefers :class:`~.SQLIndex`.
Expand Down Expand Up @@ -542,6 +548,8 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None) -
offset = self.index.offset_for(spectrum_number)
elif spectrum_name is not None:
offset = self.index.offset_for(spectrum_name)
else:
raise ValueError("Must provide either spectrum_number or spectrum_name argument")
buffer = self._get_lines_for(offset)
spectrum = self._parse_from_buffer(buffer, spectrum_number)
return spectrum
Expand Down Expand Up @@ -670,6 +678,8 @@ def close(self):


class LibraryIterator(AttributedEntity, _LibraryViewMixin, Iterator[Spectrum]):
"""An iterator wrapper for a library source that doesn't permit random access"""

backend: SpectralLibraryBackendBase
attributes: Attributed
iter: Iterator[Spectrum]
Expand Down
5 changes: 4 additions & 1 deletion implementations/python/mzlib/backends/bibliospec.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,10 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None):
"""
if spectrum_number is None:
raise ValueError("Only spectrum number queries are supported. spectrum_number must have an integer value")

try:
spectrum_number = int(spectrum_number)
except (ValueError, TypeError):
raise ValueError(f"spectrum_number must have an integer value, received {spectrum_number!r}") from None
info = self.connection.execute("SELECT * FROM RefSpectra WHERE id = ?", (spectrum_number, )).fetchone()
spectrum = self._new_spectrum()
spectrum.key = info['id']
Expand Down
4 changes: 4 additions & 0 deletions implementations/python/mzlib/backends/encyclopedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None):
"""
if spectrum_number is None:
raise ValueError("Only spectrum number queries are supported. spectrum_number must have an integer value")
try:
spectrum_number = int(spectrum_number)
except (TypeError, ValueError):
raise ValueError(f"spectrum_number must have an integer value, received {spectrum_number!r}") from None

info = self.connection.execute("SELECT rowid, * FROM entries WHERE rowid = ?;", (spectrum_number, )).fetchone()
spectrum = self._new_spectrum()
Expand Down
56 changes: 49 additions & 7 deletions implementations/python/mzlib/backends/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from mzlib.cluster import SpectrumCluster

from mzlib.index import MemoryIndex
from mzlib.attributes import AttributeManager, Attributed
from mzlib.attributes import AttributeManager, Attributed, AttributeSet
from mzlib.annotation import parse_annotation, IonAnnotationBase
from mzlib.analyte import Analyte, Interpretation, FIRST_INTERPRETATION_KEY
from mzlib.spectrum import Spectrum
Expand Down Expand Up @@ -46,7 +46,21 @@


class JSONSpectralLibrary(SpectralLibraryBackendBase):
file_format = "mzlb.json"
"""
A reader for the JSON serialization of the mzSpecLib spectral library format.
.. note::
Unlike other format readers, this type does not parse incrementally; it instead
parses the entire JSON document in-memory and stores the parsed object structure.
The JSON objects are then converted into :mod:`mzlib` types upon request. This is
because incremental JSON parsing is substantially more difficult to do in a byte
aware manner, not to mention slow, in Python.
This may lead to large memory overhead when reading large libraries in JSON format.
"""

file_format = ["mzlb.json", "mzlib.json"]
format_name = "json"

def __init__(self, filename, index_type=None, read_metadata=True):
Expand Down Expand Up @@ -79,9 +93,17 @@ def _load_buffer(self, filename_or_stream: Union[str, Path, io.FileIO, Mapping])
self.buffer = json.load(self.handle)
self.handle.close()

def _load_attribute_sets(self, attribute_sets: dict):
return {
k: self._fill_attributes(v, AttributeSet(k, [])) for k, v in attribute_sets.items()
}

def read_header(self) -> bool:
    """
    Populate library-level attributes and attribute sets from the parsed JSON buffer.

    The library metadata section is loaded into ``self.attributes``, and the
    analyte, spectrum and interpretation attribute set sections are loaded
    into their respective stores.

    Returns
    -------
    bool
        ``True`` if a non-empty buffer was available and processed,
        ``False`` otherwise.
    """
    if not self.buffer:
        return False
    # Default to an empty list when the metadata section is absent, matching
    # the empty defaults used for the attribute set sections below. Without
    # it, ``None`` would be handed to ``_fill_attributes`` to iterate over.
    self._fill_attributes(self.buffer.get(LIBRARY_METADATA_KEY, []), self.attributes)
    self.analyte_attribute_sets.update(
        self._load_attribute_sets(self.buffer.get(ANALYTE_CLASSES, {})))
    self.spectrum_attribute_sets.update(
        self._load_attribute_sets(self.buffer.get(SPECTRUM_CLASSES, {})))
    self.interpretation_attribute_sets.update(
        self._load_attribute_sets(self.buffer.get(INTERPRETATION_CLASSES, {})))
    return True

Expand Down Expand Up @@ -135,6 +157,8 @@ def get_spectrum(self, spectrum_number: int=None, spectrum_name: str=None) -> Sp
offset = self.index.offset_for(spectrum_number)
elif spectrum_name is not None:
offset = self.index.offset_for(spectrum_name)
else:
raise ValueError("Must provide either spectrum_number or spectrum_name argument")
data = self.buffer[SPECTRA_KEY][offset]
spectrum = self._make_spectrum_from_payload(data)
return spectrum
Expand All @@ -147,6 +171,8 @@ def get_cluster(self, cluster_number: int) -> SpectrumCluster:

def _fill_attributes(self, attributes: List[Dict[str, Any]], store: Attributed,
context_type: AttributeSetTypes=None) -> Attributed:
last_group_id = None
current_group_id = None
for attrib in attributes:
if attrib['accession'] == "MS:1003212":
if context_type == AttributeSetTypes.analyte:
Expand All @@ -165,13 +191,17 @@ def _fill_attributes(self, attributes: List[Dict[str, Any]], store: Attributed,
value = f'{attrib["value_accession"]}|{attrib["value"]}'
else:
value = attrib['value']
# TODO: When applying an attribute set with a group in it, we
# may collide with an existing (hard-coded) group identifier.
# This behavior probably exists in the text format too.

group = attrib.get("cv_param_group")
store.add_attribute(key, value, group_identifier=group)
if group is not None:
store.group_counter = int(group)
if group != last_group_id:
current_group_id = store.get_next_group_identifier()
last_group_id = group
group = current_group_id
else:
group = current_group_id

store.add_attribute(key, value, group_identifier=group)
return store

def _make_analyte_from_payload(self, analyte_id, analyte_d: Dict) -> Analyte:
Expand Down Expand Up @@ -269,6 +299,18 @@ def read(self):


class JSONSpectralLibraryWriter(SpectralLibraryWriterBase):
"""
Write a spectral library to the JSON serialization of the mzSpecLib spectral library format.
.. note::
Unlike other format writers, this writer buffers the entire library in memory as JSON-compatible
Python objects until the entire library is ready to be written out. This is because incrementally
writing JSON is substantially more difficult to do correctly.
This may lead to large memory overhead when writing large libraries in JSON format.
"""

file_format = "mzlb.json"
format_name = "json"
default_version = '1.0'
Expand Down
62 changes: 62 additions & 0 deletions implementations/python/mzlib/backends/msp.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,13 +803,20 @@ class _UnknownTermTracker:
counts: DefaultDict

def add(self, key: str, value: Optional[str]=None):
"""Add an unknown attribute to the tracker"""
raise NotImplementedError()

def items(self):
return self.counts.items()


class UnknownKeyValueTracker(_UnknownTermTracker):
"""
A diagnostic tool for tracking attributes with values that the parser doesn't know how to interpret.
This tracker holds both keys and values, and can grow quite large. For debugging purposes only.
"""

def __init__(self) -> None:
self.counts = DefaultDict(lambda: DefaultDict(int))

Expand All @@ -818,6 +825,8 @@ def add(self, key: str, value: Optional[str]=None):


class UnknownKeyTracker(_UnknownTermTracker):
"""A diagnostic tool for tracking attributes that the parser doesn't know how to interpret."""

def __init__(self) -> None:
self.counts = DefaultDict(int)

Expand All @@ -836,6 +845,28 @@ def add(self, key: str, value: Optional[str] = None):


class MSPSpectralLibrary(_PlainTextSpectralLibraryBackendBase):
"""
A reader for the plain text NIST MSP spectral library format.
The MSP format is only roughly defined, and places few
constraints on the meanings of spectrum attributes. This parser
attempts to cover a variety of different ways that MSPs found
"in the wild" have denoted different spectrum properties, but
is neither exhaustive nor nuanced enough to know from context
exactly what those files' authors intended, making a best guess
at what they correspond to in the controlled vocabulary mapping
for :mod:`mzlib`
Attributes
----------
modification_parser : :class:`ModificationParser`
A parser for peptide modifications
unknown_attributes : :class:`_UnknownTermTracker`
A tracker for unknown attributes. Used to tell how much information
the reader is unable to map onto the controlled vocabulary.
"""

file_format = "msp"
format_name = "msp"

Expand Down Expand Up @@ -1192,6 +1223,35 @@ def _parse_comment(self, value: str, attributes: Attributed):
attributes[item] = None

def _make_attribute_handlers(self):
"""
Create the attribute handling scopes that map this flavor of MSP's
attributes onto controlled vocabulary terms in context.
This method should be overridden in sub-classes to allow them
to change the meanings of attributes, add new ones, or otherwise
redirect how they are interpreted.
See the :class:`AttributeHandler` type tree for more details about
how the distributed predicates are resolved.
Returns
-------
other_manager : :class:`AttributeHandler`
The attribute handler for uncategorized attributes that will be added
to a :class:`Spectrum`.
analyte_manager : :class:`AttributeHandler`
The attribute handler for attributes that will be added to a :class:`Analyte`
interpretation_manager : :class:`AttributeHandler`
The attribute handler for attributes that will be added to a :class:`Interpretation`
interpretation_member_manager : :class:`AttributeHandler`
The attribute handler for attributes that will be added to a :class:`InterpretationMember`
spectrum_manager : :class:`AttributeHandler`
The attribute handler for attributes that will be added to a :class:`Spectrum`
analyte_fallback_manager : :class:`AttributeHandler`
The attribute handler for attributes that will be tried for any attribute
that fails to be categorized by all of the other managers to be added to the
:class:`Analyte` before labeling the attribute as "unknown".
"""
other_manager = MappingAttributeHandler(other_terms)
analyte_manager = MappingAttributeHandler(analyte_terms)
interpretation_manager = MappingAttributeHandler(interpretation_terms)
Expand Down Expand Up @@ -1414,6 +1474,8 @@ def get_spectrum(self, spectrum_number: int=None, spectrum_name: str=None) -> Sp
index_record = self.index.record_for(spectrum_name)
spectrum_number = index_record.number
offset = index_record.offset
else:
raise ValueError("Must provide either spectrum_number or spectrum_name argument")
buffer = self._get_lines_for(offset)
spectrum = self._parse(buffer, index_record.index)
return spectrum
Expand Down
Loading

0 comments on commit 911198e

Please sign in to comment.