Additional Fixes (#76)

* Shift to using externally controlled mzpaf library * Fix read seeking and byte the bullet for offset tracking * schema moved
HUPO-PSI · Mar 8, 2024 · 911198e · 911198e
1 parent 70a2330
commit 911198e
Show file tree

Hide file tree

Showing 10 changed files with 259 additions and 57 deletions.
diff --git a/examples/NIST/IARPA3_best_tissue_add_info.head.mzlib.txt b/examples/NIST/IARPA3_best_tissue_add_info.head.mzlib.txt
@@ -1,4 +1,5 @@
 <mzSpecLib 1.0>
+MS:1003186|library format version=1.0
 MS:1003188|library name=IARPA3_best_tissue_add_info
 MS:1003191|library URI=https://chemdata.nist.gov/dokuwiki/doku.php?id=peptidew:lib:human_skin_hair
 MS:1001017|release date=Oct. 01, 2021

diff --git a/implementations/python/mzlib/attributes.py b/implementations/python/mzlib/attributes.py
@@ -2,7 +2,7 @@
 
 from typing import (
     Any, DefaultDict, Iterable,
-    Iterator, Optional, Tuple,
+    Iterator, Optional, Set, Tuple,
     Union, List, Dict,
     Generic, TypeVar, Type
 )
@@ -147,6 +147,10 @@ def add_attribute(self, key: str, value, group_identifier: Optional[str] = None)
             The attribute group identifier to use, if any. If not provided,
             no group is assumed.
         """
+        if group_identifier is not None:
+            int_group_identifier = int(group_identifier)
+            if int_group_identifier <= self.group_counter:
+                self.group_counter = int_group_identifier + 1
         items = Attribute(key, value, group_identifier)
         self.attributes.append(items)
         index = len(self.attributes) - 1
@@ -550,6 +554,9 @@ def add_attribute(self, key, value, group_identifier=None) -> Union[Any, List[An
         """
         return self.attributes.add_attribute(key, value, group_identifier=group_identifier)
 
+    def add_attribute_group(self, attributes: List[Union[Attribute, Tuple[str, Any]]]):
+        self.attributes.add_attribute_group(attributes)
+
     def replace_attribute(self, key, value, group_identifier=None):
         return self.attributes.replace_attribute(key, value, group_identifier=group_identifier)
 
@@ -717,10 +724,19 @@ def __init__(self, attributes):
 
 class AttributeSet(AttributedEntity):
     name: str
+    _names_to_override: Set[str]
 
     def __init__(self, name: str, attributes: Iterable = None, **kwargs):
         super().__init__(attributes, **kwargs)
         self.name = name
+        self._names_to_override = self._get_names_to_override()
+
+    def _get_names_to_override(self):
+        keys = set()
+        for attr in self.attributes:
+            if attr.group_id is None:
+                keys.add(attr.key)
+        return keys
 
     def member_of(self, target: Attributed) -> bool:
         for attrib in self.attributes:
@@ -730,7 +746,8 @@ def member_of(self, target: Attributed) -> bool:
                 return False
         return True
 
-    def apply(self, target: Attributed, ):
+    def apply(self, target: Attributed):
+
         terms_to_remove: List[Tuple[str, Union[Attribute, List[Attribute]]]] = []
         for key in self.attributes.keys():
             if target.has_attribute(key):

diff --git a/implementations/python/mzlib/backends/base.py b/implementations/python/mzlib/backends/base.py
@@ -56,7 +56,11 @@ def __new__(mcs, name, parents, attrs):
 
         file_extension = attrs.get("file_format")
         if file_extension is not None:
-            new_type._file_extension_to_implementation[file_extension] = new_type
+            if isinstance(file_extension, list):
+                for ext in file_extension:
+                    new_type._file_extension_to_implementation[ext] = new_type
+            else:
+                new_type._file_extension_to_implementation[file_extension] = new_type
 
         format_name = attrs.get("format_name")
         if format_name is not None:
@@ -126,6 +130,8 @@ def guess_from_filename(cls, filename: Union[str, Path, io.FileIO]) -> bool:
             return False
         if filename.endswith(".gz"):
             filename = filename[:-3]
+        if isinstance(cls.file_format, list):
+            return any(filename.endswith(ext) for ext in cls.file_format)
         return filename.endswith(cls.file_format)
 
     @classmethod
@@ -334,7 +340,7 @@ def __getitem__(self, i) -> Union[Spectrum, List[Spectrum]]:
     @classmethod
     def has_index_preference(cls, filename: Union[str, Path, io.FileIO]) -> Type[IndexBase]:
         """
-        Does this backend prefer a particular index for this file?
+        Check if this backend prefers a particular index for this file.
 
         The base implementation checks to see if there is a SQL index
         for the filename provided, and if so, prefers :class:`~.SQLIndex`.
@@ -542,6 +548,8 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None) -
             offset = self.index.offset_for(spectrum_number)
         elif spectrum_name is not None:
             offset = self.index.offset_for(spectrum_name)
+        else:
+            raise ValueError("Must provide either spectrum_number or spectrum_name argument")
         buffer = self._get_lines_for(offset)
         spectrum = self._parse_from_buffer(buffer, spectrum_number)
         return spectrum
@@ -670,6 +678,8 @@ def close(self):
 
 
 class LibraryIterator(AttributedEntity, _LibraryViewMixin, Iterator[Spectrum]):
+    """An iterator wrapper for a library source that doesn't permit random access"""
+
     backend: SpectralLibraryBackendBase
     attributes: Attributed
     iter: Iterator[Spectrum]

diff --git a/implementations/python/mzlib/backends/bibliospec.py b/implementations/python/mzlib/backends/bibliospec.py
@@ -134,7 +134,10 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None):
         """
         if spectrum_number is None:
             raise ValueError("Only spectrum number queries are supported. spectrum_number must have an integer value")
-
+        try:
+            spectrum_number = int(spectrum_number)
+        except (ValueError, TypeError):
+            raise ValueError(f"spectrum_number must have an integer value, received {spectrum_number!r}") from None
         info = self.connection.execute("SELECT * FROM RefSpectra WHERE id = ?", (spectrum_number, )).fetchone()
         spectrum = self._new_spectrum()
         spectrum.key = info['id']

diff --git a/implementations/python/mzlib/backends/encyclopedia.py b/implementations/python/mzlib/backends/encyclopedia.py
@@ -131,6 +131,10 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None):
         """
         if spectrum_number is None:
             raise ValueError("Only spectrum number queries are supported. spectrum_number must have an integer value")
+        try:
+            spectrum_number = int(spectrum_number)
+        except (TypeError, ValueError):
+            raise ValueError(f"spectrum_number must have an integer value, received {spectrum_number!r}") from None
 
         info = self.connection.execute("SELECT rowid, * FROM entries WHERE rowid = ?;", (spectrum_number, )).fetchone()
         spectrum = self._new_spectrum()

diff --git a/implementations/python/mzlib/backends/json.py b/implementations/python/mzlib/backends/json.py
@@ -9,7 +9,7 @@
 from mzlib.cluster import SpectrumCluster
 
 from mzlib.index import MemoryIndex
-from mzlib.attributes import AttributeManager, Attributed
+from mzlib.attributes import AttributeManager, Attributed, AttributeSet
 from mzlib.annotation import parse_annotation, IonAnnotationBase
 from mzlib.analyte import Analyte, Interpretation, FIRST_INTERPRETATION_KEY
 from mzlib.spectrum import Spectrum
@@ -46,7 +46,21 @@
 
 
 class JSONSpectralLibrary(SpectralLibraryBackendBase):
-    file_format = "mzlb.json"
+    """
+    A reader for the JSON serialization of the mzSpecLib spectral library format.
+
+    .. note::
+
+        Unlike other format readers, this type does not parse incrementally, it instead
+        parses the entire JSON document in-memory and stores the parsed object structure.
+        The JSON objects are then converted into :mod:`mzlib` types upon request. This is
+        because incremental JSON parsing is substantially more difficult to do in a byte
+        aware manner, not to mention slow, in Python.
+
+        This may lead to large memory overhead when reading large libraries in JSON format.
+    """
+
+    file_format = ["mzlb.json", "mzlib.json"]
     format_name = "json"
 
     def __init__(self, filename, index_type=None, read_metadata=True):
@@ -79,9 +93,17 @@ def _load_buffer(self, filename_or_stream: Union[str, Path, io.FileIO, Mapping])
             self.buffer = json.load(self.handle)
             self.handle.close()
 
+    def _load_attribute_sets(self, attribute_sets: dict):
+        return {
+            k: self._fill_attributes(v, AttributeSet(k, [])) for k, v in attribute_sets.items()
+        }
+
     def read_header(self) -> bool:
         if self.buffer:
             self._fill_attributes(self.buffer.get(LIBRARY_METADATA_KEY), self.attributes)
+            self.analyte_attribute_sets.update(self._load_attribute_sets(self.buffer.get(ANALYTE_CLASSES, {})))
+            self.spectrum_attribute_sets.update(self._load_attribute_sets(self.buffer.get(SPECTRUM_CLASSES, {})))
+            self.interpretation_attribute_sets.update(self._load_attribute_sets(self.buffer.get(INTERPRETATION_CLASSES, {})))
             return True
         return False
 
@@ -135,6 +157,8 @@ def get_spectrum(self, spectrum_number: int=None, spectrum_name: str=None) -> Sp
             offset = self.index.offset_for(spectrum_number)
         elif spectrum_name is not None:
             offset = self.index.offset_for(spectrum_name)
+        else:
+            raise ValueError("Must provide either spectrum_number or spectrum_name argument")
         data = self.buffer[SPECTRA_KEY][offset]
         spectrum = self._make_spectrum_from_payload(data)
         return spectrum
@@ -147,6 +171,8 @@ def get_cluster(self, cluster_number: int) -> SpectrumCluster:
 
     def _fill_attributes(self, attributes: List[Dict[str, Any]], store: Attributed,
                          context_type: AttributeSetTypes=None) -> Attributed:
+        last_group_id = None
+        current_group_id = None
         for attrib in attributes:
             if attrib['accession'] == "MS:1003212":
                 if context_type == AttributeSetTypes.analyte:
@@ -165,13 +191,17 @@ def _fill_attributes(self, attributes: List[Dict[str, Any]], store: Attributed,
                     value = f'{attrib["value_accession"]}|{attrib["value"]}'
                 else:
                     value = attrib['value']
-                # TODO: When applying an attribute set with a group in it, we
-                # may collide with an existing (hard-coded) group identifier.
-                # This behavior probably exists in the text format too.
+
                 group = attrib.get("cv_param_group")
-                store.add_attribute(key, value, group_identifier=group)
                 if group is not None:
-                    store.group_counter = int(group)
+                    if group != last_group_id:
+                        current_group_id = store.get_next_group_identifier()
+                        last_group_id = group
+                        group = current_group_id
+                    else:
+                        group = current_group_id
+
+                store.add_attribute(key, value, group_identifier=group)
         return store
 
     def _make_analyte_from_payload(self, analyte_id, analyte_d: Dict) -> Analyte:
@@ -269,6 +299,18 @@ def read(self):
 
 
 class JSONSpectralLibraryWriter(SpectralLibraryWriterBase):
+    """
+    Write a spectral library to the JSON serialization of the mzSpecLib spectral library format.
+
+    .. note::
+
+        Unlike other format writers, this writer buffers the entire library in memory as JSON-compatible
+        Python objects until the entire library is ready to be written out. This is because incrementally
+        writing JSON is substantially more difficult to do correctly.
+
+        This may lead to large memory overhead when writing large libraries in JSON format.
+    """
+
     file_format = "mzlb.json"
     format_name = "json"
     default_version = '1.0'

diff --git a/implementations/python/mzlib/backends/msp.py b/implementations/python/mzlib/backends/msp.py
@@ -803,13 +803,20 @@ class _UnknownTermTracker:
     counts: DefaultDict
 
     def add(self, key: str, value: Optional[str]=None):
+        """Add an unknown attribute to the tracker"""
         raise NotImplementedError()
 
     def items(self):
         return self.counts.items()
 
 
 class UnknownKeyValueTracker(_UnknownTermTracker):
+    """
+    A diagnostic tool for tracking attributes with values that the parser doesn't know how to interpret.
+
+    This tracker holds both keys and values, and can grow quite large. For debugging purposes only.
+    """
+
     def __init__(self) -> None:
         self.counts = DefaultDict(lambda: DefaultDict(int))
 
@@ -818,6 +825,8 @@ def add(self, key: str, value: Optional[str]=None):
 
 
 class UnknownKeyTracker(_UnknownTermTracker):
+    """A diagnostic tool for tracking attributes that the parser doesn't know how to interpret."""
+
     def __init__(self) -> None:
         self.counts = DefaultDict(int)
 
@@ -836,6 +845,28 @@ def add(self, key: str, value: Optional[str] = None):
 
 
 class MSPSpectralLibrary(_PlainTextSpectralLibraryBackendBase):
+    """
+    A reader for the plain text NIST MSP spectral library format.
+
+    The MSP format is only roughly defined, and places few
+    constraints on the meanings of spectrum attributes. This parser
+    attempts to cover a variety of different ways that MSPs found
+    "in the wild" have denoted different spectrum properties, but
+    is neither exhaustive nor nuanced enough to know from context
+    exactly what those files' authors intended, making a best guess
+    at what they correspond to in the controlled vocabulary mapping
+    for :mod:`mzlib`.
+
+
+    Attributes
+    ----------
+    modification_parser : :class:`ModificationParser`
+        A parser for peptide modifications
+    unknown_attributes : :class:`_UnknownTermTracker`
+        A tracker for unknown attributes. Used to tell how much information
+        the reader is unable to map onto the controlled vocabulary.
+    """
+
     file_format = "msp"
     format_name = "msp"
 
@@ -1192,6 +1223,35 @@ def _parse_comment(self, value: str, attributes: Attributed):
                 attributes[item] = None
 
     def _make_attribute_handlers(self):
+        """
+        Create the attribute handling scopes that map this flavor of MSP's
+        attributes onto controlled vocabulary terms in context.
+
+        This method should be overridden in sub-classes to allow them
+        to change the meanings of attributes, add new ones, or otherwise
+        redirect how they are interpreted.
+
+        See the :class:`AttributeHandler` type tree for more details about
+        how the distributed predicates are resolved.
+
+        Returns
+        -------
+        other_manager : :class:`AttributeHandler`
+            The attribute handler for uncategorized attributes that will be added
+            to a :class:`Spectrum`.
+        analyte_manager : :class:`AttributeHandler`
+            The attribute handler for attributes that will be added to an :class:`Analyte`
+        interpretation_manager : :class:`AttributeHandler`
+            The attribute handler for attributes that will be added to an :class:`Interpretation`
+        interpretation_member_manager : :class:`AttributeHandler`
+            The attribute handler for attributes that will be added to an :class:`InterpretationMember`
+        spectrum_manager : :class:`AttributeHandler`
+            The attribute handler for attributes that will be added to a :class:`Spectrum`
+        analyte_fallback_manager : :class:`AttributeHandler`
+            The attribute handler for attributes that will be tried for any attribute
+            that fails to be categorized by all of the other managers to be added to the
+            :class:`Analyte` before labeling the attribute as "unknown".
+        """
         other_manager = MappingAttributeHandler(other_terms)
         analyte_manager = MappingAttributeHandler(analyte_terms)
         interpretation_manager = MappingAttributeHandler(interpretation_terms)
@@ -1414,6 +1474,8 @@ def get_spectrum(self, spectrum_number: int=None, spectrum_name: str=None) -> Sp
             index_record = self.index.record_for(spectrum_name)
             spectrum_number = index_record.number
             offset = index_record.offset
+        else:
+            raise ValueError("Must provide either spectrum_number or spectrum_name argument")
         buffer = self._get_lines_for(offset)
         spectrum = self._parse(buffer, index_record.index)
         return spectrum