diff --git a/examples/NIST/IARPA3_best_tissue_add_info.head.mzlib.txt b/examples/NIST/IARPA3_best_tissue_add_info.head.mzlib.txt index cc7a5b1..54919fa 100644 --- a/examples/NIST/IARPA3_best_tissue_add_info.head.mzlib.txt +++ b/examples/NIST/IARPA3_best_tissue_add_info.head.mzlib.txt @@ -1,4 +1,5 @@ +MS:1003186|library format version=1.0 MS:1003188|library name=IARPA3_best_tissue_add_info MS:1003191|library URI=https://chemdata.nist.gov/dokuwiki/doku.php?id=peptidew:lib:human_skin_hair MS:1001017|release date=Oct. 01, 2021 diff --git a/implementations/python/mzlib/attributes.py b/implementations/python/mzlib/attributes.py index 097ac50..6106fb1 100644 --- a/implementations/python/mzlib/attributes.py +++ b/implementations/python/mzlib/attributes.py @@ -2,7 +2,7 @@ from typing import ( Any, DefaultDict, Iterable, - Iterator, Optional, Tuple, + Iterator, Optional, Set, Tuple, Union, List, Dict, Generic, TypeVar, Type ) @@ -147,6 +147,10 @@ def add_attribute(self, key: str, value, group_identifier: Optional[str] = None) The attribute group identifier to use, if any. If not provided, no group is assumed. 
""" + if group_identifier is not None: + int_group_identifier = int(group_identifier) + if int_group_identifier <= self.group_counter: + self.group_counter = int_group_identifier + 1 items = Attribute(key, value, group_identifier) self.attributes.append(items) index = len(self.attributes) - 1 @@ -550,6 +554,9 @@ def add_attribute(self, key, value, group_identifier=None) -> Union[Any, List[An """ return self.attributes.add_attribute(key, value, group_identifier=group_identifier) + def add_attribute_group(self, attributes: List[Union[Attribute, Tuple[str, Any]]]): + self.attributes.add_attribute_group(attributes) + def replace_attribute(self, key, value, group_identifier=None): return self.attributes.replace_attribute(key, value, group_identifier=group_identifier) @@ -717,10 +724,19 @@ def __init__(self, attributes): class AttributeSet(AttributedEntity): name: str + _names_to_override: Set[str] def __init__(self, name: str, attributes: Iterable = None, **kwargs): super().__init__(attributes, **kwargs) self.name = name + self._names_to_override = self._get_names_to_override() + + def _get_names_to_override(self): + keys = set() + for attr in self.attributes: + if attr.group_id is None: + keys.add(attr.key) + return keys def member_of(self, target: Attributed) -> bool: for attrib in self.attributes: @@ -730,7 +746,8 @@ def member_of(self, target: Attributed) -> bool: return False return True - def apply(self, target: Attributed, ): + def apply(self, target: Attributed): + terms_to_remove: List[Tuple[str, Union[Attribute, List[Attribute]]]] = [] for key in self.attributes.keys(): if target.has_attribute(key): diff --git a/implementations/python/mzlib/backends/base.py b/implementations/python/mzlib/backends/base.py index 5113b79..00d9669 100644 --- a/implementations/python/mzlib/backends/base.py +++ b/implementations/python/mzlib/backends/base.py @@ -56,7 +56,11 @@ def __new__(mcs, name, parents, attrs): file_extension = attrs.get("file_format") if file_extension is not 
None: - new_type._file_extension_to_implementation[file_extension] = new_type + if isinstance(file_extension, list): + for ext in file_extension: + new_type._file_extension_to_implementation[ext] = new_type + else: + new_type._file_extension_to_implementation[file_extension] = new_type format_name = attrs.get("format_name") if format_name is not None: @@ -126,6 +130,8 @@ def guess_from_filename(cls, filename: Union[str, Path, io.FileIO]) -> bool: return False if filename.endswith(".gz"): filename = filename[:-3] + if isinstance(cls.file_format, list): + return any(filename.endswith(ext) for ext in cls.file_format) return filename.endswith(cls.file_format) @classmethod @@ -334,7 +340,7 @@ def __getitem__(self, i) -> Union[Spectrum, List[Spectrum]]: @classmethod def has_index_preference(cls, filename: Union[str, Path, io.FileIO]) -> Type[IndexBase]: """ - Does this backend prefer a particular index for this file? + Check if this backend prefers a particular index for this file. The base implementation checks to see if there is a SQL index for the filename provided, and if so, prefers :class:`~.SQLIndex`. 
@@ -542,6 +548,8 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None) - offset = self.index.offset_for(spectrum_number) elif spectrum_name is not None: offset = self.index.offset_for(spectrum_name) + else: + raise ValueError("Must provide either spectrum_number or spectrum_name argument") buffer = self._get_lines_for(offset) spectrum = self._parse_from_buffer(buffer, spectrum_number) return spectrum @@ -670,6 +678,8 @@ def close(self): class LibraryIterator(AttributedEntity, _LibraryViewMixin, Iterator[Spectrum]): + """An iterator wrapper for a library source that doesn't permit random access""" + backend: SpectralLibraryBackendBase attributes: Attributed iter: Iterator[Spectrum] diff --git a/implementations/python/mzlib/backends/bibliospec.py b/implementations/python/mzlib/backends/bibliospec.py index 88c58ed..78b1f10 100644 --- a/implementations/python/mzlib/backends/bibliospec.py +++ b/implementations/python/mzlib/backends/bibliospec.py @@ -134,7 +134,10 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None): """ if spectrum_number is None: raise ValueError("Only spectrum number queries are supported. 
spectrum_number must have an integer value") - + try: + spectrum_number = int(spectrum_number) + except (ValueError, TypeError): + raise ValueError(f"spectrum_number must have an integer value, received {spectrum_number!r}") from None info = self.connection.execute("SELECT * FROM RefSpectra WHERE id = ?", (spectrum_number, )).fetchone() spectrum = self._new_spectrum() spectrum.key = info['id'] diff --git a/implementations/python/mzlib/backends/encyclopedia.py b/implementations/python/mzlib/backends/encyclopedia.py index d05ba4b..4cd7abb 100644 --- a/implementations/python/mzlib/backends/encyclopedia.py +++ b/implementations/python/mzlib/backends/encyclopedia.py @@ -131,6 +131,10 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None): """ if spectrum_number is None: raise ValueError("Only spectrum number queries are supported. spectrum_number must have an integer value") + try: + spectrum_number = int(spectrum_number) + except (TypeError, ValueError): + raise ValueError(f"spectrum_number must have an integer value, received {spectrum_number!r}") from None info = self.connection.execute("SELECT rowid, * FROM entries WHERE rowid = ?;", (spectrum_number, )).fetchone() spectrum = self._new_spectrum() diff --git a/implementations/python/mzlib/backends/json.py b/implementations/python/mzlib/backends/json.py index cd4d164..22df2fa 100644 --- a/implementations/python/mzlib/backends/json.py +++ b/implementations/python/mzlib/backends/json.py @@ -9,7 +9,7 @@ from mzlib.cluster import SpectrumCluster from mzlib.index import MemoryIndex -from mzlib.attributes import AttributeManager, Attributed +from mzlib.attributes import AttributeManager, Attributed, AttributeSet from mzlib.annotation import parse_annotation, IonAnnotationBase from mzlib.analyte import Analyte, Interpretation, FIRST_INTERPRETATION_KEY from mzlib.spectrum import Spectrum @@ -46,7 +46,21 @@ class JSONSpectralLibrary(SpectralLibraryBackendBase): - file_format = "mzlb.json" + """ + A 
reader for the JSON serialization of the mzSpecLib spectral library format. + + .. note:: + + Unlike other format readers, this type does not parse incrementally, it instead + parses the entire JSON document in-memory and stores the parsed object structure. + The JSON objects are then converted into :mod:`mzlib` types upon request. This is + because incremental JSON parsing is substantially more difficult to do in a byte + aware manner, not to mention slow, in Python. + + This may lead to large memory overhead when reading large libraries in JSON format. + """ + + file_format = ["mzlb.json", "mzlib.json"] format_name = "json" def __init__(self, filename, index_type=None, read_metadata=True): @@ -79,9 +93,17 @@ def _load_buffer(self, filename_or_stream: Union[str, Path, io.FileIO, Mapping]) self.buffer = json.load(self.handle) self.handle.close() + def _load_attribute_sets(self, attribute_sets: dict): + return { + k: self._fill_attributes(v, AttributeSet(k, [])) for k, v in attribute_sets.items() + } + def read_header(self) -> bool: if self.buffer: self._fill_attributes(self.buffer.get(LIBRARY_METADATA_KEY), self.attributes) + self.analyte_attribute_sets.update(self._load_attribute_sets(self.buffer.get(ANALYTE_CLASSES, {}))) + self.spectrum_attribute_sets.update(self._load_attribute_sets(self.buffer.get(SPECTRUM_CLASSES, {}))) + self.interpretation_attribute_sets.update(self._load_attribute_sets(self.buffer.get(INTERPRETATION_CLASSES, {}))) return True return False @@ -135,6 +157,8 @@ def get_spectrum(self, spectrum_number: int=None, spectrum_name: str=None) -> Sp offset = self.index.offset_for(spectrum_number) elif spectrum_name is not None: offset = self.index.offset_for(spectrum_name) + else: + raise ValueError("Must provide either spectrum_number or spectrum_name argument") data = self.buffer[SPECTRA_KEY][offset] spectrum = self._make_spectrum_from_payload(data) return spectrum @@ -147,6 +171,8 @@ def get_cluster(self, cluster_number: int) -> SpectrumCluster: 
def _fill_attributes(self, attributes: List[Dict[str, Any]], store: Attributed, context_type: AttributeSetTypes=None) -> Attributed: + last_group_id = None + current_group_id = None for attrib in attributes: if attrib['accession'] == "MS:1003212": if context_type == AttributeSetTypes.analyte: @@ -165,13 +191,17 @@ def _fill_attributes(self, attributes: List[Dict[str, Any]], store: Attributed, value = f'{attrib["value_accession"]}|{attrib["value"]}' else: value = attrib['value'] - # TODO: When applying an attribute set with a group in it, we - # may collide with an existing (hard-coded) group identifier. - # This behavior probably exists in the text format too. + group = attrib.get("cv_param_group") - store.add_attribute(key, value, group_identifier=group) if group is not None: - store.group_counter = int(group) + if group != last_group_id: + current_group_id = store.get_next_group_identifier() + last_group_id = group + group = current_group_id + else: + group = current_group_id + + store.add_attribute(key, value, group_identifier=group) return store def _make_analyte_from_payload(self, analyte_id, analyte_d: Dict) -> Analyte: @@ -269,6 +299,18 @@ def read(self): class JSONSpectralLibraryWriter(SpectralLibraryWriterBase): + """ + Write a spectral library to the JSON serialization of the mzSpecLib spectral library format. + + .. note:: + + Unlike other format writers, this writer buffers the entire library in memory as JSON-compatible + Python objects until the entire library is ready to be written out. This is because incrementally + writing JSON is substantially more difficult to do correctly. + + This may lead to large memory overhead when writing large libraries in JSON format. 
+ """ + file_format = "mzlb.json" format_name = "json" default_version = '1.0' diff --git a/implementations/python/mzlib/backends/msp.py b/implementations/python/mzlib/backends/msp.py index 7eeef08..68cbe00 100644 --- a/implementations/python/mzlib/backends/msp.py +++ b/implementations/python/mzlib/backends/msp.py @@ -803,6 +803,7 @@ class _UnknownTermTracker: counts: DefaultDict def add(self, key: str, value: Optional[str]=None): + """Add an unknown attribute to the tracker""" raise NotImplementedError() def items(self): @@ -810,6 +811,12 @@ def items(self): class UnknownKeyValueTracker(_UnknownTermTracker): + """ + A diagnostic tool for tracking attributes with values that the parser doesn't know how to interpret. + + This tracker holds both keys and values, and can grow quite large. For debugging purposes only. + """ + def __init__(self) -> None: self.counts = DefaultDict(lambda: DefaultDict(int)) @@ -818,6 +825,8 @@ def add(self, key: str, value: Optional[str]=None): class UnknownKeyTracker(_UnknownTermTracker): + """A diagnostic tool for tracking attributes that the parser doesn't know how to interpret.""" + def __init__(self) -> None: self.counts = DefaultDict(int) @@ -836,6 +845,28 @@ def add(self, key: str, value: Optional[str] = None): class MSPSpectralLibrary(_PlainTextSpectralLibraryBackendBase): + """ + A reader for the plain text NIST MSP spectral library format. + + The MSP format is only roughly defined, and does places few + constraints on the meanings of spectrum attributes. 
This parser + attempts to cover a variety of different ways that MSPs found + "in the wild" have denoted different spectrum properties, but + is neither exhaustive nor nuanced enough to know from context + exactly what those files' authors intended, making a best guess + at what they correspond to in the controlled vocabulary mapping + for :mod:`mzlib` + + + Attributes + ---------- + modification_parser : :class:`ModificationParser` + A parser for peptide modifications + unknown_attributes : :class:`_UnknownTermTracker` + A tracker for unknown attributes. Used to tell how much information + the reader is unable to map onto the controlled vocabulary. + """ + file_format = "msp" format_name = "msp" @@ -1192,6 +1223,35 @@ def _parse_comment(self, value: str, attributes: Attributed): attributes[item] = None def _make_attribute_handlers(self): + """ + Create the attribute handling scopes that map this flavor of MSP's + attributes onto controlled vocabulary terms in context. + + This method should be overridden in sub-classes to allow them + to change the meanings of attributes, add new ones, or otherwise + redirect how they are interpreted. + + See the :class:`AttributeHandler` type tree for more details about + how the distributed predicates are resolved. + + Returns + ------- + other_manager : :class:`AttributeHandler` + The attribute handler for uncategorized attributes that will be added + to a :class:`Spectrum`. 
+ analyte_manager : :class:`AttributeHandler` + The attribute handler for attributes that will be added to a :class:`Analyte` + interpretation_manager : :class:`AttributeHandler` + The attribute handler for attributes that will be added to a :class:`Interpretation` + interpretation_member_manager : :class:`AttributeHandler` + The attribute handler for attributes that will be added to a :class:`InterpretationMember` + spectrum_manager : :class:`AttributeHandler` + The attribute handler for attributes that will be added to a :class:`Spectrum` + analyte_fallback_manager : :class:`AttributeHandler` + The attribute handler for attributes that will be tried for any attribute + that fails to be categorized by all of the other managers to be added to the + :class:`Analyte` before labeling the attribute as "unknown". + """ other_manager = MappingAttributeHandler(other_terms) analyte_manager = MappingAttributeHandler(analyte_terms) interpretation_manager = MappingAttributeHandler(interpretation_terms) @@ -1414,6 +1474,8 @@ def get_spectrum(self, spectrum_number: int=None, spectrum_name: str=None) -> Sp index_record = self.index.record_for(spectrum_name) spectrum_number = index_record.number offset = index_record.offset + else: + raise ValueError("Must provide either spectrum_number or spectrum_name argument") buffer = self._get_lines_for(offset) spectrum = self._parse(buffer, index_record.index) return spectrum diff --git a/implementations/python/mzlib/backends/text.py b/implementations/python/mzlib/backends/text.py index 959fde5..9dd9b0d 100644 --- a/implementations/python/mzlib/backends/text.py +++ b/implementations/python/mzlib/backends/text.py @@ -6,13 +6,14 @@ import enum import numbers +from dataclasses import dataclass from collections import deque from typing import ClassVar, List, Optional, Tuple, Union, Iterable from mzlib.annotation import parse_annotation from mzlib.spectrum import Spectrum from mzlib.cluster import SpectrumCluster -from mzlib.attributes import 
AttributeManager, Attributed, AttributeSet +from mzlib.attributes import Attribute, AttributeManager, Attributed, AttributeSet from mzlib.analyte import Analyte, Interpretation, InterpretationMember from mzlib.validate.object_rule import ValidationWarning @@ -82,7 +83,23 @@ class _LibraryParserStateEnum(enum.Enum): } -class _EntryParser: +class _Scope: + state: _SpectrumParserStateEnum + attribute_group: Optional[str] + working_attribute_group: Optional[str] + + def __init__(self, state: _SpectrumParserStateEnum, attribute_group: Optional[str] = None, working_attribute_group: Optional[str] = None) -> None: + if working_attribute_group is None: + working_attribute_group = attribute_group + self.state = state + self.attribute_group = attribute_group + self.working_attribute_group = working_attribute_group + + def __repr__(self): + return f"{self.__class__.__name__}({self.state}, {self.attribute_group}, {self.working_attribute_group})" + + +class _EntryParser(_Scope): """ Moves the complexity and state management involved in parsing a full entry out of :class:`TextSpectrumLibrary`, allowing it @@ -92,12 +109,13 @@ class _EntryParser: library: 'TextSpectralLibrary' state: _SpectrumParserStateEnum + spectrum: Optional[Spectrum] cluster: Optional[SpectrumCluster] analyte: Optional[Analyte] interpretation: Optional[Interpretation] interpretation_member: Optional[InterpretationMember] - + attribute_group: Optional[str] aggregation_types: List[str] peak_list: List[Tuple] @@ -105,10 +123,11 @@ class _EntryParser: line_number: int = -1 def __init__(self, library, start_line_number: int, spectrum_index: Optional[int]) -> None: + super().__init__(_SpectrumParserStateEnum.header, None) + self.library = library self.start_line_number = start_line_number or 0 self.spectrum_index = spectrum_index - self.state = _SpectrumParserStateEnum.header self.aggregation_types = None self.peak_list = [] @@ -119,6 +138,9 @@ def __init__(self, library, start_line_number: int, spectrum_index: 
Optional[int self.interpretation = None self.interpretation_member = None + def _parse_attribute_into(self, line: str, store: Attributed, line_number_message: str): + self.library._parse_attribute_into(line, store, line_number_message, self) + def real_line_number_or_nothing(self): message = f" on line {self.line_number + self.start_line_number}" if self.spectrum_index is not None: @@ -163,8 +185,8 @@ def _parse_header(self, line): self.cluster.key = int(match.group(1)) or self.cluster.index - 1 return - self.library._parse_attribute_into( - line, self.spectrum, self.real_line_number_or_nothing, self.state) + self._parse_attribute_into( + line, self.spectrum, self.real_line_number_or_nothing) def _parse_interpretation(self, line): if START_OF_ANALYTE_MARKER.match(line): @@ -200,7 +222,7 @@ def _parse_interpretation(self, line): self.interpretation.add_member_interpretation(self.interpretation_member) return - self.library._parse_attribute_into( + self._parse_attribute_into( line, self.interpretation.attributes, self.real_line_number_or_nothing) self.library._analyte_interpretation_link(self.spectrum, self.interpretation) @@ -228,7 +250,7 @@ def _parse_interpretation_member(self, line): self.interpretation.add_member_interpretation(self.interpretation_member) return - self.library._parse_attribute_into( + self._parse_attribute_into( line, self.interpretation_member, self.real_line_number_or_nothing) def _parse_analyte(self, line): @@ -264,7 +286,7 @@ def _parse_analyte(self, line): self.spectrum.add_interpretation(self.interpretation) return - self.library._parse_attribute_into(line, self.analyte, self.real_line_number_or_nothing) + self._parse_attribute_into(line, self.analyte, self.real_line_number_or_nothing) def _parse_peaks(self, line): # TODO: When we know more about how different aggregations are formatted, @@ -328,8 +350,8 @@ def _parse_cluster(self, line): raise ValueError( f"Clusters should not include interpretation member sections 
{self.real_line_number_or_nothing()}") - self.library._parse_attribute_into( - line, self.cluster, self.real_line_number_or_nothing, self.state) + self._parse_attribute_into( + line, self.cluster, self.real_line_number_or_nothing) def parse(self, buffer: Iterable[str]): line: str @@ -377,7 +399,15 @@ def _is_header_line(line: Union[str, bytes]) -> bool: class TextSpectralLibrary(_PlainTextSpectralLibraryBackendBase): - file_format: ClassVar[str] = "mzlb.txt" + """ + A reader for the plain text serialization of the mzSpecLib spectral library format. + + This implementation may operate on a stream opened in binary mode or a file path. + If using a non-seekable stream, the random access or search methods may not be + supported. + """ + + file_format: ClassVar[List[str]] = ["mzlb.txt", "mzlib.txt"] format_name: ClassVar[str] = "text" @classmethod @@ -608,7 +638,7 @@ def create_index(self) -> int: self.index.commit() n_spectra += 1 logger.info( - f"Processed {file_offset} bytes, {n_spectra} spectra read, {n_clusters} read") + f"Processed {file_offset} bytes, {n_spectra} spectra read, {n_clusters} clusters read") elif entry_is_cluster: self.index.add_cluster( number=current_key, @@ -617,7 +647,7 @@ def create_index(self) -> int: self.index.commit() n_clusters += 1 logger.info( - f"Processed {file_offset} bytes, {n_spectra} spectra read, {n_clusters} read") + f"Processed {file_offset} bytes, {n_spectra} spectra read, {n_clusters} clusters read") #### Flush the index self.index.commit() @@ -648,47 +678,62 @@ def _prepare_attribute_dict(self, match): except KeyError: match['value'] = try_cast(value) - def _parse_attribute_into(self, line: str, store: Attributed, - line_number_message=lambda:'', - state: _SpectrumParserStateEnum=None) -> bool: + def _parse_attribute(self, line: str, line_number_message=lambda: '', scope: Optional[_Scope]=None) -> Union[Attribute, AttributeSet]: match = key_value_term_pattern.match(line) + if scope is None: + scope = _Scope(None, None) if 
match is not None: d = match.groupdict() self._prepare_attribute_dict(d) if d['term'] == ATTRIBUTE_SET_NAME: - if _SpectrumParserStateEnum.header == state: + if _SpectrumParserStateEnum.header == scope.state: attr_set = self.spectrum_attribute_sets[d['value']] - elif _SpectrumParserStateEnum.analyte == state: + elif _SpectrumParserStateEnum.analyte == scope.state: attr_set = self.analyte_attribute_sets[d['value']] - elif _SpectrumParserStateEnum.interpretation == state: + elif _SpectrumParserStateEnum.interpretation == scope.state: attr_set = self.interpretation_attribute_sets[d['value']] - elif _SpectrumParserStateEnum.cluster == state: + elif _SpectrumParserStateEnum.cluster == scope.state: attr_set = self.cluster_attribute_sets[d['value']] else: - raise ValueError(f"Cannot define attribute sets for {state}") - attr_set.apply(store) - else: - store.add_attribute(d['term'], try_cast(d['value'])) - return True - if line.startswith("["): + raise ValueError(f"Cannot define attribute sets for {scope.state}") + return attr_set + attr = Attribute(d["term"], try_cast(d["value"])) + return attr + elif line.startswith("["): match = grouped_key_value_term_pattern.match(line) if match is not None: d = match.groupdict() self._prepare_attribute_dict(d) - store.add_attribute( - d['term'], try_cast(d['value']), d['group_id']) - store.group_counter = int(d['group_id']) - return True + attr = Attribute(d['term'], try_cast(d['value']), d['group_id']) + return attr else: - raise ValueError( - f"Malformed grouped attribute {line}{line_number_message()}") + raise ValueError(f"Malformed grouped attribute {line}{line_number_message()}") elif "=" in line: name, value = line.split("=", 1) - store.add_attribute(name, try_cast(value)) - return True + attr = Attribute(name, try_cast(value)) + return attr else: raise ValueError(f"Malformed attribute line {line}{line_number_message()}") + def _parse_attribute_into(self, line: str, store: Attributed, + line_number_message=lambda:'', + scope: 
Optional[_Scope]=None) -> bool: + if scope is None: + scope = _Scope(None, None) + attr = self._parse_attribute(line, line_number_message, scope) + if isinstance(attr, AttributeSet): + attr.apply(store) + else: + if attr.group_id: + if attr.group_id != scope.attribute_group: + scope.attribute_group = attr.group_id + scope.working_attribute_group = store.get_next_group_identifier() + attr.group_id = scope.working_attribute_group + else: + attr.group_id = scope.working_attribute_group + store.add_attribute(attr.key, attr.value, attr.group_id) + return True + def _parse(self, buffer: Iterable[str], spectrum_index: int = None, start_line_number: int=None) -> Union[Spectrum, SpectrumCluster]: parser = _EntryParser(self, start_line_number, spectrum_index) @@ -708,6 +753,8 @@ def get_spectrum(self, spectrum_number: int=None, index_record = self.index.record_for(spectrum_name) offset = index_record.offset spectrum_number = index_record.number + else: + raise ValueError("Must provide either spectrum_number or spectrum_name argument") buffer = self._get_lines_for(offset) spectrum = self._parse(buffer, index_record.index) @@ -721,6 +768,20 @@ def get_cluster(self, cluster_number: int) -> SpectrumCluster: class TextSpectralLibraryWriter(SpectralLibraryWriterBase): + """ + Write a spectral library to the plain text serialization of the mzSpecLib spectral library format. + + Attributes + ---------- + version : str + The format version to write in semver-compatible notation + compact_interpretation : bool, default :const:`True` + Whether to elect to write compact interpretation member sections when there is only + one interpretation and only one interpretation member by inlining the interpretation + member attributes into the interpretation. Both forms are valid, one is just less + verbose. 
+ """ + file_format = "mzlb.txt" format_name = "text" default_version = '1.0' @@ -754,9 +815,7 @@ def write_header(self, library: SpectralLibraryBackendBase): else: version = self.version self.handle.write("\n" % (version, )) - self._write_attributes( - self._filter_attributes(library.attributes, lambda x: x.key != FORMAT_VERSION_TERM) - ) + self._write_attributes(library.attributes) for attr_set in library.spectrum_attribute_sets.values(): self.write_attribute_set(attr_set, AttributeSetTypes.spectrum) diff --git a/implementations/python/mzlib/backends/utils.py b/implementations/python/mzlib/backends/utils.py index b65d29b..0985e4c 100644 --- a/implementations/python/mzlib/backends/utils.py +++ b/implementations/python/mzlib/backends/utils.py @@ -80,6 +80,10 @@ def __getattr__(self, attr): def try_cast(value: Any) -> Union[str, int, float, Any]: + """ + Given a value, if it is a string, attempt to convert it to a numeric type, + or else return it as is. + """ if value is None: return value if not isinstance(value, str): @@ -122,7 +126,8 @@ def test_gzipped(f) -> bool: def starts_with_gz_magic(bytestring): - '''Tests whether or not a byte string starts with + """ + Test whether or not a byte string starts with the GZIP magic bytes. Parameters @@ -133,22 +138,21 @@ def starts_with_gz_magic(bytestring): Returns ------- bool - ''' + """ return bytestring.startswith(GZIP_MAGIC) def open_stream(f: Union[io.IOBase, os.PathLike], mode='rt', buffer_size: Optional[int]=None, encoding: Optional[str]='utf8', newline=None): - '''Select the file reading type for the given path or stream. + """ + Select the file reading type for the given path or stream. Detects whether the file is gzip encoded. - ''' + """ if buffer_size is None: buffer_size = DEFAULT_BUFFER_SIZE if 'r' in mode: if not hasattr(f, 'read'): f = io.open(f, 'rb') - # On Py2, dill doesn't behave correctly with io-derived objects, so we have to - # patch it below. Don't try to wrap an io.TextIOWrapper on Py3. 
if not isinstance(f, io.BufferedReader) and not isinstance(f, io.TextIOWrapper): buffered_reader = io.BufferedReader(f, buffer_size) else: diff --git a/implementations/python/mzlib/tools/cli.py b/implementations/python/mzlib/tools/cli.py index dee3771..7c3abcd 100644 --- a/implementations/python/mzlib/tools/cli.py +++ b/implementations/python/mzlib/tools/cli.py @@ -65,7 +65,7 @@ def main(): type=click.Choice(sorted(SpectralLibraryBackendBase._file_extension_to_implementation)), help='The file format of the input file. If omitted, will attempt to infer automatically.') def describe(path, diagnostics=False, input_format=None): - """Produces a minimal textual description of a spectral library.""" + """Produce a minimal textual description of a spectral library.""" click.echo("Describing \"%s\"" % (path,)) if SQLIndex.exists(path): index_type = SQLIndex