Source code for mammos_entity._read_files

from __future__ import annotations

import csv
import os
import re
from collections.abc import Mapping
from pathlib import Path
from typing import TYPE_CHECKING

import h5py
import mammos_units as u
import pandas as pd
import yaml

from mammos_entity._entity import Entity
from mammos_entity._entity_collection import EntityCollection

if TYPE_CHECKING:
    import mammos_units
    import numpy

    import mammos_entity


[docs] def from_csv(filename: str | Path) -> mammos_entity.EntityCollection: """Read MaMMoS CSV file. The required file format is described in :py:func:`~mammos_entity.EntityCollection.to_csv`. Args: filename: Name of the file to read. The file is read as CSV no matter the file extension. Returns: A collection object providing access to all entities saved in the file. .. seealso:: :py:func:`mammos_entity.EntityCollection.to_csv` """ with open(filename, newline="") as csvfile: file_version_information = csvfile.readline() version = re.search(r"v\d+", file_version_information) if version is None: raise RuntimeError( f"Cannot read version information from file {filename}. " f"Content of the first line: '{file_version_information}'" ) if version.group() not in [f"v{i}" for i in range(1, 4)]: raise RuntimeError( f"Reading mammos csv {version.group()} is not supported." ) version_number = int(version.group().lstrip("v")) collection_description = [] # read description position = csvfile.tell() if csvfile.readline().startswith("#--"): while True: line = csvfile.readline() if line == "": raise RuntimeError( "CSV description block is not terminated by a closing dashed " "line." ) if line.startswith("#--"): break else: collection_description.append( line.removeprefix("# ").rstrip("\r\n") ) else: # reset the file position csvfile.seek(position) # read ontology metadata if version_number >= 3: reader = csv.reader( csvfile, delimiter=",", quoting=csv.QUOTE_MINIMAL, lineterminator=os.linesep, ) try: ontology_labels = next(reader) descriptions = next(reader) next(reader) # ignore IRIs units = next(reader) except StopIteration as exc: raise RuntimeError( "CSV metadata is incomplete. Expected four metadata rows before " "the data table." ) from exc else: metadata_rows = [csvfile.readline() for _ in range(3)] if any(row == "" for row in metadata_rows): raise RuntimeError( "CSV metadata is incomplete. Expected three metadata rows before " "the data table." ) ontology_labels = metadata_rows[0].strip().removeprefix("#").split(",") # ignore IRIs: metadata_rows[1] units = metadata_rows[2].strip().removeprefix("#").split(",") descriptions = [""] * len(ontology_labels) try: data = pd.read_csv(csvfile) except pd.errors.EmptyDataError as exc: raise RuntimeError("CSV data table is empty.") from exc names = data.keys() scalar_data = len(data) == 1 try: columns = list(zip(names, ontology_labels, descriptions, units, strict=True)) except ValueError as exc: raise RuntimeError( "CSV metadata columns and data columns do not match." ) from exc collection = EntityCollection(description="\n".join(collection_description)) for name, ontology_label, description, unit in columns: data_values = data[name].values if not scalar_data else data[name].values[0] if ontology_label: entity = Entity( ontology_label=ontology_label, value=data_values, unit=unit, description=description, ) collection[name] = entity elif unit: collection[name] = u.Quantity(data_values, unit) else: collection[name] = data_values return collection
[docs] def from_yaml(filename: str | Path) -> mammos_entity.EntityCollection: """Read MaMMoS YAML file. The required file format is described in :py:func:`~mammos_entity.EntityCollection.to_yaml`. Args: filename: Name of the file to read. The file is read as YAML no matter the file extension. Returns: A collection object providing access to all entities saved in the file. .. seealso:: :py:func:`mammos_entity.EntityCollection.to_yaml` """ with open(filename) as f: first_line = f.readline().strip() if first_line == "# mammos yaml v2": return _from_yaml_v2(filename) return _from_yaml_v1(filename)
def _from_yaml_v1(filename: str | Path) -> mammos_entity.EntityCollection: """Read MaMMoS YAML file v1.""" with open(filename) as f: file_content = yaml.safe_load(f) if not isinstance(file_content, Mapping): raise RuntimeError("mammos yaml v1 files must contain a top-level mapping.") if set(file_content.keys()) != {"metadata", "data"}: raise RuntimeError( "mammos yaml v1 files must have exactly two top-level keys, " "'metadata' and 'data'." ) if not ( "metadata" in file_content and isinstance(file_content["metadata"], Mapping) and "version" in file_content["metadata"] and file_content["metadata"]["version"] == "v1" ): raise RuntimeError( "Wrong mammos yaml v1 syntax. Expected 'metadata' key with " "the 'version' equal to 'v1'." ) collection_description = file_content["metadata"].get("description") or "" if not isinstance(file_content.get("data"), Mapping): raise RuntimeError("'data' must be a mapping.") if not file_content["data"]: raise RuntimeError("'data' does not contain anything.") collection = EntityCollection(description=collection_description) for key, item in file_content["data"].items(): collection[key] = _parse_yaml_leaf_v1(item, key) return collection def _from_yaml_v2(filename: str | Path) -> mammos_entity.EntityCollection: """Read MaMMoS YAML file v2.""" with open(filename) as f: file_content = yaml.safe_load(f) if not isinstance(file_content, Mapping): raise RuntimeError("mammos yaml v2 files must contain a top-level mapping.") if set(file_content.keys()) != {"metadata", "description", "data"}: raise RuntimeError( "mammos yaml v2 files must have exactly three top-level keys, " "'metadata', 'description' and 'data'." ) root = { "description": file_content["description"], "data": file_content["data"], } return _parse_yaml_collection_v2(root, "") def _parse_yaml_leaf_v1(item: Mapping, key: str): if not isinstance(item, Mapping): raise RuntimeError( f"Element '{key}' must be a mapping, found {type(item).__name__}." ) keys = set(item) v1_keys = {"ontology_label", "ontology_iri", "unit", "value"} if keys != v1_keys: raise RuntimeError( f"Element '{key}' has invalid keys: {sorted(keys)}." f" Expected {sorted(v1_keys)}." ) if item["ontology_label"] is not None: entity = Entity( ontology_label=item["ontology_label"], value=item["value"], unit=item["unit"], ) return entity elif item["unit"] is not None: return u.Quantity(item["value"], item["unit"]) else: return item["value"] def _parse_yaml_leaf_v2(item: Mapping, key: str): key_display = key or "top-level collection" if not isinstance(item, Mapping): raise RuntimeError( f'Entry "{key_display}" is an invalid entity-like in mammos yaml v2: ' f"expected a mapping, found {type(item).__name__}." ) keys = set(item) entity_keys = {"ontology_label", "description", "ontology_iri", "unit", "value"} quantity_keys = {"unit", "value"} value_keys = {"value"} if keys == entity_keys: if not isinstance(item["ontology_label"], str): raise RuntimeError( f'Entry "{key_display}" is an invalid entity-like in mammos yaml v2: ' f'key "ontology_label" must be a string, found ' f"{type(item['ontology_label']).__name__}." ) if not isinstance(item["description"], str): raise RuntimeError( f'Entry "{key_display}" is an invalid entity-like in mammos yaml v2: ' f'key "description" must be a string, found ' f"{type(item['description']).__name__}." ) if not isinstance(item["ontology_iri"], str): raise RuntimeError( f'Entry "{key_display}" is an invalid entity-like in mammos yaml v2: ' f'key "ontology_iri" must be a string, found ' f"{type(item['ontology_iri']).__name__}." ) entity = Entity( ontology_label=item["ontology_label"], value=item["value"], unit=item["unit"], description=item["description"], ) return entity elif keys == quantity_keys: return u.Quantity(item["value"], item["unit"]) elif keys == value_keys: return item["value"] else: expected = [sorted(entity_keys), sorted(quantity_keys), sorted(value_keys)] raise RuntimeError( f'Entry "{key_display}" is an invalid entity-like in mammos yaml v2: ' f"invalid keys {sorted(keys)}; expected one of {expected}." ) def _parse_yaml_collection_v2(node: Mapping, key: str) -> EntityCollection: key_display = key or "top-level collection" if set(node.keys()) != {"description", "data"}: raise RuntimeError( f'Entry "{key_display}" is an invalid collection in mammos yaml v2: ' f"invalid keys {sorted(node.keys())}; expected ['data', 'description']." ) description = node["description"] if not isinstance(description, str): raise RuntimeError( f'Entry "{key_display}" is an invalid collection in mammos yaml v2: ' f'key "description" must be a string, found {type(description).__name__}.' ) if not isinstance(node["data"], Mapping): raise RuntimeError( f'Entry "{key_display}" is an invalid collection in mammos yaml v2: ' f'key "data" must be a mapping, found {type(node["data"]).__name__}.' ) if key == "" and not node["data"]: raise RuntimeError( f'Entry "{key_display}" is an invalid collection in mammos yaml v2: ' 'key "data" does not contain anything.' ) collection = EntityCollection(description=description) for name, item in node["data"].items(): child_path = _join_path(key, name) if isinstance(item, Mapping): item_keys = set(item.keys()) leaf_hint_keys = {"ontology_label", "ontology_iri", "unit", "value"} # Route ambiguous mappings with collection keys to collection parsing, # unless they clearly belong to an entity-like schema. if item_keys & leaf_hint_keys: collection[name] = _parse_yaml_leaf_v2(item, child_path) elif "description" in item_keys or "data" in item_keys: collection[name] = _parse_yaml_collection_v2(item, child_path) else: collection[name] = _parse_yaml_leaf_v2(item, child_path) else: collection[name] = _parse_yaml_leaf_v2(item, child_path) return collection def _join_path(parent_path: str, segment: str) -> str: # In principle there could be name clashes if we have two nested collections: # - outer["dotted.inner"].Ms # - outer["dotted"]["inner"].Ms # In practice this will rarely (never) happen so we ignore it. if parent_path: return f"{parent_path}.{segment}" return segment
[docs] def from_hdf5( element: h5py.File | h5py.Group | h5py.Dataset | str | os.PathLike, decode_bytes: bool = True, ) -> ( mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike | mammos_entity.EntityCollection ): """Read HDF5 file, group or dataset and convert to Entity or EntityCollection. Datasets are converted to :py:class:`~mammos_entity.Entity`, :py:class:`~mammos_units.Quantity`, or a numpy array or other builtin datatype depending on their associated metadata and shape. Groups are converted to :py:class:`~mammos_entity.EntityCollection`. Arbitrary nesting of groups is supported and produces nested collections. Args: element: If it is a `str` or `PathLike` the entire file is read from disk. If it is an open HDF5 `File`, `Group` or `Dataset` only that part of the file is read. decode_bytes: If ``True`` data of all datasets of type object is converted to strings (if scalar) or numpy arrays of strings (if vector). If ``False`` the bytes object (or array of bytes objects) is returned. Returns: All data in the given HDF5 file/group/dataset as (nested) EntityCollection and/or entity-like object. .. seealso:: :py:func:`mammos_entity.Entity.to_hdf5` :py:func:`mammos_entity.EntityCollection.to_hdf5` """ if isinstance(element, str | os.PathLike): with h5py.File(element) as f: return from_hdf5(f, decode_bytes) elif isinstance(element, h5py.File | h5py.Group): collection = EntityCollection(description=element.attrs.get("description", "")) for name, sub in element.items(): collection[name] = from_hdf5(sub) return collection elif "ontology_label" in element.attrs: return Entity( ontology_label=element.attrs["ontology_label"], value=element[()], unit=element.attrs["unit"], description=element.attrs["description"], ) elif "unit" in element.attrs: return u.Quantity(element[()], element.attrs["unit"]) else: if element.dtype == "object" and decode_bytes: element = element.asstr() data = element[()] return data