Source code for mammos_entity._entity_collection

"""EntityCollection class."""

from __future__ import annotations

import copy
import csv
import os
import textwrap
from typing import TYPE_CHECKING

import h5py
import mammos_units as u
import numpy as np
import pandas as pd
import yaml

import mammos_entity as me

if TYPE_CHECKING:
    import collections.abc

    import mammos_units
    import numpy.typing

    import mammos_entity
    import mammos_entity.typing


class EntityCollection:
    """Container class storing entity-like objects.

    An :py:class:`~mammos_entity.EntityCollection` groups entities together. It can
    store :py:class:`~mammos_entity.Entity`, :py:class:`~mammos_units.Quantity` and
    other objects (lists, tuples, arrays, etc.). We refer to all of these as
    `entity-like`.

    Common use cases are reading/writing files and conversion to and from
    :py:class:`pandas.DataFrame`.

    :py:class:`EntityCollection` provides access to entities via both attributes and
    a dictionary-like interface. Access via attribute is only possible if the entity
    name is a valid Python name and no property/method of EntityCollection shadows
    the entity. The dictionary interface does not have these limitations.

    Entities can have arbitrary string names, with the exception that
    ``description`` is not allowed. Entities passed as keyword arguments when
    creating the collection must have valid Python names.

    Examples:
    >>> import mammos_entity as me

    When creating a new collection entities can be passed as keyword arguments:

    >>> collection = me.EntityCollection("A description", Ms=me.Ms(), T=me.T())
    >>> collection
    EntityCollection(
        description='A description',
        Ms=Entity(ontology_label='SpontaneousMagnetization', value=np.float64(0.0), unit='A / m'),
        T=Entity(ontology_label='ThermodynamicTemperature', value=np.float64(0.0), unit='K'),
    )

    Entities in the collection can be accessed either via attribute or a
    dictionary-like interface:

    >>> collection.Ms
    Entity(ontology_label='SpontaneousMagnetization', value=np.float64(0.0), unit='A / m')
    >>> collection["T"]
    Entity(ontology_label='ThermodynamicTemperature', value=np.float64(0.0), unit='K')

    Additional elements can be added using both interfaces ("private" elements, i.e.
    entity names starting with an underscore can only be set/retrieved using the
    dictionary-like interface):

    >>> collection.A = [1, 2, 3]
    >>> collection["B"] = me.B([4, 5, 6])

    Checking if an entity name exists in a collection can be done with:

    >>> "B" in collection
    True
    >>> "Js" in collection
    False

    Elements can be removed using:

    >>> del collection.T
    >>> del collection.B

    The collection is iterable, elements are tuples ``(name, entity-like)``:

    >>> list(collection)
    [('Ms', Entity(ontology_label='SpontaneousMagnetization', value=np.float64(0.0), unit='A / m')), ('A', [1, 2, 3])]
    """  # noqa: E501

    def __init__(
        self,
        description: str = "",
        **kwargs: mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike,
    ):
        """Initialize EntityCollection, keywords become attributes of the class.

        Args:
            description: Information string to assign to ``description`` attribute.
            **kwargs : entities to be stored in the collection.
        """
        # ``description`` is a property, so this assignment runs the type check in
        # the setter; ``_entities`` is private and stored as a normal attribute.
        self.description = description
        self._entities = kwargs

    def __getitem__(
        self, key: str
    ) -> mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike:
        """Return the element stored under ``key`` (raises ``KeyError`` if absent)."""
        return self._entities[key]

    def __setitem__(
        self,
        key: str,
        value: mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike,
    ):
        """Store ``value`` under ``key``.

        Raises:
            TypeError: If ``key`` is not a string.
            KeyError: If ``key`` is ``"description"``, which is reserved for the
                collection description.
        """
        if not isinstance(key, str):
            raise TypeError(
                f"Name must be a string, received {key!r} ({type(key).__name__})."
            )
        if key == "description":
            raise KeyError("'description' is not allowed as entity name.")
        self._entities[key] = value

    def __delitem__(self, key: str) -> None:
        """Remove the element stored under ``key`` (raises ``KeyError`` if absent)."""
        del self._entities[key]

    def __iter__(
        self,
    ) -> collections.abc.Iterator[
        tuple[
            str, mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike
        ]
    ]:
        """Iterate over ``(name, entity-like)`` tuples in insertion order."""
        yield from self._entities.items()

    def __len__(self) -> int:
        """Return the number of elements in the collection."""
        return len(self._entities)

    def __contains__(self, key: str) -> bool:
        """Check whether an element with name ``key`` exists."""
        return key in self._entities

    def __setattr__(
        self,
        name: str,
        value: mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike,
    ) -> None:
        """Add new elements to the collection.

        Public names (no leading underscore) become elements of the collection.
        Private names (at least one leading underscore) are added to the instance
        normally.

        If a property/method with the same name exists it takes precedence and the
        entity will not be added to the collection. Instead, the property assignment
        is called/the method is overwritten. In such cases add the entity via the
        dict interface ``collection["name"] = value``.
        """
        if name.startswith("_") or hasattr(self.__class__, name):
            object.__setattr__(self, name, value)
        else:
            self[name] = value

    def __getattr__(
        self, name: str
    ) -> mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike:
        """Access entities via dot notation.

        Allow access to entities using ``collection.name`` as a short-hand for
        ``collection["name"]``.

        If a property/method with the same name exists it gets precedence. In such
        cases access to the entity is only possible via the dict interface.

        Raises:
            AttributeError: If no element called ``name`` exists.
        """
        # ``__getattr__`` only runs after normal lookup failed. If the storage
        # itself does not exist yet (instance created via ``__new__``, e.g. while
        # unpickling), ``self[name]`` would look up ``self._entities`` and re-enter
        # this method forever, surfacing as RecursionError instead of
        # AttributeError.
        if "_entities" not in self.__dict__:
            raise AttributeError(name)
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def __delattr__(self, name: str) -> None:
        """Delete element from collection.

        If an entity with ``name`` is in the collection it is removed.

        If a method with the same name exists, it gets precedence. In such cases
        delete the entity by using the dict interface ``del collection[name]``.

        Raises:
            AttributeError: If neither an attribute nor an element ``name`` exists.
        """
        if name.startswith("_") or hasattr(self.__class__, name):
            object.__delattr__(self, name)
        elif name in self:
            del self[name]
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __dir__(self) -> list[str]:
        """List regular attributes together with the names of all elements."""
        return sorted([*super().__dir__(), *self._entities])

    def __copy__(self) -> EntityCollection:
        """Shallow copy of entities."""
        # Fill the internal dict directly instead of passing ``**self._entities``:
        # element names are arbitrary strings and a name such as ``"self"`` would
        # collide with a parameter of ``__init__``.
        duplicate = self.__class__(description=self.description)
        duplicate._entities.update(self._entities)
        return duplicate

    def __deepcopy__(self, memo: dict) -> EntityCollection:
        """Deep copy of entities."""
        duplicate = self.__class__(description=self.description)
        # Register the copy before recursing so that elements referring back to
        # this collection resolve to the copy instead of recursing endlessly.
        memo[id(self)] = duplicate
        for name, entity in self._entities.items():
            duplicate._entities[name] = copy.deepcopy(entity, memo)
        return duplicate

    @property
    def description(self) -> str:
        """Additional description of the entity collection.

        The description is a string containing any information relevant to the
        entity collection. This can include, e.g., whether it is a set of
        experimental or simulation quantities or outline the overall workflow.
        """
        return self._description

    @description.setter
    def description(self, value) -> None:
        if isinstance(value, str):
            self._description = value
        else:
            raise ValueError(
                f"Description must be a string. "
                f"Received value: {value} of type: {type(value)}."
            )

    def __repr__(self) -> str:
        """Show container elements."""
        args = f"description={self.description!r},\n"
        args += "\n".join(f"{key}={val!r}," for key, val in self._entities.items())
        return f"{self.__class__.__name__}(\n{textwrap.indent(args, ' ' * 4)}\n)"

    def to_dataframe(self, include_units: bool = False) -> pd.DataFrame:
        """Convert values to dataframe.

        Each element becomes one column. Elements exposing a ``value`` attribute
        contribute that value, all other elements are used as they are.

        Args:
            include_units: If true, include units in the dataframe column names.

        Raises:
            ValueError: If the collection contains nested collections.
        """
        if any(isinstance(element, EntityCollection) for _name, element in self):
            raise ValueError("Nested collections cannot be converted to dataframe.")

        def column_label(name: str, element) -> str:
            """Return the column name, with " (unit)" appended when requested."""
            if not include_units:
                return f"{name}"
            # Inspect the element itself. Looking it up via ``getattr(self, name)``
            # returns the wrong object when the name is shadowed by a method or
            # property of the class or starts with an underscore.
            unit = getattr(element, "unit", None)
            if unit and str(unit):
                return f"{name} ({unit!s})"
            return f"{name}"

        return pd.DataFrame(
            {
                column_label(name, element): np.atleast_1d(
                    getattr(element, "value", element)
                )
                for name, element in self
            }
        )

    def metadata(self) -> dict[str, str | dict[str, str]]:
        """Get entity metadata as dictionary.

        This method creates a dictionary containing metadata for all entities in the
        collection. Keys are names of the (entities) attributes of the collection,
        values are dictionaries with:

        - keys ``ontology_label``, ``unit`` and ``description`` if the attribute is
          an entity
        - key ``unit`` if the attribute is a quantity
        - an empty dictionary otherwise

        In addition there is one key-value pair ``description`` for the collection
        description.

        Example:
        >>> import mammos_entity as me
        >>> import mammos_units as u
        >>> col = me.EntityCollection("The description", Tc=me.Tc(), x=1 * u.m, a=0)
        >>> col.metadata()
        {'description': 'The description', 'Tc': {'ontology_label': 'CurieTemperature', 'unit': 'K', 'description': ''}, 'x': {'unit': 'm'}, 'a': {}}
        """  # noqa: E501
        result = {"description": self.description}
        for name, entity_like in self._entities.items():
            element = {}
            if isinstance(entity_like, me.Entity):
                element["ontology_label"] = entity_like.ontology_label
                element["unit"] = str(entity_like.unit)
                element["description"] = entity_like.description
            elif isinstance(entity_like, u.Quantity):
                element["unit"] = str(entity_like.unit)
            result[name] = element
        return result

    @classmethod
    def from_dataframe(
        cls, dataframe: pd.DataFrame, metadata: dict[str, dict]
    ) -> mammos_entity.EntityCollection:
        """Create EntityCollection from dataframe and metadata.

        The EntityCollection is created by combining metadata with data from the
        dataframe matching key/column names. The available metadata determines
        whether an element becomes an :py:class:`~mammos_entity.Entity`, a
        :py:class:`mammos_units.Quantity` or a numpy array.

        All column names in the `dataframe` must also exist as keys in `metadata`
        and vice versa. In addition `metadata` can have a key ``description``
        containing a description for the collection.

        Columns with exactly one row are stored as scalars.

        Args:
            dataframe: A dataframe containing the values for the individual entities.
            metadata: A dictionary with the structure similar to the one defined in
                :py:func:`~EntityCollection.metadata`. The keys ``unit`` and
                ``description`` for an :py:class:`~mammos_entity.Entity` are however
                optional. If not present, default units from the ontology and an
                empty description are used.

        Raises:
            ValueError: If a dataframe column has no entry in `metadata` or an entry
                in `metadata` has no matching dataframe column.
        """
        metadata = copy.deepcopy(metadata)  # do not modify the user's metadata dict
        description = metadata.pop("description", "")

        def listing(names) -> str:
            # Labels may be non-strings (e.g. integer column labels); sort for a
            # reproducible message.
            return ", ".join(sorted(map(str, names)))

        if missing_keys := set(dataframe.columns) - set(metadata):
            raise ValueError(
                f"Entity_Metadata is missing for columns: {listing(missing_keys)}"
            )
        if missing_columns := set(metadata) - set(dataframe.columns):
            raise ValueError(
                "Dataframe columns are missing for metadata keys: "
                f"{listing(missing_columns)}"
            )

        collection = cls(description=description)
        for name, info in metadata.items():
            value = dataframe[name].to_numpy()
            if len(value) == 1:
                value = value[0]
            if "ontology_label" in info:
                elem = me.Entity(
                    ontology_label=info["ontology_label"],
                    value=value,
                    unit=info.get("unit"),
                    description=info.get("description", ""),
                )
            elif "unit" in info:
                elem = u.Quantity(
                    value=value,
                    unit=info["unit"],
                )
            else:
                elem = value
            # Use the item interface instead of ``cls(**entities)`` so that names
            # colliding with ``__init__`` parameters (e.g. ``"self"``) work.
            collection[name] = elem
        return collection

    def to_csv(self, filename: str | os.PathLike) -> None:
        r"""Write collection to CSV file.

        CSV files contain data in normal CSV format and additional metadata lines at
        the top of the file. Some of the lines are commented with ``#``. This
        structure is fixed and additional comment lines or inline comments in the
        data table are not allowed. The lines are, in order:

        - (commented) the file version in the form ``mammos csv v<VERSION>``
          (matching regex v\d+)
        - (commented, optional) a description of the file, appearing delimited by
          dashed lines
        - (optional, only for entities) the preferred ontology label
        - (optional, only for entities) a description string
        - (optional, only for entities) the ontology IRI
        - (optional, for entities and quantities) units
        - the short labels used to refer to individual columns when working with the
          data, e.g. in a :py:class:`pandas.DataFrame` (omitting spaces in this
          string is advisable; ideally this string is the short ontology label)
        - all remaining lines contain data.

        Elements in a line are separated by a comma without any surrounding
        whitespace. A trailing comma is not permitted. Line continuation is OS
        dependent (\r\n on Windows, \n on Unix).

        In columns without ontology the lines containing labels, IRIs, and
        description are empty. Similarly, columns without units (with or without
        ontology entry) have empty units line. For any column, the description line
        can be empty.

        Only entities can store descriptions, i.e., if the ontology-related lines are
        empty, the description string will not be read.

        .. version-added:: v2
           The optional description of the file.

        .. version-added:: v3
           Additional description metadata row containing a description for each
           column.

        .. version-changed:: v3
           Ontology labels, entity descriptions, IRIs, and units are no longer
           commented.

        Args:
            filename: Name of the generated file. An existing file with the same name
                is overwritten without notice.

        Raises:
            ValueError: If the entities are not tabular. CSV files can only be
                written for collections in which all entities are either scalar or
                one-dimensional with the same length.
            ValueError: If elements of the collection are of type
                :py:class:`~mammos_entity.EntityCollection` (nested collections are
                not supported in CSV) or if the collection is empty.

        Example:
        Here is an example with five columns:

        - an index with no units or ontology label
        - the entity spontaneous magnetization with an entry in the ontology and a
          description
        - a made-up quantity alpha with a unit but no ontology label
        - demagnetizing factor with an ontology entry but no unit
        - a column `comment` containing a string comment without units or ontology
          label

        The file has a description reading "Test data".

        >>> from pathlib import Path
        >>> import mammos_entity as me
        >>> import mammos_units as u
        >>> collection = me.EntityCollection(
        ...     description="Test data",
        ...     index=[0, 1, 2],
        ...     Ms=me.Entity("SpontaneousMagnetization", [1e2, 1e2, 1e2], "kA/m", description="Magnetization at 0 Kelvin"),
        ...     alpha=[1.2, 3.4, 5.6] * u.s**2,
        ...     DemagnetizingFactor=me.Entity("DemagnetizingFactor", [1, 0.5, 0.5]),
        ...     comment=[
        ...         "Comment in the first row",
        ...         "Comment in the second row",
        ...         "Comment in the third row",
        ...     ],
        ... )
        >>> collection.to_csv("example.csv")

        The new file has the following content:

        >>> print(Path("example.csv").read_text())
        # mammos csv v3
        #----------------------------------------
        # Test data
        #----------------------------------------
        ,SpontaneousMagnetization,,DemagnetizingFactor,
        ,Magnetization at 0 Kelvin,,,
        ,https://w3id.org/emmo/domain/magnetic-materials#EMMO_032731f8-874d-5efb-9c9d-6dafaa17ef25,,https://w3id.org/emmo/domain/magnetic-materials#EMMO_0f2b5cc9-d00a-5030-8448-99ba6b7dfd1e,
        ,kA / m,s2,,
        index,Ms,alpha,DemagnetizingFactor,comment
        0,100.0,1.2,1.0,Comment in the first row
        1,100.0,3.4,0.5,Comment in the second row
        2,100.0,5.6,0.5,Comment in the third row
        <BLANKLINE>

        Finally, remove the file.

        >>> Path("example.csv").unlink()
        """  # noqa: E501
        if any(isinstance(element, EntityCollection) for _name, element in self):
            raise ValueError("Nested collections cannot be saved to CSV.")
        if len(self) == 0:
            raise ValueError("Empty collections cannot be saved to CSV.")

        # convert data first because that will catch incompatible shape
        dataframe = self.to_dataframe()

        # Header rows written in CSV format; elements lacking an attribute
        # contribute an empty cell.
        metadata_rows = [
            [getattr(elem, "ontology_label", "") for _, elem in self],
            [getattr(elem, "description", "") for _, elem in self],
            [getattr(elem, "ontology_iri", "") for _, elem in self],
            [str(getattr(elem, "unit", "")) for _, elem in self],
        ]

        # newline="" disables newline translation; every line ending is written
        # explicitly as os.linesep.
        with open(filename, "w", newline="") as csvfile:
            csvfile.write(f"# mammos csv v3{os.linesep}")
            if self.description:
                csvfile.write("#" + "-" * 40 + os.linesep)
                for line in self.description.splitlines():
                    csvfile.write(f"# {line}{os.linesep}")
                csvfile.write("#" + "-" * 40 + os.linesep)
            writer = csv.writer(
                csvfile,
                delimiter=",",
                quoting=csv.QUOTE_MINIMAL,
                lineterminator=os.linesep,
            )
            writer.writerows(metadata_rows)
            dataframe.to_csv(csvfile, index=False)

    def to_yaml(self, filename: str | os.PathLike) -> None:
        r"""Write collection to YAML file.

        MaMMoS YAML files have the following format:

        - one commented line at the top of the file containing the mammos format
          version in the form `# mammos yaml v<version-number>`.
        - a mapping with three top-level keys ``metadata``, ``description`` and
          ``data``
        - ``metadata`` is currently unused and should be empty
        - the ``description`` key contains a (multi-line) string with arbitrary
          content describing the top-level collection
        - ``data`` contains one key per element in the collection. Each entry is
          either an entity-like entry or a nested collection node.

        Collection nodes are recursive and have two keys ``description`` and
        ``data``:

        - ``description``: a (multi-line) string with arbitrary content
        - ``data``: mapping from entry names to entity-like entries or nested
          collection nodes

        Entity-like entries have the following keys:

        - For :py:class:`~mammos_entity.Entity`:

          - ``ontology_label``: label in the ontology
          - ``description``: description string
          - ``ontology_iri``: IRI of the entity
          - ``unit``: unit of the entity (``""`` for dimensionless)
          - ``value``: value of the data

        - For :py:class:`~mammos_units.Quantity`:

          - ``unit``: unit of the quantity
          - ``value``: value of the data

        - For any other value:

          - ``value``: value of the data

        .. version-added:: v2
           The ``description`` key for each object.

        .. version-changed:: v2
           - The version of the file is now stored in the first commented line,
             previously it was stored in ``metadata:description``.
           - The top-level collection description is stored under ``description``
             (next to ``metadata`` and ``data``). Previously it was stored in
             ``metadata:description``.
           - Non-entity entries no longer store null-valued ontology keys.
           - Nested collections are supported recursively.

        Args:
            filename: Name of the generated file. An existing file with the same name
                is overwritten without notice.

        Raises:
            ValueError: If the top-level collection is empty.

        Example:
        Here is an example with six entries:

        - an index with no units or ontology label
        - the entity spontaneous magnetization with an entry in the ontology and a
          description
        - a made-up quantity alpha with a unit but no ontology label
        - demagnetizing factor with an ontology entry but no unit
        - a column `comment` containing a string comment without units or ontology
          label
        - an element Tc with only a single value

        The file has a description reading "Test data".

        >>> from pathlib import Path
        >>> import mammos_entity as me
        >>> import mammos_units as u
        >>> collection = me.EntityCollection(
        ...     description="Test data",
        ...     index=[0, 1, 2],
        ...     Ms=me.Entity("SpontaneousMagnetization", [1e2, 1e2, 1e2], "kA/m", description="Magnetization at 0 Kelvin"),
        ...     alpha=[1.2, 3.4, 5.6] * u.s**2,
        ...     DemagnetizingFactor=me.Entity("DemagnetizingFactor", [1, 0.5, 0.5]),
        ...     comment=[
        ...         "Comment in the first row",
        ...         "Comment in the second row",
        ...         "Comment in the third row",
        ...     ],
        ...     Tc=me.Tc(300, "K"),
        ... )
        >>> collection.to_yaml("example.yaml")

        The new file has the following content:

        >>> print(Path("example.yaml").read_text())
        # mammos yaml v2
        metadata: null
        description: Test data
        data:
          index:
            value: [0, 1, 2]
          Ms:
            ontology_label: SpontaneousMagnetization
            description: Magnetization at 0 Kelvin
            ontology_iri: https://w3id.org/emmo/domain/magnetic-materials#EMMO_032731f8-874d-5efb-9c9d-6dafaa17ef25
            unit: kA / m
            value: [100.0, 100.0, 100.0]
          alpha:
            unit: s2
            value: [1.2, 3.4, 5.6]
          DemagnetizingFactor:
            ontology_label: DemagnetizingFactor
            description: ''
            ontology_iri: https://w3id.org/emmo/domain/magnetic-materials#EMMO_0f2b5cc9-d00a-5030-8448-99ba6b7dfd1e
            unit: ''
            value: [1.0, 0.5, 0.5]
          comment:
            value: [Comment in the first row, Comment in the second row, Comment in the third
              row]
          Tc:
            ontology_label: CurieTemperature
            description: ''
            ontology_iri: https://w3id.org/emmo#EMMO_6b5af5a8_a2d8_4353_a1d6_54c9f778343d
            unit: K
            value: 300.0
        <BLANKLINE>
        >>> Path("example.yaml").unlink()

        Here is a second example with one outer and one inner collection:

        >>> properties = me.EntityCollection(
        ...     description="material properties",
        ...     Ms=me.Ms(1.3e3, "kA/m"),
        ...     Tc=me.Tc(1043, "K"),
        ... )
        >>> measurement = me.EntityCollection(
        ...     description="measurement with device X",
        ...     sample=properties,
        ...     T=me.T(300, "K", description="Measurement conditions"),
        ...     H=me.H([0, 50, 100], "kA/m"),
        ...     M=me.M([100, 300, 500], "kA/m"),
        ... )
        >>> measurement.to_yaml("nested_example.yaml")
        >>> print(Path("nested_example.yaml").read_text())
        # mammos yaml v2
        metadata: null
        description: measurement with device X
        data:
          sample:
            description: material properties
            data:
              Ms:
                ontology_label: SpontaneousMagnetization
                description: ''
                ontology_iri: https://w3id.org/emmo/domain/magnetic-materials#EMMO_032731f8-874d-5efb-9c9d-6dafaa17ef25
                unit: kA / m
                value: 1300.0
              Tc:
                ontology_label: CurieTemperature
                description: ''
                ontology_iri: https://w3id.org/emmo#EMMO_6b5af5a8_a2d8_4353_a1d6_54c9f778343d
                unit: K
                value: 1043.0
          T:
            ontology_label: ThermodynamicTemperature
            description: Measurement conditions
            ontology_iri: https://w3id.org/emmo#EMMO_affe07e4_e9bc_4852_86c6_69e26182a17f
            unit: K
            value: 300.0
          H:
            ontology_label: ExternalMagneticField
            description: ''
            ontology_iri: https://w3id.org/emmo/domain/magnetic-materials#EMMO_da08f0d3-fe19-58bc-8fb6-ecc8992d5eb3
            unit: kA / m
            value: [0.0, 50.0, 100.0]
          M:
            ontology_label: Magnetization
            description: ''
            ontology_iri: https://w3id.org/emmo#EMMO_b23e7251_a488_4732_8268_027ad76d7e37
            unit: kA / m
            value: [100.0, 300.0, 500.0]
        <BLANKLINE>
        >>> Path("nested_example.yaml").unlink()
        """  # noqa: E501
        if len(self) == 0:
            raise ValueError("Empty collections cannot be saved to YAML.")

        def _serialize_entity_like(
            element: mammos_entity.Entity
            | mammos_units.Quantity
            | numpy.typing.ArrayLike,
        ) -> dict:
            """Convert one non-collection element into a plain mapping."""
            if isinstance(element, me.Entity):
                return {
                    "ontology_label": element.ontology_label,
                    "description": element.description,
                    "ontology_iri": element.ontology_iri,
                    "unit": str(element.unit),
                    "value": element.value.tolist(),
                }
            elif isinstance(element, u.Quantity):
                return {
                    "unit": str(element.unit),
                    "value": element.value.tolist(),
                }
            else:
                return {"value": np.asanyarray(element).tolist()}

        def _serialize_collection(collection: EntityCollection) -> dict:
            """Convert a collection, including nested collections, recursively."""
            result = {"description": collection.description, "data": {}}
            for name, element in collection:
                if isinstance(element, EntityCollection):
                    result["data"][name] = _serialize_collection(element)
                else:
                    result["data"][name] = _serialize_entity_like(element)
            return result

        entity_dict = {"metadata": None, **_serialize_collection(self)}

        # custom dumper to change style of lists, tuples and multi-line strings
        class _Dumper(yaml.SafeDumper):
            pass

        def _represent_sequence(dumper, value):
            """Display sequence with flow style.

            A list [1, 2, 3] for key `value` is written to file as::

                value: [1, 2, 3]

            instead of::

                value:
                - 1
                - 2
                - 3
            """
            return dumper.represent_sequence(
                "tag:yaml.org,2002:seq", value, flow_style=True
            )

        def _represent_string(dumper, value):
            """Control style of single-line and multi-line strings.

            Single-line strings are written as::

                some_key: Hello

            Multi-line strings are written as::

                some_key: |-
                  I am multi-line,
                  without a trailing new line.
            """
            style = "|" if "\n" in value else ""
            return dumper.represent_scalar("tag:yaml.org,2002:str", value, style=style)

        _Dumper.add_representer(list, _represent_sequence)
        _Dumper.add_representer(tuple, _represent_sequence)
        _Dumper.add_representer(str, _represent_string)

        with open(filename, "w") as f:
            f.write("# mammos yaml v2\n")
            yaml.dump(
                entity_dict,
                stream=f,
                Dumper=_Dumper,
                default_flow_style=False,
                sort_keys=False,
            )

    def to_hdf5(
        self, base: h5py.File | h5py.Group | str | os.PathLike, name: str | None = None
    ) -> h5py.Group | None:
        """Write a collection to an HDF5 group.

        Entities of the collection become datasets in the group. The collection
        description is added to the group attributes.

        Args:
            base: If it is an open HDF5 file or a group in an HDF5 file, data will be
                added to it as new group. If it is a str or PathLike a new HDF5 file
                with the given name will be created. If a file with that name exists
                already, it will be overwritten without notice.
            name: Name for the newly created group. If an element with that name
                exists already in `base` the function will fail. If `name` is
                ``None`` entities of the collection will be added directly to `base`
                and the collection description will be added to `base` attributes.

        Returns:
            If `base` is an open `File` or `Group` the newly created group. If `base`
            is a file name nothing is returned (because the file created internally
            will be closed before the function returns).
        """
        return _to_hdf5(self, base, name)
def _to_hdf5(
    data: mammos_entity.Entity
    | mammos_units.Quantity
    | numpy.typing.ArrayLike
    | mammos_entity.EntityCollection,
    base: h5py.File | h5py.Group | str | os.PathLike,
    name: str | None,
    record_mammos_entity_version: bool = True,
) -> h5py.Dataset | h5py.Group | None:
    """Internal implementation with additional options required for recursion.

    Collections become groups (or are written directly into `base` if `name` is
    ``None``), all other elements become datasets.

    Args:
        data: <see public method>
        base: <see public method>
        name: <see public method>
        record_mammos_entity_version: add mammos_entity version to group/dataset
            attributes.

    Returns:
        The created group or dataset, or ``None`` if `base` is a file name (the
        file is closed before returning).

    Raises:
        ValueError: If `data` is not a collection and `name` is ``None``.
    """
    if isinstance(base, str | os.PathLike):
        with h5py.File(base, "w") as f:
            # Forward the flag; otherwise a caller's explicit ``False`` would be
            # silently replaced by the default when writing to a new file.
            _to_hdf5(
                data,
                f,
                name,
                record_mammos_entity_version=record_mammos_entity_version,
            )
        return None

    if isinstance(data, EntityCollection):
        group = base.create_group(name, track_order=True) if name is not None else base
        group.attrs["description"] = data.description
        if record_mammos_entity_version:
            group.attrs["mammos_entity_version"] = me.__version__
        # The version is recorded once on the outermost object only, so children
        # are always written with the flag switched off.
        for child_name, entity_like in data:
            _to_hdf5(
                entity_like, group, child_name, record_mammos_entity_version=False
            )
        return group

    if name is None:
        raise ValueError("'name' must not be None when 'data' is entity-like.")

    if isinstance(data, me.Entity):
        dset = data._to_hdf5(base, name, record_mammos_entity_version=False)
    elif isinstance(data, u.Quantity):
        dset = base.create_dataset(name, data=data.value)
        dset.attrs["unit"] = str(data.unit)
    else:
        dset = base.create_dataset(name, data=data)

    if record_mammos_entity_version:
        dset.attrs["mammos_entity_version"] = me.__version__
    return dset