"""EntityCollection class."""
from __future__ import annotations
import copy
import csv
import os
import textwrap
from typing import TYPE_CHECKING
import h5py
import mammos_units as u
import numpy as np
import pandas as pd
import yaml
import mammos_entity as me
if TYPE_CHECKING:
import collections.abc
import mammos_units
import numpy.typing
import mammos_entity
import mammos_entity.typing
[docs]
class EntityCollection:
    """Container class storing entity-like objects.

    An :py:class:`~mammos_entity.EntityCollection` groups entities together. It can
    store :py:class:`~mammos_entity.Entity`, :py:class:`~mammos_units.Quantity` and
    other objects (lists, tuples, arrays, etc.). We refer to all of these as
    `entity-like`.

    Common use cases are reading/writing files and conversion to and from
    :py:class:`pandas.DataFrame`.

    :py:class:`EntityCollection` provides access to entities via both attributes and a
    dictionary-like interface. Access via attribute is only possible if the entity name
    is a valid Python name and no property/method of EntityCollection shadows the
    entity. The dictionary interface does not have these limitations.

    Entities can have arbitrary string names, with the exception that
    ``description`` is not allowed. Entities passed as keyword arguments when creating
    the collection must have valid Python names.

    Examples:
        >>> import mammos_entity as me

        When creating a new collection entities can be passed as keyword arguments:

        >>> collection = me.EntityCollection("A description", Ms=me.Ms(), T=me.T())
        >>> collection
        EntityCollection(
            description='A description',
            Ms=Entity(ontology_label='SpontaneousMagnetization', value=np.float64(0.0), unit='A / m'),
            T=Entity(ontology_label='ThermodynamicTemperature', value=np.float64(0.0), unit='K'),
        )

        Entities in the collection can be accessed either via attribute or a
        dictionary-like interface:

        >>> collection.Ms
        Entity(ontology_label='SpontaneousMagnetization', value=np.float64(0.0), unit='A / m')
        >>> collection["T"]
        Entity(ontology_label='ThermodynamicTemperature', value=np.float64(0.0), unit='K')

        Additional elements can be added using both interfaces ("private" elements, i.e.
        entity names starting with an underscore can only be set/retrieved using the
        dictionary-like interface):

        >>> collection.A = [1, 2, 3]
        >>> collection["B"] = me.B([4, 5, 6])

        Checking if an entity name exists in a collection can be done with:

        >>> "B" in collection
        True
        >>> "Js" in collection
        False

        Elements can be removed using:

        >>> del collection.T
        >>> del collection.B

        The collection is iterable, elements are tuples ``(name, entity-like)``:

        >>> list(collection)
        [('Ms', Entity(ontology_label='SpontaneousMagnetization', value=np.float64(0.0), unit='A / m')), ('A', [1, 2, 3])]

    """  # noqa: E501

    def __init__(
        self,
        description: str = "",
        **kwargs: mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike,
    ):
        """Initialize EntityCollection, keywords become elements of the collection.

        Args:
            description: Information string to assign to ``description`` attribute.
            **kwargs : entities to be stored in the collection.

        Raises:
            ValueError: If `description` is not a string.
        """
        self.description = description
        # The keyword dictionary is a fresh dict owned by this call; it is used
        # directly as the internal name -> entity-like mapping.
        self._entities = kwargs

    def __getitem__(
        self, key: str
    ) -> mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike:
        """Return the element stored under ``key`` (``KeyError`` if missing)."""
        return self._entities[key]

    def __setitem__(
        self,
        key: str,
        value: mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike,
    ):
        """Store ``value`` under ``key``.

        Raises:
            TypeError: If ``key`` is not a string.
            KeyError: If ``key`` is ``"description"``, which is reserved for the
                collection description.
        """
        if not isinstance(key, str):
            raise TypeError(
                f"Name must be a string, received {key!r} ({type(key).__name__})."
            )
        if key == "description":
            raise KeyError("'description' is not allowed as entity name.")
        self._entities[key] = value

    def __delitem__(self, key: str) -> None:
        """Remove the element stored under ``key`` (``KeyError`` if missing)."""
        del self._entities[key]

    def __iter__(
        self,
    ) -> collections.abc.Iterator[
        tuple[
            str, mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike
        ]
    ]:
        """Iterate over ``(name, entity-like)`` tuples in insertion order."""
        yield from self._entities.items()

    def __len__(self) -> int:
        """Return the number of elements in the collection."""
        return len(self._entities)

    def __contains__(self, key: str) -> bool:
        """Return whether an element named ``key`` is stored in the collection."""
        return key in self._entities

    def __setattr__(
        self,
        name: str,
        value: mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike,
    ) -> None:
        """Add new elements to the collection via dot notation.

        Public names (no leading underscore) are stored as elements of the
        collection. Private names (at least one leading underscore) are set as normal
        instance attributes.

        If a property/method with the same name exists on the class it takes
        precedence and the entity will not be added to the collection. Instead, the
        property assignment is called/the method is overwritten. In such cases add
        the entity via the dictionary interface ``collection["name"] = value``.
        """
        if name.startswith("_") or hasattr(self.__class__, name):
            object.__setattr__(self, name, value)
        else:
            self[name] = value

    def __getattr__(
        self, name: str
    ) -> mammos_entity.Entity | mammos_units.Quantity | numpy.typing.ArrayLike:
        """Access entities via dot notation.

        Allow access to entities using ``collection.name`` as a short-hand for
        ``collection["name"]``. This method is only called when normal attribute
        lookup fails, so a property/method with the same name gets precedence. In
        such cases access to the entity is only possible via the dictionary
        interface.

        Raises:
            AttributeError: If no element called ``name`` exists.
        """
        if name == "_entities":
            # The internal dictionary does not exist yet, e.g. on an instance that
            # was created with ``__new__`` while unpickling. Looking the name up via
            # ``self[name]`` would need ``self._entities`` again and recurse forever.
            raise AttributeError(name)
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def __delattr__(self, name: str) -> None:
        """Delete element from collection.

        If an entity with ``name`` is stored in the collection it is removed. If a
        property/method with the same name exists on the class, or the name is
        private (leading underscore), normal attribute deletion is performed instead.
        In such cases delete the entity via the dictionary interface
        ``del collection["name"]``.

        Raises:
            AttributeError: If neither an attribute nor an element called ``name``
                exists.
        """
        if name.startswith("_") or hasattr(self.__class__, name):
            object.__delattr__(self, name)
        elif name in self:
            del self[name]
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __dir__(self) -> list[str]:
        """List regular attributes together with the names of all elements."""
        names = super().__dir__()
        names.extend(self._entities)
        return sorted(names)

    def __copy__(self):
        """Shallow copy of entities."""
        duplicate = self.__class__(description=self.description)
        # Fill the mapping directly instead of re-passing the entities as keyword
        # arguments: names such as "self" would clash with constructor parameters.
        duplicate._entities.update(self._entities)
        return duplicate

    def __deepcopy__(self, memo):
        """Deep copy of entities."""
        duplicate = self.__class__(description=self.description)
        # Register the copy before descending so that objects referring back to this
        # collection resolve to the copy instead of being copied again.
        memo[id(self)] = duplicate
        for name, entity in self._entities.items():
            duplicate._entities[name] = copy.deepcopy(entity, memo)
        return duplicate

    @property
    def description(self) -> str:
        """Additional description of the entity collection.

        The description is a string containing any information relevant to the entity
        collection. This can include, e.g., whether it is a set of experimental
        or simulation quantities or outline the overall workflow.
        """
        return self._description

    @description.setter
    def description(self, value) -> None:
        if isinstance(value, str):
            self._description = value
        else:
            raise ValueError(
                f"Description must be a string. "
                f"Received value: {value} of type: {type(value)}."
            )

    def __repr__(self) -> str:
        """Show container elements."""
        args = f"description={self.description!r},\n"
        args += "\n".join(f"{key}={val!r}," for key, val in self._entities.items())
        return f"{self.__class__.__name__}(\n{textwrap.indent(args, ' ' * 4)}\n)"

    def to_dataframe(self, include_units: bool = False) -> pd.DataFrame:
        """Convert values to dataframe.

        Each element becomes one column. The ``value`` attribute of an element is
        used if it has one, otherwise the element itself.

        Args:
            include_units: If true, include units in the dataframe column names.

        Raises:
            ValueError: If the collection contains nested collections.
        """
        if any(isinstance(element, EntityCollection) for _name, element in self):
            raise ValueError("Nested collections cannot be converted to dataframe.")

        def column_name(name: str, element) -> str:
            """Return ``name``, followed by " (unit)" if requested and available."""
            if include_units:
                # Inspect the stored element itself. Attribute access on the
                # collection would return a method/property of the class when the
                # element name shadows one, and the unit would be lost.
                unit = getattr(element, "unit", None)
                if unit and str(unit):
                    return f"{name} ({unit!s})"
            return f"{name}"

        return pd.DataFrame(
            {
                column_name(name, element): np.atleast_1d(
                    getattr(element, "value", element)
                )
                for name, element in self
            }
        )

    @classmethod
    def from_dataframe(
        cls, dataframe: pd.DataFrame, metadata: dict[str, dict]
    ) -> mammos_entity.EntityCollection:
        """Create EntityCollection from dataframe and metadata.

        The EntityCollection is created by combining metadata with data from the
        dataframe matching key/column names. The available metadata determines whether
        an element becomes an :py:class:`~mammos_entity.Entity`, a
        :py:class:`mammos_units.Quantity` or a numpy array.

        All column names in the `dataframe` must also exist as keys in `metadata` and
        vice versa.

        In addition `metadata` can have a key ``description`` containing a description
        for the collection.

        Columns containing a single row are stored as scalar values.

        Args:
            dataframe: A dataframe containing the values for the individual entities.
            metadata: A dictionary mapping each column name to a dictionary with
                metadata for that column. If the key ``ontology_label`` is present
                the column becomes an :py:class:`~mammos_entity.Entity`; the keys
                ``unit`` and ``description`` are optional in that case (``None`` and
                an empty description are passed on if they are missing). If only
                ``unit`` is present the column becomes a
                :py:class:`mammos_units.Quantity`. Otherwise the plain values are
                stored.

        Raises:
            ValueError: If column names of `dataframe` and keys of `metadata` do not
                match.
        """
        metadata = copy.deepcopy(metadata)  # do not modify the user's metadata dict
        description = metadata.pop("description", "")

        columns = set(dataframe.columns)
        keys = set(metadata)
        # Labels are converted to str so that non-string column labels produce the
        # intended ValueError instead of a TypeError raised by ``str.join``.
        if missing_keys := columns - keys:
            raise ValueError(
                "Entity_Metadata is missing for columns: "
                f"{', '.join(sorted(map(str, missing_keys)))}"
            )
        if missing_columns := keys - columns:
            raise ValueError(
                "Dataframe columns are missing for metadata entries: "
                f"{', '.join(sorted(map(str, missing_columns)))}"
            )

        collection = cls(description=description)
        for name, entry in metadata.items():
            value = dataframe[name].to_numpy()
            if len(value) == 1:
                value = value[0]
            if "ontology_label" in entry:
                elem = me.Entity(
                    ontology_label=entry["ontology_label"],
                    value=value,
                    unit=entry.get("unit"),
                    description=entry.get("description", ""),
                )
            elif "unit" in entry:
                elem = u.Quantity(
                    value=value,
                    unit=entry["unit"],
                )
            else:
                elem = value
            # Use item assignment rather than keyword arguments of the constructor so
            # that column names clashing with constructor parameters are supported.
            collection[name] = elem
        return collection

    def to_csv(self, filename: str | os.PathLike) -> None:
        r"""Write collection to CSV file.

        CSV files contain data in normal CSV format and additional metadata lines at the
        top of the file. Some of the lines are commented with ``#``. This structure is
        fixed and additional comment lines or inline comments in the data table are not
        allowed.

        The lines are, in order:

        - (commented) the file version in the form ``mammos csv v<VERSION>`` (matching
          regex v\d+)
        - (commented, optional) a description of the file, appearing delimited by
          dashed lines
        - (optional, only for entities) the preferred ontology label
        - (optional, only for entities) a description string
        - (optional, only for entities) the ontology IRI
        - (optional, for entities and quantities) units
        - the short labels used to refer to individual columns when working with the
          data, e.g. in a :py:class:`pandas.DataFrame` (omitting spaces in this string
          is advisable; ideally this string is the short ontology label)
        - all remaining lines contain data.

        Elements in a line are separated by a comma without any surrounding whitespace.
        A trailing comma is not permitted. Line continuation is OS dependent (\r\n on
        Windows, \n on Unix).

        In columns without ontology the lines containing labels, IRIs, and description
        are empty.

        Similarly, columns without units (with or without ontology entry) have empty
        units line.

        For any column, the description line can be empty. Only entities can store
        descriptions, i.e., if the ontology-related lines are empty, the description
        string will not be read.

        .. version-added:: v2
           The optional description of the file.

        .. version-added:: v3
           Additional description metadata row containing a description for each column.

        .. version-changed:: v3
           Ontology labels, entity descriptions, IRIs, and units are no longer
           commented.

        Args:
            filename: Name of the generated file. An existing file with the same name
                is overwritten without notice.

        Raises:
            ValueError: If the entities are not tabular. CSV files can only be written
                for collections in which all entities are either scalar or
                one-dimensional with the same length.
            ValueError: If elements of the collection are of type
                :py:class:`~mammos_entity.EntityCollection` (nested collections are not
                supported in CSV) or if the collection is empty.

        Example:
            Here is an example with five columns:

            - an index with no units or ontology label
            - the entity spontaneous magnetization with an entry in the ontology and a
              description
            - a made-up quantity alpha with a unit but no ontology label
            - demagnetizing factor with an ontology entry but no unit
            - a column `comment` containing a string comment without units or ontology
              label

            The file has a description reading "Test data".

            >>> from pathlib import Path
            >>> import mammos_entity as me
            >>> import mammos_units as u
            >>> collection = me.EntityCollection(
            ...     description="Test data",
            ...     index=[0, 1, 2],
            ...     Ms=me.Entity("SpontaneousMagnetization", [1e2, 1e2, 1e2], "kA/m", description="Magnetization at 0 Kelvin"),
            ...     alpha=[1.2, 3.4, 5.6] * u.s**2,
            ...     DemagnetizingFactor=me.Entity("DemagnetizingFactor", [1, 0.5, 0.5]),
            ...     comment=[
            ...         "Comment in the first row",
            ...         "Comment in the second row",
            ...         "Comment in the third row",
            ...     ],
            ... )
            >>> collection.to_csv("example.csv")

            The new file has the following content:

            >>> print(Path("example.csv").read_text())
            # mammos csv v3
            #----------------------------------------
            # Test data
            #----------------------------------------
            ,SpontaneousMagnetization,,DemagnetizingFactor,
            ,Magnetization at 0 Kelvin,,,
            ,https://w3id.org/emmo/domain/magnetic-materials#EMMO_032731f8-874d-5efb-9c9d-6dafaa17ef25,,https://w3id.org/emmo/domain/magnetic-materials#EMMO_0f2b5cc9-d00a-5030-8448-99ba6b7dfd1e,
            ,kA / m,s2,,
            index,Ms,alpha,DemagnetizingFactor,comment
            0,100.0,1.2,1.0,Comment in the first row
            1,100.0,3.4,0.5,Comment in the second row
            2,100.0,5.6,0.5,Comment in the third row
            <BLANKLINE>

            Finally, remove the file.

            >>> Path("example.csv").unlink()

        """  # noqa: E501
        if any(isinstance(element, EntityCollection) for _name, element in self):
            raise ValueError("Nested collections cannot be saved to CSV.")
        if len(self) == 0:
            raise ValueError("Empty collections cannot be saved to CSV.")

        # convert data first because that will catch incompatible shape
        dataframe = self.to_dataframe()

        # Header rows written in CSV format. Elements lacking one of the attributes
        # contribute an empty cell.
        metadata_rows = [
            [getattr(elem, "ontology_label", "") for _, elem in self],
            [getattr(elem, "description", "") for _, elem in self],
            [getattr(elem, "ontology_iri", "") for _, elem in self],
            [str(getattr(elem, "unit", "")) for _, elem in self],
        ]

        # newline="" disables newline translation; line endings are written
        # explicitly using os.linesep.
        with open(filename, "w", newline="") as csvfile:
            csvfile.write(f"# mammos csv v3{os.linesep}")
            if self.description:
                csvfile.write("#" + "-" * 40 + os.linesep)
                for line in self.description.splitlines():
                    csvfile.write(f"# {line}{os.linesep}")
                csvfile.write("#" + "-" * 40 + os.linesep)
            writer = csv.writer(
                csvfile,
                delimiter=",",
                quoting=csv.QUOTE_MINIMAL,
                lineterminator=os.linesep,
            )
            writer.writerows(metadata_rows)
            dataframe.to_csv(csvfile, index=False)

    def to_yaml(self, filename: str | os.PathLike) -> None:
        r"""Write collection to YAML file.

        MaMMoS YAML files have the following format:

        - one commented line at the top of the file containing the mammos format
          version in the form `# mammos yaml v<version-number>`.
        - a mapping with three top-level keys ``metadata``, ``description`` and ``data``
        - ``metadata`` is currently unused and should be empty
        - the ``description`` key contains a (multi-line) string with arbitrary content
          describing the top-level collection
        - ``data`` contains one key per element in the collection. Each entry is either
          an entity-like entry or a nested collection node.

        Collection nodes are recursive and have two keys ``description`` and ``data``:

        - ``description``: a (multi-line) string with arbitrary content
        - ``data``: mapping from entry names to entity-like entries or nested
          collection nodes

        Entity-like entries have the following keys:

        - For :py:class:`~mammos_entity.Entity`:

          - ``ontology_label``: label in the ontology
          - ``description``: description string
          - ``ontology_iri``: IRI of the entity
          - ``unit``: unit of the entity (``""`` for dimensionless)
          - ``value``: value of the data

        - For :py:class:`~mammos_units.Quantity`:

          - ``unit``: unit of the quantity
          - ``value``: value of the data

        - For any other value:

          - ``value``: value of the data

        .. version-added:: v2
           The ``description`` key for each object.

        .. version-changed:: v2
           - The version of the file is now stored in the first commented line,
             previously it was stored in ``metadata:description``.
           - The top-level collection description is stored under ``description``
             (next to ``metadata`` and ``data``). Previously it was stored in
             ``metadata:description``.
           - Non-entity entries no longer store null-valued ontology keys.
           - Nested collections are supported recursively.

        Args:
            filename: Name of the generated file. An existing file with the same name
                is overwritten without notice.

        Raises:
            ValueError: If the top-level collection is empty.

        Example:
            Here is an example with six entries:

            - an index with no units or ontology label
            - the entity spontaneous magnetization with an entry in the ontology and a
              description
            - a made-up quantity alpha with a unit but no ontology label
            - demagnetizing factor with an ontology entry but no unit
            - a column `comment` containing a string comment without units or ontology
              label
            - an element Tc with only a single value

            The file has a description reading "Test data".

            >>> from pathlib import Path
            >>> import mammos_entity as me
            >>> import mammos_units as u
            >>> collection = me.EntityCollection(
            ...     description="Test data",
            ...     index=[0, 1, 2],
            ...     Ms=me.Entity("SpontaneousMagnetization", [1e2, 1e2, 1e2], "kA/m", description="Magnetization at 0 Kelvin"),
            ...     alpha=[1.2, 3.4, 5.6] * u.s**2,
            ...     DemagnetizingFactor=me.Entity("DemagnetizingFactor", [1, 0.5, 0.5]),
            ...     comment=[
            ...         "Comment in the first row",
            ...         "Comment in the second row",
            ...         "Comment in the third row",
            ...     ],
            ...     Tc=me.Tc(300, "K"),
            ... )
            >>> collection.to_yaml("example.yaml")

            The new file has the following content:

            >>> print(Path("example.yaml").read_text())
            # mammos yaml v2
            metadata: null
            description: Test data
            data:
              index:
                value: [0, 1, 2]
              Ms:
                ontology_label: SpontaneousMagnetization
                description: Magnetization at 0 Kelvin
                ontology_iri: https://w3id.org/emmo/domain/magnetic-materials#EMMO_032731f8-874d-5efb-9c9d-6dafaa17ef25
                unit: kA / m
                value: [100.0, 100.0, 100.0]
              alpha:
                unit: s2
                value: [1.2, 3.4, 5.6]
              DemagnetizingFactor:
                ontology_label: DemagnetizingFactor
                description: ''
                ontology_iri: https://w3id.org/emmo/domain/magnetic-materials#EMMO_0f2b5cc9-d00a-5030-8448-99ba6b7dfd1e
                unit: ''
                value: [1.0, 0.5, 0.5]
              comment:
                value: [Comment in the first row, Comment in the second row, Comment in the third
                    row]
              Tc:
                ontology_label: CurieTemperature
                description: ''
                ontology_iri: https://w3id.org/emmo#EMMO_6b5af5a8_a2d8_4353_a1d6_54c9f778343d
                unit: K
                value: 300.0
            <BLANKLINE>
            >>> Path("example.yaml").unlink()

            Here is a second example with one outer and one inner collection:

            >>> properties = me.EntityCollection(
            ...     description="material properties",
            ...     Ms=me.Ms(1.3e3, "kA/m"),
            ...     Tc=me.Tc(1043, "K"),
            ... )
            >>> measurement = me.EntityCollection(
            ...     description="measurement with device X",
            ...     sample=properties,
            ...     T=me.T(300, "K", description="Measurement conditions"),
            ...     H=me.H([0, 50, 100], "kA/m"),
            ...     M=me.M([100, 300, 500], "kA/m"),
            ... )
            >>> measurement.to_yaml("nested_example.yaml")
            >>> print(Path("nested_example.yaml").read_text())
            # mammos yaml v2
            metadata: null
            description: measurement with device X
            data:
              sample:
                description: material properties
                data:
                  Ms:
                    ontology_label: SpontaneousMagnetization
                    description: ''
                    ontology_iri: https://w3id.org/emmo/domain/magnetic-materials#EMMO_032731f8-874d-5efb-9c9d-6dafaa17ef25
                    unit: kA / m
                    value: 1300.0
                  Tc:
                    ontology_label: CurieTemperature
                    description: ''
                    ontology_iri: https://w3id.org/emmo#EMMO_6b5af5a8_a2d8_4353_a1d6_54c9f778343d
                    unit: K
                    value: 1043.0
              T:
                ontology_label: ThermodynamicTemperature
                description: Measurement conditions
                ontology_iri: https://w3id.org/emmo#EMMO_affe07e4_e9bc_4852_86c6_69e26182a17f
                unit: K
                value: 300.0
              H:
                ontology_label: ExternalMagneticField
                description: ''
                ontology_iri: https://w3id.org/emmo/domain/magnetic-materials#EMMO_da08f0d3-fe19-58bc-8fb6-ecc8992d5eb3
                unit: kA / m
                value: [0.0, 50.0, 100.0]
              M:
                ontology_label: Magnetization
                description: ''
                ontology_iri: https://w3id.org/emmo#EMMO_b23e7251_a488_4732_8268_027ad76d7e37
                unit: kA / m
                value: [100.0, 300.0, 500.0]
            <BLANKLINE>
            >>> Path("nested_example.yaml").unlink()

        """  # noqa: E501

        def _serialize_entity_like(
            element: mammos_entity.Entity
            | mammos_units.Quantity
            | numpy.typing.ArrayLike,
        ) -> dict:
            """Convert a single entity-like element to a plain dictionary."""
            if isinstance(element, me.Entity):
                return {
                    "ontology_label": element.ontology_label,
                    "description": element.description,
                    "ontology_iri": element.ontology_iri,
                    "unit": str(element.unit),
                    "value": element.value.tolist(),
                }
            elif isinstance(element, u.Quantity):
                return {
                    "unit": str(element.unit),
                    "value": element.value.tolist(),
                }
            else:
                return {"value": np.asanyarray(element).tolist()}

        # Only the top-level collection is checked; nested collections may be empty.
        if len(self) == 0:
            raise ValueError("Empty collections cannot be saved to YAML.")

        def _serialize_collection(collection: EntityCollection) -> dict:
            """Recursively convert a collection to a ``description``/``data`` node."""
            result = {"description": collection.description, "data": {}}
            for name, element in collection:
                if isinstance(element, EntityCollection):
                    result["data"][name] = _serialize_collection(element)
                else:
                    result["data"][name] = _serialize_entity_like(element)
            return result

        entity_dict = {"metadata": None, **_serialize_collection(self)}

        # custom dumper to change style of lists, tuples and multi-line strings
        class _Dumper(yaml.SafeDumper):
            pass

        def _represent_sequence(dumper, value):
            """Display sequence with flow style.

            A list [1, 2, 3] for key `value` is written to file as::

                value: [1, 2, 3]

            instead of::

                value:
                - 1
                - 2
                - 3

            """
            return dumper.represent_sequence(
                "tag:yaml.org,2002:seq", value, flow_style=True
            )

        def _represent_string(dumper, value):
            """Control style of single-line and multi-line strings.

            Single-line strings are written as::

                some_key: Hello

            Multi-line strings are written as::

                some_key: |-
                  I am multi-line,
                  without a trailing new line.

            """
            style = "|" if "\n" in value else ""
            return dumper.represent_scalar("tag:yaml.org,2002:str", value, style=style)

        _Dumper.add_representer(list, _represent_sequence)
        _Dumper.add_representer(tuple, _represent_sequence)
        _Dumper.add_representer(str, _represent_string)

        with open(filename, "w") as f:
            f.write("# mammos yaml v2\n")
            yaml.dump(
                entity_dict,
                stream=f,
                Dumper=_Dumper,
                default_flow_style=False,
                sort_keys=False,
            )

    def to_hdf5(
        self, base: h5py.File | h5py.Group | str | os.PathLike, name: str | None = None
    ) -> h5py.Group | None:
        """Write a collection to an HDF5 group.

        Entities of the collection become datasets in the group. The collection
        description is added to the group attributes.

        Args:
            base: If it is an open HDF5 file or a group in an HDF5 file, data will be
                added to it as new group. If it is a str or PathLike a new HDF5 file
                with the given name will be created. If a file with that name exists
                already, it will be overwritten without notice.
            name: Name for the newly created group. If an element with that name
                exists already in `base` the function will fail. If `name` is ``None``
                entities of the collection will be added directly to `base` and the
                collection description will be added to `base` attributes.

        Returns:
            If `base` is an open `File` or `Group` the newly created group. If `base` is
            a file name nothing is returned (because the file created internally will be
            closed before the function returns).

        """
        return _to_hdf5(self, base, name)
def _to_hdf5(
    data: mammos_entity.Entity
    | mammos_units.Quantity
    | numpy.typing.ArrayLike
    | mammos_entity.EntityCollection,
    base: h5py.File | h5py.Group | str | os.PathLike,
    name: str | None,
    record_mammos_entity_version: bool = True,
) -> h5py.Dataset | h5py.Group | None:
    """Internal implementation with additional options required for recursion.

    Collections are written as groups (one child per element, nested collections
    recursively), entity-like objects as datasets.

    Args:
        data: Collection or entity-like object to write.
        base: Open HDF5 file/group to add the data to, or a file name. If a file
            name is given a new file is created (an existing file is overwritten).
        name: Name of the new group/dataset. May only be ``None`` if `data` is a
            collection, in which case the elements are added directly to `base`.
        record_mammos_entity_version: add mammos_entity version to group/dataset
            attributes. Elements inside a collection never record the version
            themselves.

    Returns:
        The created group or dataset, or ``None`` if `base` is a file name (the file
        is closed before the function returns).

    Raises:
        ValueError: If `name` is ``None`` and `data` is not a collection.
    """
    is_collection = isinstance(data, EntityCollection)
    if name is None and not is_collection:
        # Validate before opening a file so that invalid arguments do not
        # create/truncate the target file.
        raise ValueError("'name' must not be None when 'data' is entity-like.")

    if isinstance(base, str | os.PathLike):
        with h5py.File(base, "w") as f:
            _to_hdf5(
                data,
                f,
                name,
                record_mammos_entity_version=record_mammos_entity_version,
            )
        return None

    if is_collection:
        group = base.create_group(name, track_order=True) if name is not None else base
        group.attrs["description"] = data.description
        if record_mammos_entity_version:
            group.attrs["mammos_entity_version"] = me.__version__
        for child_name, entity_like in data:
            _to_hdf5(
                entity_like, group, child_name, record_mammos_entity_version=False
            )
        return group

    if isinstance(data, me.Entity):
        dset = data._to_hdf5(base, name, record_mammos_entity_version=False)
    elif isinstance(data, u.Quantity):
        dset = base.create_dataset(name, data=data.value)
        dset.attrs["unit"] = str(data.unit)
    else:
        dset = base.create_dataset(name, data=data)
    if record_mammos_entity_version:
        dset.attrs["mammos_entity_version"] = me.__version__
    return dset