# Source code for zema_emc_annotated.dataset

"""An API for accessing the data in the ZeMA remaining-useful life dataset"""

# Public API of this module: only the names listed here are exported by
# ``from zema_emc_annotated.dataset import *``.
__all__ = [
    "ExtractionDataType",
    "ZeMASamples",
    "ZEMA_DATASET_HASH",
    "ZEMA_DATASET_URL",
    "ZEMA_QUANTITIES",
]

import operator
import os
import pickle
from enum import Enum
from functools import reduce
from os.path import exists
from pathlib import Path
from typing import cast

import h5py
import numpy as np
from h5py import Dataset
from numpy._typing import NDArray
from pooch import os_cache, retrieve

from zema_emc_annotated.data_types import (
    RealMatrix,
    RealVector,
    SampleSize,
    UncertainArray,
)

# Checksum of the dataset file in ``<algorithm>:<hex digest>`` notation. It is
# handed to ``pooch.retrieve`` as ``known_hash`` unless hash checking is skipped.
ZEMA_DATASET_HASH = (
    "sha256:" "fb0e80de4e8928ae8b859ad9668a1b6ea6310028a6690bb8d4c1abee31cb8833"
)

# Download location of the HDF5 file that contains the annotated dataset.
ZEMA_DATASET_URL = (
    "https://zenodo.org/record/5185953/files/" "axis11_2kHz_ZeMA_PTB_SI.h5"
)

# Names of the quantities that are read from the ``ZeMA_DAQ`` group of the file.
# The tuple order determines the column order of the extracted arrays.
ZEMA_QUANTITIES = (
    "Acceleration",
    "Active_Current",
    "Force",
    "Motor_Current",
    "Pressure",
    "Sound_Pressure",
    "Velocity",
)


class ExtractionDataType(Enum):
    """Identifiers of data types in ZeMA dataset

    Attributes
    ----------
    VALUES : str
        with value ``qudt:value``
    UNCERTAINTIES : str
        with value ``qudt:standardUncertainty``
    """

    # Declaration order matters: ``ZeMASamples._extract_data`` iterates over this
    # enum and has to process the values of a quantity before its uncertainties,
    # because the normalization divisors are computed from the values.
    VALUES = "qudt:value"
    UNCERTAINTIES = "qudt:standardUncertainty"
class ZeMASamples:
    """Extracts requested number of samples of values with associated uncertainties

    The underlying dataset is the annotated "Sensor data set of one electromechanical
    cylinder at ZeMA testbed (ZeMA DAQ and Smart-Up Unit)" by Dorst et al.
    [Dorst2021]_. Each extracted sample will be cached in the download directory of
    the file, which is handled by :func:`pooch.os_cache`, where ``<AppName>``
    evaluates to ``pooch``. That way the concurrent retrieval of the same data is as
    performant as possible and can simply be left to ``zema_emc_annotated``. Wherever
    the result of ``ZeMASamples`` is needed in an external code base, it should be
    safe to call it over and over without causing unnecessary extractions or even
    downloads. The underlying mechanism is Python's built-in ``pickle``.

    Parameters
    ----------
    sample_size : SampleSize, optional
        tuple containing information about which samples to extract, defaults to
        default of :class:`~zema_emc_annotated.data_types.SampleSize`
    normalize : bool, optional
        if ``True``, then values are centered around zero and values and
        uncertainties are scaled to values' unit std, defaults to ``False``
    skip_hash_check : bool, optional
        allow to circumvent strict hash checking during the retrieve of dataset file,
        to speed up concurrent calls as each check for the large file might take
        several seconds, defaults to ``False``

    Attributes
    ----------
    uncertain_values : UncertainArray
        The collection of samples of values with associated uncertainties,
        will be of shape (``sample_size.n_cycles``, 11 x
        ``sample_size.datapoints_per_cycle``)
    samples_slice : slice
        The range of cycles that is extracted, starting at
        ``sample_size.idx_first_cycle`` and spanning ``sample_size.n_cycles`` cycles
    size_scaler : int
        The number of datapoints taken from each cycle, i.e.
        ``sample_size.datapoints_per_cycle``
    """

    uncertain_values: UncertainArray

    def __init__(
        self,
        sample_size: SampleSize = SampleSize(),
        normalize: bool = False,
        skip_hash_check: bool = False,
    ):
        first_cycle = sample_size.idx_first_cycle
        self.samples_slice: slice = np.s_[
            first_cycle : first_cycle + sample_size.n_cycles
        ]
        self.size_scaler = sample_size.datapoints_per_cycle

        # Compare against ``None`` explicitly, so that a cache hit never depends on
        # the truthiness of the object that was loaded from the cache file.
        cached_data = self._check_and_load_cache(normalize)
        if cached_data is not None:
            self.uncertain_values = cached_data
        else:
            # Start with zero columns; every extracted (sub-)dataset appends its
            # columns to these two accumulators.
            self._uncertainties = np.empty((sample_size.n_cycles, 0))
            self._values = np.empty((sample_size.n_cycles, 0))
            self.uncertain_values = self._extract_data(normalize, skip_hash_check)
            self._store_cache(normalize)
            # The accumulators are only needed while extracting.
            del self._uncertainties
            del self._values

    def _extract_data(
        self, normalize: bool, skip_hash_check: bool = True
    ) -> UncertainArray:
        """Extract the data as specified

        Retrieves the dataset file via ``pooch.retrieve`` and walks through the
        values and uncertainties of every quantity in ``ZEMA_QUANTITIES``, appending
        the requested part of each to the internal accumulators.

        Parameters
        ----------
        normalize : bool
            if ``True``, values are centered and values and uncertainties are divided
            by the standard deviations of the values
        skip_hash_check : bool, optional
            if ``True``, the file is retrieved without verifying its hash. Note that
            the default here is ``True``, unlike the constructor's ``False``; the
            constructor always passes its own value explicitly.

        Returns
        -------
        UncertainArray
            the accumulated values and the associated uncertainties
        """
        dataset_full_path = retrieve(
            url=ZEMA_DATASET_URL,
            known_hash=None if skip_hash_check else ZEMA_DATASET_HASH,
            progressbar=True,
        )
        # Sanity check on the result of the retrieval, not input validation.
        assert exists(dataset_full_path)
        # For each quantity the values come first and the uncertainties second
        # (declaration order of ``ExtractionDataType``). The uncertainties rely on
        # the normalization divisors computed while treating the values.
        relevant_datasets = (
            ["ZeMA_DAQ", quantity, datatype.value]
            for quantity in ZEMA_QUANTITIES
            for datatype in ExtractionDataType
        )
        self._normalization_divisors: dict[str, NDArray[np.double] | float] = {}
        with h5py.File(dataset_full_path, "r") as h5f:
            for dataset_descriptor in relevant_datasets:
                # Descend group by group: h5f["ZeMA_DAQ"][quantity][datatype].
                self._current_dataset: Dataset = cast(
                    Dataset, reduce(operator.getitem, dataset_descriptor, h5f)
                )
                if ExtractionDataType.VALUES.value in self._current_dataset.name:
                    treating_values = True
                    print(f" Extract values from {self._current_dataset.name}")
                else:
                    treating_values = False
                    print(
                        f" Extract uncertainties from "
                        f"{self._current_dataset.name}"
                    )
                if self._current_dataset.shape[0] == 3:
                    # A leading dimension of length 3 is treated as a stack of three
                    # sub-datasets, each handled on its own.
                    # NOTE(review): presumably the three axes of one sensor - confirm.
                    for idx, sensor in enumerate(self._current_dataset):
                        if treating_values:
                            self._normalize_values_if_requested_and_append(
                                sensor,
                                self._extract_sub_dataset_name(idx),
                                normalize,
                            )
                        else:
                            self._normalize_uncertainties_if_requested_and_append(
                                sensor,
                                self._extract_sub_dataset_name(idx),
                                normalize,
                            )
                else:
                    if treating_values:
                        self._normalize_values_if_requested_and_append(
                            self._current_dataset,
                            self._strip_data_type_from_dataset_descriptor(),
                            normalize,
                        )
                    else:
                        self._normalize_uncertainties_if_requested_and_append(
                            self._current_dataset,
                            self._strip_data_type_from_dataset_descriptor(),
                            normalize,
                        )
                if treating_values:
                    print(" Values extracted")
                else:
                    print(" Uncertainties extracted")
        return UncertainArray(self._values, self._uncertainties)

    def _normalize_values_if_requested_and_append(
        self, values: Dataset, dataset_descriptor: str, normalize: bool
    ) -> None:
        """Normalize the provided values and append according to current state

        Parameters
        ----------
        values : Dataset
            the values to take the requested datapoints and cycles from
        dataset_descriptor : str
            key under which the normalization divisors of these values are stored,
            so that the matching uncertainties can be scaled with them later
        normalize : bool
            if ``True``, subtract the mean and divide by the standard deviation
        """
        # First axis: datapoints within a cycle, second axis: cycles.
        _potentially_normalized_values = values[
            np.s_[: self.size_scaler, self.samples_slice]
        ]
        if normalize:
            # Mean and standard deviation are computed per selected cycle over all
            # datapoints of that cycle, not only over the first ``size_scaler`` ones.
            _potentially_normalized_values -= np.mean(
                values[:, self.samples_slice], axis=0
            )
            data_std = np.std(values[:, self.samples_slice], axis=0)
            # Avoid dividing by zero for constant cycles.
            data_std[data_std == 0] = 1.0
            self._normalization_divisors[dataset_descriptor] = data_std
            _potentially_normalized_values /= self._normalization_divisors[
                dataset_descriptor
            ]
        # Transpose to (cycles, datapoints) and add the datapoints as new columns.
        self._values = np.append(
            self._values, _potentially_normalized_values.transpose(), axis=1
        )

    def _normalize_uncertainties_if_requested_and_append(
        self, uncertainties: Dataset, dataset_descriptor: str, normalize: bool
    ) -> None:
        """Normalize the provided uncertainties and append according to current state

        Parameters
        ----------
        uncertainties : Dataset
            the uncertainties to take the requested datapoints and cycles from
        dataset_descriptor : str
            key of the normalization divisors, which must have been stored before
            by treating the corresponding values
        normalize : bool
            if ``True``, divide by the stored normalization divisors
        """
        _potentially_normalized_uncertainties = uncertainties[
            np.s_[: self.size_scaler, self.samples_slice]
        ]
        if normalize:
            # Uncertainties are only scaled, not shifted.
            _potentially_normalized_uncertainties /= self._normalization_divisors[
                dataset_descriptor
            ]
        self._uncertainties = np.append(
            self._uncertainties,
            _potentially_normalized_uncertainties.transpose(),
            axis=1,
        )

    def _extract_sub_dataset_name(self, idx: int) -> str:
        """Build the key for the ``idx``-th sub-dataset of the current dataset

        The key consists of the current dataset's name without the data type
        identifier, followed by the ``idx``-th comma-separated entry of the dataset's
        ``si:label`` attribute. Brackets, spaces, double quotes, the word
        ``uncertainty`` and newlines are removed, which lets the values and the
        uncertainties of the same sub-dataset share one key.
        """
        return str(
            self._strip_data_type_from_dataset_descriptor()
            + self._current_dataset.attrs["si:label"]
            .split(",")[idx]
            .strip("[")
            .strip("]")
            .replace(" ", "")
            .replace('"', "")
            .replace("uncertainty", "")
        ).replace("\n", "")

    def _strip_data_type_from_dataset_descriptor(self) -> str:
        """Return the current dataset's name without the data type identifier"""
        return str(
            self._current_dataset.name.replace(
                ExtractionDataType.UNCERTAINTIES.value, ""
            ).replace(ExtractionDataType.VALUES.value, "")
        )

    @property
    def values(self) -> RealVector:
        """The values of the stored :class:`UncertainArray` object"""
        return self.uncertain_values.values

    @property
    def uncertainties(self) -> RealMatrix | RealVector:
        """The uncertainties of the stored :class:`UncertainArray` object"""
        return self.uncertain_values.uncertainties

    def _check_and_load_cache(self, normalize: bool) -> UncertainArray | None:
        """Checks if corresponding file for n_cycles exists and loads it with pickle

        Returns
        -------
        UncertainArray or None
            the cached data, or ``None`` if there is no cache file for the current
            parameters
        """
        cache_path = self._cache_path(normalize)
        if not os.path.exists(cache_path):
            return None
        with open(cache_path, "rb") as cache_file:
            # NOTE(review): unpickling can execute arbitrary code. This is only
            # acceptable as long as the local cache directory is trusted.
            return cast(UncertainArray, pickle.load(cache_file))

    def _cache_path(self, normalize: bool) -> Path:
        """Local file system path for a cache file containing n ZeMA samples

        The file name encodes the number of samples, the index of the first sample
        (unless it is zero or unset), the number of values per sensor and whether
        the data is normalized.

        The result does not guarantee, that the file at the specified location exists,
        but can be used to check for existence or creation.
        """
        assert self.samples_slice.stop is not None  # pylint: disable=no-member
        idx_start = self.samples_slice.start  # pylint: disable=no-member
        n_samples = (
            self.samples_slice.stop - idx_start  # pylint: disable=no-member
            if self.samples_slice.start is not None  # pylint: disable=no-member
            else self.samples_slice.stop  # pylint: disable=no-member
        )
        return Path(
            os_cache("pooch").joinpath(
                f"{str(n_samples)}_samples"
                f"{'_starting_from_' + str(idx_start) if idx_start else ''}_with_"
                f"{str(self.size_scaler)}_values_per_sensor"
                f"{'_normalized' if normalize else ''}.pickle"
            )
        )

    def _store_cache(self, normalize: bool) -> None:
        """Dumps provided uncertain tensor to corresponding pickle file"""
        with open(self._cache_path(normalize), "wb") as cache_file:
            pickle.dump(self.uncertain_values, cache_file)