Coverage for ebcpy/data_types.py: 96%
253 statements
« prev ^ index » next coverage.py v7.4.4, created at 2026-05-29 13:01 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2026-05-29 13:01 +0000
1"""
2This module provides useful classes for all ebcpy.
3Every data_type class should include every parameter
4other classes like optimization etc. may need. The checking
5of correct input is especially relevant here as the correct
6format of data-types will prevent errors during simulations,
7optimization etc.
8"""
10import os
11import warnings
12from pathlib import Path
13from typing import List, Union, Any, TYPE_CHECKING
14from datetime import datetime
15from pandas.core.internals import BlockManager
16import pandas as pd
17import numpy as np
18import ebcpy.modelica.simres as sr
20from ebcpy.utils import get_names
21from ebcpy import preprocessing
23# pylint: disable=I1101
24# pylint: disable=too-many-ancestors
26__all__ = ['TimeSeries',
27 'TimeSeriesData',
28 'numeric_index_dtypes',
29 'load_time_series_data',
30 'index_is_numeric',
31 'datetime_indexes']
# Dtypes that a pandas Index reports for the numeric dtype names below.
# The dtypes are obtained from empty indexes so they match what the
# installed pandas version actually uses for each name.
numeric_index_dtypes = [
    pd.Index([], dtype=dtype_name).dtype
    for dtype_name in (
        "int8", "int16", "int32", "int64",
        "uint8", "uint16", "uint32", "uint64",
        "float32", "float64",
    )
]

# Index classes that are accepted as datetime-based indexes.
datetime_indexes = [
    pd.DatetimeIndex
]


def index_is_numeric(index: pd.Index):
    """
    Check if a pandas Index is numeric.

    :param pd.Index index:
        The index to check.
    :return: bool
        True if the index is a pd.RangeIndex or if its dtype
        is listed in ``numeric_index_dtypes``.
    """
    if isinstance(index, pd.RangeIndex):
        return True
    return index.dtype in numeric_index_dtypes
@pd.api.extensions.register_dataframe_accessor("tsd")
class TimeSeriesAccessor:
    """
    Pandas DataFrame accessor for time series functionality.
    Access using df.tsd.*

    The accessor keeps a reference to the DataFrame it is attached to
    and forwards most calls to functions of ``ebcpy.preprocessing``.
    It additionally stores an optional filepath that is used as the
    default target of :meth:`save`.
    """

    def __init__(self, pandas_obj):
        self._obj = pandas_obj
        self._filepath = None

    @property
    def filepath(self):
        """Get the filepath associated with the time series data"""
        return self._filepath

    @filepath.setter
    def filepath(self, filepath):
        """Set the filepath associated with the time series data"""
        # Falsy values (None, empty string) reset the stored filepath
        self._filepath = Path(filepath) if filepath else None

    def save(self, filepath: Union[str, Path] = None, **kwargs) -> None:
        """
        Save the current time-series-data into the given file-format.
        Currently supported are .hdf, which is an easy and fast storage,
        and, .csv is supported as an easy-readable option.
        Also, .parquet, and with additional compression .parquet.COMPRESSION_NAME
        are supported.

        :param str,os.path.normpath filepath:
            Filepath where to store the data. Either .hdf, .csv, .parquet
            or .parquet.COMPRESSION_NAME has to be the file-ending.
            Default is the filepath stored in this accessor.
        :keyword str key:
            Necessary keyword-argument for saving a .hdf-file.
            Specifies the key of the table in the .hdf-file.
        :keyword str sep:
            Separator used for saving as .csv. Default is ','.
        :keyword str engine:
            Choose the engine for writing .parquet files. Default is 'pyarrow'
            Other option is 'fastparquet' (python>=3.9).
        :raises FileNotFoundError:
            If neither ``filepath`` is given nor a filepath is stored.
        :raises KeyError:
            If a .hdf file should be saved without the keyword ``key``.
        :raises TypeError:
            If the file-ending is not supported.
        """
        # Fall back to the stored filepath if none is given
        if filepath is None:
            if self.filepath is None:
                raise FileNotFoundError(
                    "TimeSeriesData has neither a filepath stored in tsd "
                    "accessor nor did you provide a filepath were to store the data."
                )
            filepath = self.filepath
        else:
            filepath = Path(filepath)

        # Save based on file suffix
        if filepath.suffix == ".hdf":
            if "key" not in kwargs:
                raise KeyError("Argument 'key' must be specified to save a .hdf file")
            self._obj.to_hdf(filepath, key=kwargs["key"])
        elif filepath.suffix == ".csv":
            self._obj.to_csv(filepath, sep=kwargs.get("sep", ","))
        elif ".parquet" in filepath.name:
            # Everything after ".parquet" is ".COMPRESSION_NAME" or empty
            compression_suffix = filepath.name.split(".parquet")[-1]
            # Parquet doesn't support SparseDtype — densify before writing.
            # Only copy the frame if there is something to densify.
            sparse_columns = [
                col for col, dtype in self._obj.dtypes.items()
                if isinstance(dtype, pd.SparseDtype)
            ]
            df_to_save = self._obj
            if sparse_columns:
                df_to_save = self._obj.copy()
                for col in sparse_columns:
                    df_to_save[col] = df_to_save[col].sparse.to_dense()
            df_to_save.to_parquet(
                filepath, engine=kwargs.get('engine', 'pyarrow'),
                # Strip the leading dot of ".COMPRESSION_NAME"
                compression=compression_suffix[1:] if compression_suffix else None,
                index=True
            )
        else:
            raise TypeError("Given file-format is not supported. "
                            "You can only store time series data as .hdf, .csv, .parquet, "
                            "and .parquet.COMPRESSION_NAME with additional compression options")

    def to_datetime_index(self, unit_of_index="s", origin=None, inplace=True):
        """
        Convert the current index to a datetime index using
        ebcpy.preprocessing.convert_index_to_datetime_index()

        :param str unit_of_index: default 's'
            The unit of the given index. Used to convert to
            total_seconds later on.
        :param datetime.datetime origin:
            The reference datetime object for the first index.
            Default (None) is the system time at the moment of the call.
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: df
            Copy of DataFrame with correct index for usage in this
            framework.
        """
        # Resolve the default at call time. A default of datetime.now()
        # in the signature would be evaluated only once at import time.
        if origin is None:
            origin = datetime.now()
        return preprocessing.convert_index_to_datetime_index(
            df=self._obj,
            unit_of_index=unit_of_index,
            origin=origin,
            inplace=inplace
        )

    def to_float_index(self, offset=0, inplace=True):
        """
        Convert the current index to a float based index using
        ebcpy.preprocessing.convert_datetime_index_to_float_index()

        :param float offset:
            Offset in seconds
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: pd.DataFrame df:
            DataFrame with correct index.
        """
        # Nothing to convert if the index is not datetime based
        if not isinstance(self._obj.index, pd.DatetimeIndex):
            if inplace:
                return None
            return self._obj

        return preprocessing.convert_datetime_index_to_float_index(
            df=self._obj,
            offset=offset,
            inplace=inplace
        )

    def clean_and_space_equally(self, desired_freq, inplace=False):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.clean_and_space_equally_time_series()
        See the docstring of this function to know what is happening.

        :param str desired_freq:
            Frequency to determine number of elements in processed dataframe.
            Options are for example:
            - s: second-based
            - 5s: Every 5 seconds
            - 6min: Every 6 minutes
            This also works for h, d, m, y, ms etc.
        :param bool inplace:
            Deprecated and without effect. Always use the return value!
        :return: pd.DataFrame
            Cleaned and equally spaced data-frame
        """
        df = preprocessing.clean_and_space_equally_time_series(
            df=self._obj,
            desired_freq=desired_freq
        )
        if inplace:
            warnings.warn(
                "inplace=True on clean_and_space_equally has no effect when called "
                "via the .tsd accessor. Use the return value instead: "
                "df = df.tsd.clean_and_space_equally(freq, inplace=False)",
                FutureWarning,
                stacklevel=2,
            )
        return df

    def low_pass_filter(self, crit_freq, filter_order, variable):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.low_pass_filter()
        See the docstring of this function to know what is happening.

        :param float crit_freq:
            The critical frequency or frequencies.
        :param int filter_order:
            The order of the filter
        :param str variable:
            The variable name to apply the filter to
        :return: numpy.ndarray
            Filtered data
        """
        return preprocessing.low_pass_filter(
            data=self._obj[variable].to_numpy(),
            filter_order=filter_order,
            crit_freq=crit_freq
        )

    def moving_average(self, window, variable):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.moving_average()
        See the docstring of this function to know what is happening.

        :param int window:
            sample rate of input
        :param str variable:
            The variable name to apply the filter to
        :return: numpy.ndarray
            Moving average result
        """
        return preprocessing.moving_average(
            data=self._obj[variable].to_numpy(),
            window=window,
        )

    def get_variable_names(self, patterns: Union[str, List[str]] = None) -> List[str]:
        """
        Return an alphabetically sorted list of variable names, optionally filtered by patterns.

        By default, returns all column names found in the DataFrame, sorted alphabetically.
        If `patterns` is provided, only names matching one or more of the given
        literal strings or glob-style patterns (where `*` matches any sequence of characters)
        will be returned.

        :param patterns:
            - A single string or list of strings.
            - Each entry may be an exact variable name, or a pattern containing `*` as a wildcard.
            - If None, all variable names are returned.
        :return:
            A list of matching variable names, in alphabetical order.
        :raises KeyError:
            If any literal name or pattern does not match at least one variable in the DataFrame.

        Example:
            # return all wall temperatures at any layer
            df.tsd.get_variable_names("*wall.layer[*].T")
            ["wall.layer[1].T", "wall.layer[2].T", "wall.layer[3].T"]
        """
        # Level 0 holds the variable names, also for multi-level columns
        all_names = sorted(self._obj.columns.get_level_values(0).unique())
        if patterns is None:
            return all_names
        return get_names(all_names, patterns)

    def number_lines_totally_na(self):
        """
        Returns the number of rows in the given dataframe
        that are filled with NaN-values.
        """
        return preprocessing.number_lines_totally_na(self._obj)

    @property
    def frequency(self):
        """
        The frequency of the time series data.
        Returns the mean and the standard deviation of
        the index.

        :returns:
            float: Mean value
            float: Standard deviation
        """
        return preprocessing.get_df_index_frequency_mean_and_std(
            df_index=self._obj.index
        )
class TimeSeriesData(pd.DataFrame):
    """
    Most data related to energy and building
    climate related problems is time-variant.

    Class for handling time series data using a pandas dataframe.
    This class works file-based and makes the import of different
    file-types into a pandas DataFrame more user-friendly.
    Furthermore, functions to support multi-indexing are provided to
    efficiently handle variable passed processing and provide easy
    visualization and preprocessing access.

    :param str,os.path.normpath,pd.DataFrame data:
        Filepath ending with either .hdf, .mat, .csv, .parquet,
        or .parquet.COMPRESSION_NAME containing
        time-dependent data to be loaded as a pandas.DataFrame.
        Alternative option is to pass a DataFrame directly.
    :param bool use_multicolumn:
        If True, single-level columns are converted to the two-level
        (Variables, Tags) column format. Default is False.
    :keyword str key:
        Name of the table in a .hdf-file if the file
        contains multiple tables.
    :keyword str sep:
        separator for the use of a csv file. If none is provided,
        a comma (",") is used as a default value.
        See pandas.read_csv() docs for further information.
    :keyword int, list header:
        Header columns for .csv files.
        See pandas.read_csv() docs for further information.
        Default is first row (0).
    :keyword int,str index_col:
        Column to be used as index in .csv files.
        See pandas.read_csv() docs for further information.
        Default is first column (0).
    :keyword str sheet_name:
        Name of the sheet you want to load data from. Required keyword
        argument when loading a xlsx-file.
    :keyword str default_tag:
        Which value to use as tag. Default is 'raw'
    :keyword str engine:
        Chose the engine for reading .parquet files. Default is 'pyarrow'
        Other option is 'fastparquet' (python>=3.9).
    :keyword list variable_names:
        List of variable names to load from .mat file. If you
        know which variables you want to plot, this may speed up
        loading significantly, and reduce memory size drastically.
        You can also supply wildcard patterns (e.g. "*wall.layer[*].T", etc.)
        to match multiple variables at once.

    Examples:

    First let's see the usage for a common dataframe.

    >>> import numpy as np
    >>> from ebcpy import TimeSeriesData
    >>> tsd = TimeSeriesData({"my_variable": np.random.rand(5)})
    >>> tsd.to_datetime_index()
    >>> tsd.save("my_new_data.csv")

    Now, let's load the recently created file.

    >>> tsd = TimeSeriesData("my_new_data.csv")
    """

    # normal properties
    _metadata = [
        "_filepath",
        "_loader_kwargs",
        "_default_tag",
        "_multi_col_names"
    ]

    def __init__(self, data: Union[str, Any], use_multicolumn: bool = False, **kwargs):
        """Initialize class-objects and check correct input."""
        warnings.warn(
            "TimeSeriesData will be deprecated in the next major release. "
            "Instead, use 'load_time_series_data' to load files etc. as pd.DataFrame "
            "and use the 'tsd' accessor to access useful time-series-related functions "
            "as before with TimeSeriesData.", FutureWarning
        )
        if use_multicolumn:
            warnings.warn(
                "All multicolumn support will be removed in the next major release", FutureWarning
            )
        # Initialize as default
        self._filepath = None
        self._loader_kwargs = {}
        self._multi_col_names = ["Variables", "Tags"]
        self._default_tag = kwargs.pop("default_tag", "raw")
        if not isinstance(self._default_tag, str):
            raise TypeError(f"Invalid type for default_tag! Expected 'str' but "
                            f"received {type(self._default_tag)}")

        # Two possibles inputs. first argument is actually data provided by pandas
        # and kwargs hold further information or is it an actual filepath.
        if isinstance(data, BlockManager):
            super().__init__(data=data)
            return

        if not isinstance(data, (str, Path)):
            _df_loaded = pd.DataFrame(data=data,
                                      index=kwargs.get("index", None),
                                      columns=kwargs.get("columns", None),
                                      dtype=kwargs.get("dtype", None),
                                      copy=kwargs.get("copy", False))
        else:
            file = Path(data)
            self._loader_kwargs = kwargs.copy()
            _df_loaded = _load_df_from_file(file=file, **self._loader_kwargs)
            _df_loaded.tsd.filepath = file
            self._filepath = file

        if _df_loaded.columns.nlevels == 1:
            # Check if first level is named Tags.
            # If so, don't create MultiIndex-DF as the method is called by the pd constructor
            if _df_loaded.columns.name != self._multi_col_names[1] and use_multicolumn:
                multi_col = pd.MultiIndex.from_product(
                    [_df_loaded.columns, [self._default_tag]],
                    names=self._multi_col_names
                )
                _df_loaded.columns = multi_col

        elif _df_loaded.columns.nlevels == 2:
            if _df_loaded.columns.names != self._multi_col_names and use_multicolumn:
                raise TypeError("Loaded dataframe has a different 2-Level "
                                "header format than it is supported by this "
                                "class. The names have to match.")
        else:
            raise TypeError("Only DataFrames with Multi-Columns with 2 "
                            "Levels are supported by this class.")

        super().__init__(_df_loaded)

    @property
    def _constructor(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeriesData

    @property
    def _constructor_sliced(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeries

    @property
    def filepath(self) -> str:
        """Get the filepath associated with the time series data"""
        return self._filepath

    @filepath.setter
    def filepath(self, filepath: str):
        """Set the filepath associated with the time series data"""
        # Falsy values (None, empty string) reset the filepath,
        # matching the behaviour of the tsd accessor setter.
        self._filepath = Path(filepath) if filepath else None
        self.tsd.filepath = self._filepath

    @property
    def default_tag(self) -> str:
        """Get the default of time series data object"""
        return self._default_tag

    @default_tag.setter
    def default_tag(self, tag: str) -> None:
        """Set the default_tag of the time series data object
        :param tag: new tag
        :type tag: String
        """
        if not isinstance(tag, str):
            raise TypeError(f"Invalid type for default_tag! Expected 'str' but "
                            f"received {type(tag)}")
        if tag not in self.get_tags():
            raise KeyError(f"Tag '{tag}' does not exist for current data set!"
                           f"\n Available tags: {self.get_tags()}")
        self._default_tag = tag

    def save(self, filepath: str = None, **kwargs) -> None:
        """
        Save the current time-series-data into the given file-format.
        Currently supported are .hdf, which is an easy and fast storage,
        and, .csv is supported as an easy-readable option.
        Also, .parquet, and with additional compression .parquet.COMPRESSION_NAME
        are supported. Compressions could be gzip, brotli or snappy. For all possible
        compressions see the documentation of the parquet engines.
        For a small comparison of these data formats see https://github.com/RWTH-EBC/ebcpy/issues/81

        :param str,os.path.normpath filepath:
            Filepath where to store the data. Either .hdf, .csv, .parquet
            or .parquet.COMPRESSION_NAME has to be the file-ending.
            Default is current filepath of class.
        :keyword str key:
            Necessary keyword-argument for saving a .hdf-file.
            Specifies the key of the table in the .hdf-file.
        :keyword str sep:
            Separator used for saving as .csv. Default is ','.
        :keyword str engine:
            Choose the engine for writing .parquet files. Default is 'pyarrow'
            Other option is 'fastparquet' (python>=3.9).
        :return:
        """
        # If new settings are needed, update existing ones
        self._loader_kwargs.update(kwargs)
        self.tsd.save(filepath, **kwargs)

    def to_df(self, force_single_index=False):
        """
        Return the dataframe version of the current TimeSeriesData object.
        If no variable has more than one tag, the tags are dropped.
        Else, the object is just converted.

        :param bool force_single_index:
            If True (not the default), an IndexError is raised when
            the tags can't be dropped because at least one variable
            contains multiple tags.
        :raises IndexError:
            If ``force_single_index`` is True and variables with
            multiple tags exist.
        """
        multi_tag_variables = self.get_variables_with_multiple_tags()
        if not multi_tag_variables:
            if self._is_old_multicolumn_format:
                return pd.DataFrame(self.droplevel(1, axis=1))
            return pd.DataFrame(self)
        if force_single_index:
            raise IndexError(
                "Can't automatically drop all tags "
                "as the following variables contain multiple tags: "
                f"{', '.join(multi_tag_variables)}. "
            )
        return pd.DataFrame(self)

    def get_variable_names(self, patterns: Union[str, List[str]] = None) -> List[str]:
        """
        Return an alphabetically sorted list of variable names, optionally filtered by patterns.

        By default, returns all variable names found in the first level of the DataFrame's
        column MultiIndex, sorted alphabetically. If `patterns` is provided, only names
        matching one or more of the given literal strings or glob-style patterns
        (where `*` matches any sequence of characters) will be returned.

        :param patterns:
            - A single string or list of strings.
            - Each entry may be an exact variable name, or a pattern containing `*` as a wildcard.
            - If None, all variable names are returned.
        :return:
            A list of matching variable names, in alphabetical order.
        :raises KeyError:
            If any literal name or pattern does not match at least one variable in the DataFrame.

        Example:
            # return all wall temperatures at any layer
            tsd.get_variable_names("*wall.layer[*].T")
            ["wall.layer[1].T", "wall.layer[2].T", "wall.layer[3].T"]
        """
        return self.tsd.get_variable_names(patterns)

    def get_variables_with_multiple_tags(self) -> List[str]:
        """
        Return an alphabetically sorted list of all variables
        that contain more than one tag.

        :return: List[str]
        """
        # A variable name occurring more than once in level 0 has several tags
        var_names = self.columns.get_level_values(0)
        return sorted(var_names[var_names.duplicated()])

    def get_tags(self, variable: str = None) -> List[str]:
        """
        Return an alphabetically sorted list of all tags

        :param str variable:
            If given, tags of this variable are returned

        :return: List[str]
        :raises KeyError:
            If the object does not use the multicolumn format.
        """
        if not self._is_old_multicolumn_format:
            raise KeyError("You can't get tags for a TimeSeriesData object created with use_multicolumn=False!")
        if variable is not None:
            tags = self.loc[:, variable].columns
            return sorted(tags)
        return sorted(self.columns.get_level_values(1).unique())

    @property
    def _is_old_multicolumn_format(self):
        """
        Helper function to check if the old multicolumn format is used.
        """
        return isinstance(self.columns, pd.MultiIndex)

    def get_columns_by_tag(self,
                           tag: str,
                           variables: list = None,
                           return_type: str = 'pandas',
                           drop_level: bool = False):
        """
        Returning all columns with defined tag in the form of ndarray.

        :param str tag:
            Define the tag which return columns have to
            match.
        :param list variables:
            Besides the given tag, specify the
            variables names matching the return criteria as well.
        :param boolean drop_level:
            If True, the tag level is dropped from the response.
            Default is False.
        :param str return_type:
            Return format. Options are:
            - pandas (pd.series)
            - numpy, scipy, sp, and np (np.array)
            - control (transposed np.array)
        :return: ndarray of input signals
        :raises KeyError:
            If the object does not use the multicolumn format.
        :raises TypeError:
            If ``return_type`` is unknown.
        """
        if not self._is_old_multicolumn_format:
            raise KeyError("You can't get tags for a TimeSeriesData object created with use_multicolumn=False!")

        # Extract columns
        if variables:
            _ret = self.loc[:, variables]
        else:
            _ret = self

        _ret = _ret.xs(tag, axis=1, level=1, drop_level=drop_level)

        # Return based on the given return_type
        requested_type = return_type.lower()
        if requested_type == 'pandas':
            return _ret
        if requested_type in ['numpy', 'scipy', 'sp', 'np']:
            return _ret.to_numpy()
        if requested_type == 'control':
            return _ret.to_numpy().transpose()
        raise TypeError("Unknown return type")

    def to_datetime_index(self, unit_of_index="s", origin=None, inplace: bool = True):
        """
        Convert the current index to a datetime index using
        ebcpy.preprocessing.convert_index_to_datetime_index()

        :param str unit_of_index: default 's'
            The unit of the given index. Used to convert to
            total_seconds later on.
        :param datetime.datetime origin:
            The reference datetime object for the first index.
            Default (None) is the system time at the moment of the call.
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: df
            Copy of DataFrame with correct index for usage in this
            framework.

        """
        # Resolve the default here at call time. A default of datetime.now()
        # in the signature would be evaluated only once at import time.
        if origin is None:
            origin = datetime.now()
        return self.tsd.to_datetime_index(unit_of_index, origin, inplace)

    def to_float_index(self, offset=0, inplace: bool = True):
        """
        Convert the current index to a float based index using
        ebcpy.preprocessing.convert_datetime_index_to_float_index()

        :param float offset:
            Offset in seconds
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: pd.DataFrame df:
            DataFrame with correct index.
        """
        return self.tsd.to_float_index(offset, inplace)

    def clean_and_space_equally(self, desired_freq, inplace: bool = False):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.clean_and_space_equally_time_series()
        See the docstring of this function to know what is happening.

        :param str desired_freq:
            Frequency to determine number of elements in processed dataframe.
            Options are for example:
            - s: second-based
            - 5s: Every 5 seconds
            - 6min: Every 6 minutes
            This also works for h, d, m, y, ms etc.
        :param bool inplace:
            Deprecated. Always use the return value!
        :return: pd.DataFrame
            Cleaned and equally spaced data-frame
        """
        return self.tsd.clean_and_space_equally(desired_freq, inplace)

    def low_pass_filter(self, crit_freq, filter_order, variable,
                        tag=None, new_tag="low_pass_filter"):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.low_pass_filter()
        See the docstring of this function to know what is happening.
        If the old multicolumn format is used, the result is stored in the
        multicolumn header with the `new_tag`.

        :param float crit_freq:
            The critical frequency or frequencies.
        :param int filter_order:
            The order of the filter
        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :param str new_tag:
            The new tag to pass to the variable.
            Default is 'low_pass_filter'
        :return:
            None if the old multicolumn format is used (the result is
            stored in the object), else the filtered data.
        """
        column_key = self._possibly_get_variable_and_tag(variable, tag)
        result = self.tsd.low_pass_filter(crit_freq, filter_order, column_key)
        if self._is_old_multicolumn_format:
            self.loc[:, (variable, new_tag)] = result
            return None
        return result

    def moving_average(self, window, variable,
                       tag=None, new_tag="moving_average"):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.moving_average()
        See the docstring of this function to know what is happening.
        If the old multicolumn format is used, the result is stored in the
        multicolumn header with the `new_tag`.

        :param int window:
            sample rate of input
        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :param str new_tag:
            The new tag to pass to the variable.
            Default is 'moving_average'
        :return:
            None if the old multicolumn format is used (the result is
            stored in the object), else the moving average result.
        """
        column_key = self._possibly_get_variable_and_tag(variable, tag)
        result = self.tsd.moving_average(window, column_key)
        if self._is_old_multicolumn_format:
            self.loc[:, (variable, new_tag)] = result
            return None
        return result

    def _possibly_get_variable_and_tag(self, variable: str, tag: str = None):
        """
        Helper function to get the column key based on variable and possible tag name,
        depending on whether multicolumn is used or not.

        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :return:
            The tuple (variable, tag) if a tag is given and the old
            multicolumn format is used, else only the variable.
        """
        if tag is not None and self._is_old_multicolumn_format:
            return (variable, tag)
        return variable

    def number_lines_totally_na(self):
        """
        Returns the number of rows in the given dataframe
        that are filled with NaN-values.
        """
        return self.tsd.number_lines_totally_na()

    @property
    def frequency(self):
        """
        The frequency of the time series data.
        Returns the mean and the standard deviation of
        the index.

        :returns:
            float: Mean value
            float: Standard deviation
        """
        return self.tsd.frequency
class TimeSeries(pd.Series):
    """
    Series counterpart of TimeSeriesData.

    Overwrites pd.Series so that slicing a TimeSeriesData object
    and expanding the slice again keeps the custom classes.

    .. versionadded:: 0.1.7
    """

    @property
    def _constructor(self):
        """
        Class pandas uses to build new one-dimensional results, see:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas
        """
        return TimeSeries

    @property
    def _constructor_expanddim(self):
        """
        Class pandas uses to build two-dimensional results from this series, see:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas
        """
        return TimeSeriesData
def get_keys_of_hdf_file(filepath):
    """
    Find all keys in a given hdf-file.

    :param str,os.path.normpath filepath:
        Path to the .hdf-file
    :return: list
        List with all keys in the given file. If h5py is not
        installed, a list with a single error message is returned.
    """
    # pylint: disable=import-outside-toplevel
    # Only guard the optional import, so that errors raised while
    # reading the file are not mistaken for a missing h5py installation.
    try:
        import h5py
    except ImportError:
        return ["ERROR: Could not obtain keys as h5py is not installed"]
    with h5py.File(filepath, 'r') as hdf_file:
        return list(hdf_file.keys())
def load_time_series_data(data: Union[str, Any], **kwargs) -> pd.DataFrame:
    """
    Create a pandas DataFrame from a file or from in-memory data.
    The time series functionality is available on the result
    via the .tsd accessor.

    :param str,os.path.normpath,pd.DataFrame data:
        Filepath ending with either .hdf, .mat, .csv, .parquet,
        or .parquet.COMPRESSION_NAME containing
        time-dependent data to be loaded as a pandas.DataFrame.
        Alternatively, a DataFrame (which is copied) or any other
        input accepted by the pd.DataFrame constructor.
    :keyword str key:
        Name of the table in a .hdf-file if the file
        contains multiple tables.
    :keyword str sep:
        separator for the use of a csv file. If none is provided,
        a comma (",") is used as a default value.
        See pandas.read_csv() docs for further information.
    :keyword int, list header:
        Header columns for .csv files.
        See pandas.read_csv() docs for further information.
        Default is first row (0).
    :keyword int,str index_col:
        Column to be used as index in .csv files.
        See pandas.read_csv() docs for further information.
        Default is first column (0).
    :keyword str sheet_name:
        Name of the sheet you want to load data from. Required keyword
        argument when loading a xlsx-file.
    :keyword str engine:
        Chose the engine for reading .parquet files. Default is 'pyarrow'
        Other option is 'fastparquet' (python>=3.9).
    :keyword list variable_names:
        List of variable names to load from .mat file. If you
        know which variables you want to plot, this may speed up
        loading significantly, and reduce memory size drastically.
        You can also supply wildcard patterns (e.g. "*wall.layer[*].T", etc.)
        to match multiple variables at once.
    :return: pd.DataFrame
        DataFrame with custom .tsd accessor containing time series functionality

    Examples:

    Create a DataFrame with random data:

    >>> import numpy as np
    >>> from ebcpy import load_time_series_data
    >>> df = load_time_series_data({"my_variable": np.random.rand(5)})
    >>> df.tsd.to_datetime_index()
    >>> df.tsd.save("my_new_data.csv")

    Now, let's load the recently created file:

    >>> df = load_time_series_data("my_new_data.csv")
    """
    # An existing DataFrame is returned as a copy
    if isinstance(data, pd.DataFrame):
        return data.copy()

    # Strings and paths are treated as files; the source path is
    # stored in the accessor of the loaded frame
    if isinstance(data, (str, Path)):
        source = Path(data)
        loaded = _load_df_from_file(file=source, **kwargs)
        loaded.tsd.filepath = source
        return loaded

    # Everything else is handed to the DataFrame constructor together
    # with the constructor-related keyword arguments
    frame_options = {
        option: kwargs.get(option, None)
        for option in ("index", "columns", "dtype")
    }
    return pd.DataFrame(data=data, copy=kwargs.get("copy", False), **frame_options)
def _load_df_from_file(file, **kwargs):
    """
    Function to load a given filepath into a dataframe

    :param Path file: File path to load
    :param kwargs: Additional loading parameters.
        Evaluated are ``key`` (.hdf), ``sep``, ``index_col`` and
        ``header`` (.csv), ``variable_names`` (.mat),
        ``sheet_name`` (spreadsheets) and ``engine`` (.parquet).
    :return: pd.DataFrame
    :raises FileNotFoundError: If the file does not exist.
    :raises KeyError: If a required key or sheet_name is missing.
    :raises TypeError: If the file type is not supported.
    :raises IndexError: If the index is neither numeric nor
        convertible to a pd.DatetimeIndex.
    """
    # Accept plain strings as well, the suffix checks below need a Path
    file = Path(file)
    # Check whether the file exists
    if not file.is_file():
        raise FileNotFoundError(
            f"The given filepath {file} could not be opened")

    # Open based on file suffix.
    if file.suffix == ".hdf":
        # Load the current file as a hdf to a dataframe.
        # As specifying the key can be a problem, the user will
        # get all keys of the file if one is necessary but not provided.
        key = kwargs.get("key")
        if key == "":
            key = None  # Avoid cryptic error in pandas by converting empty string to None
        try:
            df = pd.read_hdf(file, key=key)
        except (ValueError, KeyError) as error:
            keys = ", ".join(get_keys_of_hdf_file(file))
            raise KeyError(f"key must be provided when HDF5 file contains multiple datasets. "
                           f"Here are all keys in the given hdf-file: {keys}") from error
    elif file.suffix == ".csv":
        # Check if file was previously a TimeSeriesData object.
        # Only the first two lines are inspected. The encoding is fixed so
        # the check does not depend on the platform default; undecodable
        # bytes are replaced as only the line starts are of interest.
        with open(file, "r", encoding="utf-8", errors="replace") as _f:
            first_line = _f.readline()
            second_line = _f.readline()
        # Backwards compatible assumption: Users never changed '_multi_col_names'
        if first_line.startswith("Variables") and second_line.startswith("Tags"):
            _hea_def = [0, 1]
        else:
            _hea_def = 0

        df = pd.read_csv(
            file,
            sep=kwargs.get("sep", ","),
            index_col=kwargs.get("index_col", 0),
            header=kwargs.get("header", _hea_def)
        )
    elif file.suffix == ".mat":
        df = sr.mat_to_pandas(
            fname=file,
            with_unit=False,
            names=kwargs.get("variable_names")
        )
    elif file.suffix in ['.xlsx', '.xls', '.odf', '.ods', '.odt']:
        sheet_name = kwargs.get("sheet_name")
        if sheet_name is None:
            raise KeyError("sheet_name is a required keyword argument to load xlsx-files. "
                           "Please pass a string to specify the name "
                           "of the sheet you want to load.")
        df = pd.read_excel(io=file, sheet_name=sheet_name)
    elif ".parquet" in file.name:
        df = pd.read_parquet(path=file, engine=kwargs.get('engine', 'pyarrow'))
    else:
        raise TypeError("Only .hdf, .csv, .mat, .parquet and spreadsheet files "
                        "(.xlsx, .xls, .odf, .ods, .odt) are supported!")

    # Indexes which are neither datetime based nor numeric are
    # converted to a DatetimeIndex, if possible.
    if not isinstance(df.index, tuple(datetime_indexes)) and not index_is_numeric(df.index):
        try:
            df.index = pd.DatetimeIndex(df.index)
        except Exception as err:
            supported = ', '.join(str(idx) for idx in [pd.RangeIndex] + datetime_indexes)
            raise IndexError(
                f"Given data has index of type {type(df.index)}. "
                f"Currently only numeric indexes and the following are supported: "
                f"{supported}. "
                f"Automatic conversion to pd.DateTimeIndex failed, "
                f"see error above."
            ) from err
    return df