Coverage for ebcpy/data_types.py: 96%
249 statements
« prev ^ index » next coverage.py v7.4.4, created at 2026-04-20 13:20 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2026-04-20 13:20 +0000
1"""
2This module provides useful classes for all ebcpy.
3Every data_type class should include every parameter
4other classes like optimization etc. may need. The checking
5of correct input is especially relevant here as the correct
6format of data-types will prevent errors during simulations,
7optimization etc.
8"""
10import os
11import warnings
12from pathlib import Path
13from typing import List, Union, Any, TYPE_CHECKING
14from datetime import datetime
15from pandas.core.internals import BlockManager
16import pandas as pd
17import numpy as np
18import ebcpy.modelica.simres as sr
20from ebcpy.utils import get_names
21from ebcpy import preprocessing
23# pylint: disable=I1101
24# pylint: disable=too-many-ancestors
# Public names exported by this module.
__all__ = [
    'TimeSeries',
    'TimeSeriesData',
    'numeric_index_dtypes',
    'load_time_series_data',
    'index_is_numeric',
    'datetime_indexes',
]

# All numeric dtypes a pandas Index may carry. Built through pd.Index so the
# dtype objects match exactly what pandas produces for each dtype name.
numeric_index_dtypes = [
    pd.Index([], dtype=dtype_name).dtype
    for dtype_name in ("int8", "int16", "int32", "int64",
                       "uint8", "uint16", "uint32", "uint64",
                       "float32", "float64")
]

# Index classes treated as datetime-based indexes.
datetime_indexes = [pd.DatetimeIndex]
def index_is_numeric(index: pd.Index) -> bool:
    """Return True if the given pandas Index is numeric (RangeIndex or numeric dtype)."""
    if isinstance(index, pd.RangeIndex):
        return True
    return index.dtype in numeric_index_dtypes
@pd.api.extensions.register_dataframe_accessor("tsd")
class TimeSeriesAccessor:
    """
    Pandas DataFrame accessor for time series functionality.
    Access using df.tsd.*
    """

    def __init__(self, pandas_obj):
        # The DataFrame this accessor is attached to
        self._obj = pandas_obj
        # Filepath the data was loaded from / should be saved to
        self._filepath = None

    @property
    def filepath(self):
        """Get the filepath associated with the time series data"""
        return self._filepath

    @filepath.setter
    def filepath(self, filepath):
        """Set the filepath associated with the time series data"""
        # Store as Path; falsy values (None, "") clear the stored filepath
        self._filepath = Path(filepath) if filepath else None

    def save(self, filepath: str = None, **kwargs) -> None:
        """
        Save the current time-series-data into the given file-format.
        Currently supported are .hdf, which is an easy and fast storage,
        and, .csv is supported as an easy-readable option.
        Also, .parquet, and with additional compression .parquet.COMPRESSION_NAME
        are supported.

        :param str,os.path.normpath filepath:
            Filepath were to store the data. Either .hdf, .csv, .parquet
            or .parquet.COMPRESSION_NAME has to be the file-ending.
            Default is current filepath of class.
        :keyword str key:
            Necessary keyword-argument for saving a .hdf-file.
            Specifies the key of the table in the .hdf-file.
        :keyword str sep:
            Separator used for saving as .csv. Default is ','.
        :keyword str engine:
            Chose the engine for reading .parquet files. Default is 'pyarrow'
            Other option is 'fastparquet' (python>=3.9).
        :raises FileNotFoundError:
            If neither a filepath argument nor a stored filepath exists.
        :raises KeyError:
            If a .hdf file is saved without the 'key' keyword.
        :raises TypeError:
            If the file suffix is not supported.
        """
        # Fall back to the filepath stored on the accessor if none is given
        if filepath is None:
            if self.filepath is None:
                raise FileNotFoundError(
                    "TimeSeriesData has neither a filepath stored in tsd "
                    "accessor nor did you provide a filepath were to store the data."
                )
            filepath = self.filepath
        else:
            filepath = Path(filepath)

        # Save based on file suffix
        if filepath.suffix == ".hdf":
            if "key" not in kwargs:
                raise KeyError("Argument 'key' must be specified to save a .hdf file")
            self._obj.to_hdf(filepath, key=kwargs.get("key"))
        elif filepath.suffix == ".csv":
            self._obj.to_csv(filepath, sep=kwargs.get("sep", ","))
        elif ".parquet" in filepath.name:
            # Everything after ".parquet" in the filename is the compression
            # name, e.g. "data.parquet.gzip" -> compression="gzip"
            parquet_split = filepath.name.split(".parquet")
            self._obj.to_parquet(
                filepath, engine=kwargs.get('engine', 'pyarrow'),
                compression=parquet_split[-1][1:] if parquet_split[-1] else None,
                index=True
            )
        else:
            raise TypeError("Given file-format is not supported."
                            "You can only store time series data as .hdf, .csv, .parquet, "
                            "and .parquet.COMPRESSION_NAME with additional compression options")

    def to_datetime_index(self, unit_of_index="s", origin=None, inplace=True):
        """
        Convert the current index to a datetime index using
        ebcpy.preprocessing.convert_index_to_datetime_index()

        :param str unit_of_index: default 's'
            The unit of the given index. Used to convert to
            total_seconds later on.
        :param datetime.datetime origin:
            The reference datetime object for the first index.
            Default (None) resolves to the current system time at call time.
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: df
            Copy of DataFrame with correct index for usage in this
            framework.
        """
        # Resolve the default lazily: a `datetime.now()` default argument is
        # evaluated only once at import time and would be stale on later calls.
        if origin is None:
            origin = datetime.now()
        return preprocessing.convert_index_to_datetime_index(
            df=self._obj,
            unit_of_index=unit_of_index,
            origin=origin,
            inplace=inplace
        )

    def to_float_index(self, offset=0, inplace=True):
        """
        Convert the current index to a float based index using
        ebcpy.preprocessing.convert_datetime_index_to_float_index()

        :param float offset:
            Offset in seconds
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: pd.DataFrame df:
            DataFrame with correct index.
        """
        # Nothing to convert if the index is not datetime-based
        if not isinstance(self._obj.index, pd.DatetimeIndex):
            if inplace:
                return None
            return self._obj

        return preprocessing.convert_datetime_index_to_float_index(
            df=self._obj,
            offset=offset,
            inplace=inplace
        )

    def clean_and_space_equally(self, desired_freq, inplace=False):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.clean_and_space_equally_time_series()
        See the docstring of this function to know what is happening.

        :param str desired_freq:
            Frequency to determine number of elements in processed dataframe.
            Options are for example:
            - s: second-based
            - 5s: Every 5 seconds
            - 6min: Every 6 minutes
            This also works for h, d, m, y, ms etc.
        :param bool inplace:
            Is deprecated; always use the return value!
        :return: pd.DataFrame
            Cleaned and equally spaced data-frame
        """
        df = preprocessing.clean_and_space_equally_time_series(
            df=self._obj,
            desired_freq=desired_freq
        )
        if inplace:
            # inplace cannot work here: the preprocessing returns a new frame
            # and accessors must not rebind the DataFrame they are attached to.
            warnings.warn(
                "inplace=True on clean_and_space_equally has no effect when called "
                "via the .tsd accessor. Use the return value instead: "
                "df = df.tsd.clean_and_space_equally(freq, inplace=False)",
                FutureWarning,
                stacklevel=2,
            )
        return df

    def low_pass_filter(self, crit_freq, filter_order, variable):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.low_pass_filter()
        See the docstring of this function to know what is happening.

        :param float crit_freq:
            The critical frequency or frequencies.
        :param int filter_order:
            The order of the filter
        :param str variable:
            The variable name to apply the filter to
        :return: numpy.ndarray
            Filtered data
        """
        return preprocessing.low_pass_filter(
            data=self._obj[variable].to_numpy(),
            filter_order=filter_order,
            crit_freq=crit_freq
        )

    def moving_average(self, window, variable):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.moving_average()
        See the docstring of this function to know what is happening.

        :param int window:
            sample rate of input
        :param str variable:
            The variable name to apply the filter to
        :return: numpy.ndarray
            Moving average result
        """
        return preprocessing.moving_average(
            data=self._obj[variable].to_numpy(),
            window=window,
        )

    def get_variable_names(self, patterns: Union[str, List[str]] = None) -> List[str]:
        """
        Return an alphabetically sorted list of variable names, optionally filtered by patterns.

        By default, returns all column names found in the DataFrame, sorted alphabetically.
        If `patterns` is provided, only names matching one or more of the given
        literal strings or glob-style patterns (where `*` matches any sequence of characters)
        will be returned.

        :param patterns:
            - A single string or list of strings.
            - Each entry may be an exact variable name, or a pattern containing `*` as a wildcard.
            - If None, all variable names are returned.
        :return:
            A list of matching variable names, in alphabetical order.
        :raises KeyError:
            If any literal name or pattern does not match at least one variable in the DataFrame.

        Example:
            # return all wall temperatures at any layer
            df.tsd.get_variable_names("*wall.layer[*].T")
            ["wall.layer[1].T", "wall.layer[2].T", "wall.layer[3].T"]
        """
        all_names = sorted(self._obj.columns.get_level_values(0).unique())
        if patterns is None:
            return all_names
        return get_names(all_names, patterns)

    def number_lines_totally_na(self):
        """
        Returns the number of rows in the given dataframe
        that are filled with NaN-values.
        """
        return preprocessing.number_lines_totally_na(self._obj)

    @property
    def frequency(self):
        """
        The frequency of the time series data.
        Returns the mean and the standard deviation of
        the index.

        :returns:
            float: Mean value
            float: Standard deviation
        """
        return preprocessing.get_df_index_frequency_mean_and_std(
            df_index=self._obj.index
        )
class TimeSeriesData(pd.DataFrame):
    """
    Most data related to energy and building
    climate related problems is time-variant.

    Class for handling time series data using a pandas dataframe.
    This class works file-based and makes the import of different
    file-types into a pandas DataFrame more user-friendly.
    Furthermore, functions to support multi-indexing are provided to
    efficiently handle variable passed processing and provide easy
    visualization and preprocessing access.

    :param str,os.path.normpath,pd.DataFrame data:
        Filepath ending with either .hdf, .mat, .csv, .parquet,
        or .parquet.COMPRESSION_NAME containing
        time-dependent data to be loaded as a pandas.DataFrame.
        Alternative option is to pass a DataFrame directly.
    :keyword str key:
        Name of the table in a .hdf-file if the file
        contains multiple tables.
    :keyword str sep:
        separator for the use of a csv file. If none is provided,
        a comma (",") is used as a default value.
        See pandas.read_csv() docs for further information.
    :keyword int, list header:
        Header columns for .csv files.
        See pandas.read_csv() docs for further information.
        Default is first row (0).
    :keyword int,str index_col:
        Column to be used as index in .csv files.
        See pandas.read_csv() docs for further information.
        Default is first column (0).
    :keyword str sheet_name:
        Name of the sheet you want to load data from. Required keyword
        argument when loading a xlsx-file.
    :keyword str default_tag:
        Which value to use as tag. Default is 'raw'
    :keyword str engine:
        Chose the engine for reading .parquet files. Default is 'pyarrow'
        Other option is 'fastparquet' (python>=3.9).
    :keyword list variable_names:
        List of variable names to load from .mat file. If you
        know which variables you want to plot, this may speed up
        loading significantly, and reduce memory size drastically.
        You can also supply wildcard patterns (e.g. "*wall.layer[*].T", etc.)
        to match multiple variables at once.

    Examples:

    First let's see the usage for a common dataframe.

    >>> import numpy as np
    >>> from ebcpy import TimeSeriesData
    >>> tsd = TimeSeriesData({"my_variable": np.random.rand(5)})
    >>> tsd.to_datetime_index()
    >>> tsd.save("my_new_data.csv")

    Now, let's load the recently created file.

    >>> tsd = TimeSeriesData("my_new_data.csv")
    """

    # Attributes preserved across pandas operations that return new objects
    _metadata = [
        "_filepath",
        "_loader_kwargs",
        "_default_tag",
        "_multi_col_names"
    ]

    def __init__(self, data: Union[str, Any], use_multicolumn: bool = False, **kwargs):
        """Initialize class-objects and check correct input."""
        warnings.warn(
            "TimeSeriesData will be deprecated in the next major release. "
            "Instead, use 'load_time_series_data' to load files etc. as pd.DataFrame "
            "and use the 'tsd' accessor to access useful time-series-related functions "
            "as before with TimeSeriesData.", FutureWarning
        )
        if use_multicolumn:
            warnings.warn(
                "All multicolumn support will be removed in the next major release", FutureWarning
            )
        # Initialize as default
        self._filepath = None
        self._loader_kwargs = {}
        self._multi_col_names = ["Variables", "Tags"]
        self._default_tag = kwargs.pop("default_tag", "raw")
        if not isinstance(self._default_tag, str):
            raise TypeError(f"Invalid type for default_tag! Expected 'str' but "
                            f"received {type(self._default_tag)}")

        # Two possibles inputs. first argument is actually data provided by pandas
        # and kwargs hold further information or is it an actual filepath.
        if isinstance(data, BlockManager):
            # pandas internals construct new instances by passing a manager
            super().__init__(data=data)
            return

        if not isinstance(data, (str, Path)):
            _df_loaded = pd.DataFrame(data=data,
                                      index=kwargs.get("index", None),
                                      columns=kwargs.get("columns", None),
                                      dtype=kwargs.get("dtype", None),
                                      copy=kwargs.get("copy", False))
        else:
            file = Path(data)
            self._loader_kwargs = kwargs.copy()
            _df_loaded = _load_df_from_file(file=file, **self._loader_kwargs)
            _df_loaded.tsd.filepath = file
            self._filepath = file

        if _df_loaded.columns.nlevels == 1:
            # Check if first level is named Tags.
            # If so, don't create MultiIndex-DF as the method is called by the pd constructor
            if _df_loaded.columns.name != self._multi_col_names[1] and use_multicolumn:
                multi_col = pd.MultiIndex.from_product(
                    [_df_loaded.columns, [self._default_tag]],
                    names=self._multi_col_names
                )
                _df_loaded.columns = multi_col

        elif _df_loaded.columns.nlevels == 2:
            if _df_loaded.columns.names != self._multi_col_names and use_multicolumn:
                raise TypeError("Loaded dataframe has a different 2-Level "
                                "header format than it is supported by this "
                                "class. The names have to match.")
        else:
            raise TypeError("Only DataFrames with Multi-Columns with 2 "
                            "Levels are supported by this class.")

        super().__init__(_df_loaded)

    @property
    def _constructor(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeriesData

    @property
    def _constructor_sliced(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeries

    @property
    def filepath(self) -> Path:
        """Get the filepath associated with the time series data"""
        return self._filepath

    @filepath.setter
    def filepath(self, filepath: str):
        """Set the filepath associated with the time series data"""
        self._filepath = Path(filepath)
        # Keep the accessor's filepath in sync
        self.tsd.filepath = self._filepath

    @property
    def default_tag(self) -> str:
        """Get the default of time series data object"""
        return self._default_tag

    @default_tag.setter
    def default_tag(self, tag: str) -> None:
        """Set the default_tag of the time series data object
        :param tag: new tag
        :type tag: String
        """
        if not isinstance(tag, str):
            raise TypeError(f"Invalid type for default_tag! Expected 'str' but "
                            f"received {type(tag)}")
        if tag not in self.get_tags():
            raise KeyError(f"Tag '{tag}' does not exist for current data set!"
                           f"\n Available tags: {self.get_tags()}")
        self._default_tag = tag

    def save(self, filepath: str = None, **kwargs) -> None:
        """
        Save the current time-series-data into the given file-format.
        Currently supported are .hdf, which is an easy and fast storage,
        and, .csv is supported as an easy-readable option.
        Also, .parquet, and with additional compression .parquet.COMPRESSION_NAME
        are supported. Compressions could be gzip, brotli or snappy. For all possible
        compressions see the documentation of the parquet engines.
        For a small comparison of these data formats see https://github.com/RWTH-EBC/ebcpy/issues/81

        :param str,os.path.normpath filepath:
            Filepath were to store the data. Either .hdf, .csv, .parquet
            or .parquet.COMPRESSION_NAME has to be the file-ending.
            Default is current filepath of class.
        :keyword str key:
            Necessary keyword-argument for saving a .hdf-file.
            Specifies the key of the table in the .hdf-file.
        :keyword str sep:
            Separator used for saving as .csv. Default is ','.
        :keyword str engine:
            Chose the engine for reading .parquet files. Default is 'pyarrow'
            Other option is 'fastparquet' (python>=3.9).
        :return:
        """
        # If new settings are needed, update existing ones
        self._loader_kwargs.update(kwargs)
        self.tsd.save(filepath, **kwargs)

    def to_df(self, force_single_index=False):
        """
        Return the dataframe version of the current TimeSeriesData object.
        If all tags are equal, the tags are dropped.
        Else, the object is just converted.

        :param bool force_single_index:
            If True (not the default), an IndexError is raised when any
            variable carries multiple tags, since the tags could not be
            dropped without losing columns.
        """
        if len(self.get_variables_with_multiple_tags()) == 0:
            if self._is_old_multicolumn_format:
                # Drop the Tags level; each variable has exactly one tag
                return pd.DataFrame(self.droplevel(1, axis=1))
            return pd.DataFrame(self)
        if force_single_index:
            raise IndexError(
                "Can't automatically drop all tags "
                "as the following variables contain multiple tags: "
                f"{' ,'.join(self.get_variables_with_multiple_tags())}. "
            )
        return pd.DataFrame(self)

    def get_variable_names(self, patterns: Union[str, List[str]] = None) -> List[str]:
        """
        Return an alphabetically sorted list of variable names, optionally filtered by patterns.

        By default, returns all variable names found in the first level of the DataFrame's
        column MultiIndex, sorted alphabetically. If `patterns` is provided, only names
        matching one or more of the given literal strings or glob-style patterns
        (where `*` matches any sequence of characters) will be returned.

        :param patterns:
            - A single string or list of strings.
            - Each entry may be an exact variable name, or a pattern containing `*` as a wildcard.
            - If None, all variable names are returned.
        :return:
            A list of matching variable names, in alphabetical order.
        :raises KeyError:
            If any literal name or pattern does not match at least one variable in the DataFrame.

        Example:
            # return all wall temperatures at any layer
            tsd.get_variable_names("*wall.layer[*].T")
            ["wall.layer[1].T", "wall.layer[2].T", "wall.layer[3].T"]
        """
        return self.tsd.get_variable_names(patterns)

    def get_variables_with_multiple_tags(self) -> List[str]:
        """
        Return an alphabetically sorted list of all variables
        that contain more than one tag.

        :return: List[str]
        """
        var_names = self.columns.get_level_values(0)
        return sorted(var_names[var_names.duplicated()])

    def get_tags(self, variable: str = None) -> List[str]:
        """
        Return an alphabetically sorted list of all tags

        :param str variable:
            If given, tags of this variable are returned

        :return: List[str]
        :raises KeyError:
            If the object was not created with use_multicolumn=True.
        """
        if not self._is_old_multicolumn_format:
            raise KeyError("You can't get tags for a TimeSeriesData object created with use_multicolumn=False!")
        if variable:
            tags = self.loc[:, variable].columns
            return sorted(tags)
        return sorted(self.columns.get_level_values(1).unique())

    @property
    def _is_old_multicolumn_format(self):
        """
        Helper function to check if the old multicolumn format is used.
        """
        return isinstance(self.columns, pd.MultiIndex)

    def get_columns_by_tag(self,
                           tag: str,
                           variables: list = None,
                           return_type: str = 'pandas',
                           drop_level: bool = False):
        """
        Returning all columns with defined tag in the form of ndarray.

        :param str tag:
            Define the tag which return columns have to
            match.
        :param list variables:
            Besides the given tag, specify the
            variables names matching the return criteria as well.
        :param boolean drop_level:
            If tag should be included in the response.
            Default is False.
        :param str return_type:
            Return format. Options are:
            - pandas (pd.series)
            - numpy, scipy, sp, and np (np.array)
            - control (transposed np.array)
        :return: ndarray of input signals
        :raises KeyError:
            If the object was not created with use_multicolumn=True.
        :raises TypeError:
            If return_type is unknown.
        """
        if not self._is_old_multicolumn_format:
            raise KeyError("You can't get tags for a TimeSeriesData object created with use_multicolumn=False!")

        # Extract columns
        if variables:
            _ret = self.loc[:, variables]
        else:
            _ret = self

        _ret = _ret.xs(tag, axis=1, level=1, drop_level=drop_level)

        # Return based on the given return_type
        if return_type.lower() == 'pandas':
            return _ret
        if return_type.lower() in ['numpy', 'scipy', 'sp', 'np']:
            return _ret.to_numpy()
        if return_type.lower() == 'control':
            return _ret.to_numpy().transpose()
        raise TypeError("Unknown return type")

    def to_datetime_index(self, unit_of_index="s", origin=None, inplace: bool = True):
        """
        Convert the current index to a datetime index using
        ebcpy.preprocessing.convert_index_to_datetime_index()

        :param str unit_of_index: default 's'
            The unit of the given index. Used to convert to
            total_seconds later on.
        :param datetime.datetime origin:
            The reference datetime object for the first index.
            Default (None) resolves to the current system time at call time.
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: df
            Copy of DataFrame with correct index for usage in this
            framework.

        """
        # Resolve the default lazily: a `datetime.now()` default argument is
        # evaluated only once at import time and would be stale on later calls.
        if origin is None:
            origin = datetime.now()
        return self.tsd.to_datetime_index(unit_of_index, origin, inplace)

    def to_float_index(self, offset=0, inplace: bool = True):
        """
        Convert the current index to a float based index using
        ebcpy.preprocessing.convert_datetime_index_to_float_index()

        :param float offset:
            Offset in seconds
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: pd.DataFrame df:
            DataFrame with correct index.
        """
        return self.tsd.to_float_index(offset, inplace)

    def clean_and_space_equally(self, desired_freq, inplace: bool = False):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.clean_and_space_equally_time_series()
        See the docstring of this function to know what is happening.

        :param str desired_freq:
            Frequency to determine number of elements in processed dataframe.
            Options are for example:
            - s: second-based
            - 5s: Every 5 seconds
            - 6min: Every 6 minutes
            This also works for h, d, m, y, ms etc.
        :param bool inplace:
            Is deprecated; always use the return value!
        :return: pd.DataFrame
            Cleaned and equally spaced data-frame
        """
        return self.tsd.clean_and_space_equally(desired_freq, inplace)

    def low_pass_filter(self, crit_freq, filter_order, variable,
                        tag=None, new_tag="low_pass_filter"):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.low_pass_filter()
        See the docstring of this function to know what is happening.
        If the old multicolumn format is used, the result is stored in the
        multicolumn header with the `new_tag`.

        :param float crit_freq:
            The critical frequency or frequencies.
        :param int filter_order:
            The order of the filter
        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :param str new_tag:
            The new tag to pass to the variable.
            Default is 'low_pass_filter'
        """
        result = self.tsd.low_pass_filter(crit_freq, filter_order, self._possibly_get_variable_and_tag(variable, tag))
        if self._is_old_multicolumn_format:
            self.loc[:, (variable, new_tag)] = result
        else:
            return result

    def moving_average(self, window, variable,
                       tag=None, new_tag="moving_average"):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.moving_average()
        See the docstring of this function to know what is happening.
        If the old multicolumn format is used, the result is stored in the
        multicolumn header with the `new_tag`.

        :param int window:
            sample rate of input
        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :param str new_tag:
            The new tag to pass to the variable.
            Default is 'moving_average'
        """
        result = self.tsd.moving_average(window, self._possibly_get_variable_and_tag(variable, tag))
        if self._is_old_multicolumn_format:
            self.loc[:, (variable, new_tag)] = result
        else:
            return result

    def _possibly_get_variable_and_tag(self, variable: str, tag: str = None):
        """
        Helper function to get numpy array based on variable and possible tag name,
        depending on whether multicolumn is used or not.

        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one

        """
        if tag is None:
            return variable
        if self._is_old_multicolumn_format:
            return (variable, tag)
        # Tags are meaningless without the multicolumn format
        return variable

    def number_lines_totally_na(self):
        """
        Returns the number of rows in the given dataframe
        that are filled with NaN-values.
        """
        return self.tsd.number_lines_totally_na()

    @property
    def frequency(self):
        """
        The frequency of the time series data.
        Returns the mean and the standard deviation of
        the index.

        :returns:
            float: Mean value
            float: Standard deviation
        """
        return self.tsd.frequency
class TimeSeries(pd.Series):
    """Series subclass so that slicing a TimeSeriesData object
    and expanding a slice back to a frame keep the custom types.

    .. versionadded:: 0.1.7
    """

    @property
    def _constructor_expanddim(self):
        """Constructor override per the pandas subclassing guide:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeriesData

    @property
    def _constructor(self):
        """Constructor override per the pandas subclassing guide:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeries
def get_keys_of_hdf_file(filepath):
    """
    Find all keys in a given hdf-file.

    :param str,os.path.normpath filepath:
        Path to the .hdf-file
    :return: list
        List with all keys in the given file.
    """
    # pylint: disable=import-outside-toplevel
    try:
        import h5py
    except ImportError:
        return ["ERROR: Could not obtain keys as h5py is not installed"]
    with h5py.File(filepath, 'r') as hdf_file:
        return list(hdf_file.keys())
def load_time_series_data(data: Union[str, Any], **kwargs) -> pd.DataFrame:
    """
    Load time series data from various sources into a pandas DataFrame with
    custom time series accessor methods available via .tsd property.

    :param str,os.path.normpath,pd.DataFrame data:
        Filepath ending with either .hdf, .mat, .csv, .parquet,
        or .parquet.COMPRESSION_NAME containing
        time-dependent data to be loaded as a pandas.DataFrame.
        Alternative option is to pass a DataFrame directly.
    :keyword str key:
        Name of the table in a .hdf-file if the file
        contains multiple tables.
    :keyword str sep:
        separator for the use of a csv file. If none is provided,
        a comma (",") is used as a default value.
        See pandas.read_csv() docs for further information.
    :keyword int, list header:
        Header columns for .csv files.
        See pandas.read_csv() docs for further information.
        Default is first row (0).
    :keyword int,str index_col:
        Column to be used as index in .csv files.
        See pandas.read_csv() docs for further information.
        Default is first column (0).
    :keyword str sheet_name:
        Name of the sheet you want to load data from. Required keyword
        argument when loading a xlsx-file.
    :keyword str engine:
        Chose the engine for reading .parquet files. Default is 'pyarrow'
        Other option is 'fastparquet' (python>=3.9).
    :keyword list variable_names:
        List of variable names to load from .mat file. If you
        know which variables you want to plot, this may speed up
        loading significantly, and reduce memory size drastically.
        You can also supply wildcard patterns (e.g. "*wall.layer[*].T", etc.)
        to match multiple variables at once.
    :return: pd.DataFrame
        DataFrame with custom .tsd accessor containing time series functionality

    Examples:

    Create a DataFrame with random data:

    >>> import numpy as np
    >>> from ebcpy import load_time_series_data
    >>> df = load_time_series_data({"my_variable": np.random.rand(5)})
    >>> df.tsd.to_datetime_index()
    >>> df.tsd.save("my_new_data.csv")

    Now, let's load the recently created file:

    >>> df = load_time_series_data("my_new_data.csv")
    """
    # A DataFrame is simply copied so the caller's object stays untouched
    if isinstance(data, pd.DataFrame):
        return data.copy()

    # A string or Path is treated as a file to load from disk
    if isinstance(data, (str, Path)):
        path = Path(data)
        loaded = _load_df_from_file(file=path, **kwargs)
        # Remember the origin so df.tsd.save() can default to it
        loaded.tsd.filepath = path
        return loaded

    # Anything else is handed to the plain DataFrame constructor
    return pd.DataFrame(data=data,
                        index=kwargs.get("index", None),
                        columns=kwargs.get("columns", None),
                        dtype=kwargs.get("dtype", None),
                        copy=kwargs.get("copy", False))
def _load_df_from_file(file, **kwargs):
    """
    Function to load a given filepath into a dataframe

    :param Path file: File path to load
    :param kwargs: Additional loading parameters
    :return: pd.DataFrame
    :raises FileNotFoundError: If the file does not exist.
    :raises TypeError: If the file suffix is not supported.
    :raises IndexError: If the loaded index is neither numeric nor
        convertible to a pd.DatetimeIndex.
    """
    # Check whether the file exists
    if not os.path.isfile(file):
        raise FileNotFoundError(
            f"The given filepath {file} could not be opened")

    # Open based on file suffix.
    # Currently, hdf, csv, excel, parquet, and Modelica result files (mat) are supported.
    if file.suffix == ".hdf":
        # Load the current file as a hdf to a dataframe.
        # As specifying the key can be a problem, the user will
        # get all keys of the file if one is necessary but not provided.
        key = kwargs.get("key")
        if key == "":
            key = None  # Avoid cryptic error in pandas by converting empty string to None
        try:
            df = pd.read_hdf(file, key=key)
        except (ValueError, KeyError) as error:
            keys = ", ".join(get_keys_of_hdf_file(file))
            raise KeyError(f"key must be provided when HDF5 file contains multiple datasets. "
                           f"Here are all keys in the given hdf-file: {keys}") from error
    elif file.suffix == ".csv":
        # Check if file was previously a TimeSeriesData object
        with open(file, "r") as _f:
            lines = [_f.readline() for _ in range(2)]
        # Backwards compatible assumption: Users never changed '_multi_col_names'
        if (lines[0].startswith("Variables") and
                lines[1].startswith("Tags")):
            _hea_def = [0, 1]
        else:
            _hea_def = 0

        df = pd.read_csv(
            file,
            sep=kwargs.get("sep", ","),
            index_col=kwargs.get("index_col", 0),
            header=kwargs.get("header", _hea_def)
        )
    elif file.suffix == ".mat":
        df = sr.mat_to_pandas(
            fname=file,
            with_unit=False,
            names=kwargs.get("variable_names")
        )
    elif file.suffix in ['.xlsx', '.xls', '.odf', '.ods', '.odt']:
        sheet_name = kwargs.get("sheet_name")
        if sheet_name is None:
            raise KeyError("sheet_name is a required keyword argument to load xlsx-files. "
                           "Please pass a string to specify the name "
                           "of the sheet you want to load.")
        df = pd.read_excel(io=file, sheet_name=sheet_name)
    elif ".parquet" in file.name:
        df = pd.read_parquet(path=file, engine=kwargs.get('engine', 'pyarrow'))
    else:
        raise TypeError("Only .hdf, .csv, .xlsx, .mat, .parquet, "
                        "and .parquet.COMPRESSION_NAME are supported!")
    # Ensure a supported index: numeric or datetime. Anything else
    # (e.g. string timestamps) gets one conversion attempt to DatetimeIndex.
    if not isinstance(df.index, tuple(datetime_indexes)) and not index_is_numeric(df.index):
        try:
            df.index = pd.DatetimeIndex(df.index)
        except Exception as err:
            raise IndexError(
                f"Given data has index of type {type(df.index)}. "
                f"Currently only numeric indexes and the following are supported:"
                f"{' ,'.join([str(idx) for idx in [pd.RangeIndex] + datetime_indexes])} "
                f"Automatic conversion to pd.DateTimeIndex failed, "
                f"see error above."
            ) from err
    return df