Coverage for ebcpy/data_types.py: 96%

253 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2026-05-29 13:01 +0000

1""" 

2This module provides useful classes for all ebcpy. 

3Every data_type class should include every parameter 

4other classes like optimization etc. may need. The checking 

5of correct input is especially relevant here as the correct 

6format of data-types will prevent errors during simulations, 

7optimization etc. 

8""" 

9 

10import os 

11import warnings 

12from pathlib import Path 

13from typing import List, Union, Any, TYPE_CHECKING 

14from datetime import datetime 

15from pandas.core.internals import BlockManager 

16import pandas as pd 

17import numpy as np 

18import ebcpy.modelica.simres as sr 

19 

20from ebcpy.utils import get_names 

21from ebcpy import preprocessing 

22 

23# pylint: disable=I1101 

24# pylint: disable=too-many-ancestors 

25 

# Public names of this module (what ``from ... import *`` exposes).
__all__ = [
    "TimeSeries",
    "TimeSeriesData",
    "numeric_index_dtypes",
    "load_time_series_data",
    "index_is_numeric",
    "datetime_indexes",
]

32 

# Dtypes an (empty) pandas Index reports for each numeric dtype name.
# Used to decide whether an index counts as numeric.
numeric_index_dtypes = [
    pd.Index([], dtype=dtype_name).dtype
    for dtype_name in (
        "int8", "int16", "int32", "int64",
        "uint8", "uint16", "uint32", "uint64",
        "float32", "float64",
    )
]

# Index classes that are accepted as datetime-based indexes.
datetime_indexes = [pd.DatetimeIndex]

43 

44 

def index_is_numeric(index: pd.Index):
    """
    Check if a pandas Index is numeric.

    An index counts as numeric if it is a ``pd.RangeIndex`` or if its
    dtype is one of the entries of ``numeric_index_dtypes``.

    :param pd.Index index:
        The index to check.
    :return: bool
        True if the index is numeric.
    """
    if isinstance(index, pd.RangeIndex):
        return True
    return index.dtype in numeric_index_dtypes

48 

49 

@pd.api.extensions.register_dataframe_accessor("tsd")
class TimeSeriesAccessor:
    """
    Pandas DataFrame accessor for time series functionality.
    Access using df.tsd.*

    The accessor wraps the DataFrame it was created for and stores an
    optional filepath which is used as the default target of ``save``.
    """

    def __init__(self, pandas_obj):
        self._obj = pandas_obj
        self._filepath = None

    @property
    def filepath(self):
        """Get the filepath associated with the time series data"""
        return self._filepath

    @filepath.setter
    def filepath(self, filepath):
        """Set the filepath associated with the time series data"""
        # Falsy values (None, "") reset the stored path
        self._filepath = Path(filepath) if filepath else None

    def save(self, filepath: str = None, **kwargs) -> None:
        """
        Save the current time-series-data into the given file-format.
        Currently supported are .hdf, which is an easy and fast storage,
        and, .csv is supported as an easy-readable option.
        Also, .parquet, and with additional compression .parquet.COMPRESSION_NAME
        are supported.

        :param str,os.path.normpath filepath:
            Filepath where to store the data. Either .hdf, .csv, .parquet
            or .parquet.COMPRESSION_NAME has to be the file-ending.
            Default is the filepath stored in this accessor.
        :keyword str key:
            Necessary keyword-argument for saving a .hdf-file.
            Specifies the key of the table in the .hdf-file.
        :keyword str sep:
            Separator used for saving as .csv. Default is ','.
        :keyword str engine:
            Choose the engine for writing .parquet files. Default is 'pyarrow'
            Other option is 'fastparquet' (python>=3.9).
        :raises FileNotFoundError:
            If no filepath is given and none is stored in the accessor.
        :raises KeyError:
            If a .hdf file should be written without the 'key' keyword.
        :raises TypeError:
            If the file ending is not supported.
        """
        # Resolve the target; after this block filepath is always a Path
        if filepath is None:
            if self.filepath is None:
                raise FileNotFoundError(
                    "TimeSeriesData has neither a filepath stored in tsd "
                    "accessor nor did you provide a filepath were to store the data."
                )
            filepath = self.filepath
        else:
            filepath = Path(filepath)

        # Save based on file suffix
        if filepath.suffix == ".hdf":
            if "key" not in kwargs:
                raise KeyError("Argument 'key' must be specified to save a .hdf file")
            self._obj.to_hdf(filepath, key=kwargs["key"])
        elif filepath.suffix == ".csv":
            self._obj.to_csv(filepath, sep=kwargs.get("sep", ","))
        elif ".parquet" in filepath.name:
            # Everything after ".parquet" (without the leading dot) names
            # the compression, e.g. "data.parquet.gzip" -> "gzip"
            compression_suffix = filepath.name.split(".parquet")[-1]
            # Parquet doesn't support SparseDtype — densify before writing
            df_to_save = self._obj.copy()
            for col in df_to_save.columns:
                if isinstance(df_to_save[col].dtype, pd.SparseDtype):
                    df_to_save[col] = df_to_save[col].sparse.to_dense()
            df_to_save.to_parquet(
                filepath, engine=kwargs.get('engine', 'pyarrow'),
                compression=compression_suffix[1:] if compression_suffix else None,
                index=True
            )
        else:
            raise TypeError("Given file-format is not supported. "
                            "You can only store time series data as .hdf, .csv, .parquet, "
                            "and .parquet.COMPRESSION_NAME with additional compression options")

    def to_datetime_index(self, unit_of_index="s", origin=None, inplace=True):
        """
        Convert the current index to a datetime index using
        ebcpy.preprocessing.convert_index_to_datetime_index()

        :param str unit_of_index: default 's'
            The unit of the given index. Used to convert to
            total_seconds later on.
        :param datetime.datetime origin:
            The reference datetime object for the first index.
            Default (None) is the system time at the moment of the call.
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: df
            Copy of DataFrame with correct index for usage in this
            framework.
        """
        # Resolve the default here: a ``datetime.now()`` default in the
        # signature would be evaluated only once, when the module is imported.
        if origin is None:
            origin = datetime.now()
        return preprocessing.convert_index_to_datetime_index(
            df=self._obj,
            unit_of_index=unit_of_index,
            origin=origin,
            inplace=inplace
        )

    def to_float_index(self, offset=0, inplace=True):
        """
        Convert the current index to a float based index using
        ebcpy.preprocessing.convert_datetime_index_to_float_index()

        If the index is not a pd.DatetimeIndex, nothing is converted.

        :param float offset:
            Offset in seconds
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: pd.DataFrame df:
            DataFrame with correct index.
        """
        if not isinstance(self._obj.index, pd.DatetimeIndex):
            # Nothing to convert; mirror the return convention of inplace
            if inplace:
                return None
            return self._obj

        return preprocessing.convert_datetime_index_to_float_index(
            df=self._obj,
            offset=offset,
            inplace=inplace
        )

    def clean_and_space_equally(self, desired_freq, inplace=False):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.clean_and_space_equally_time_series()
        See the docstring of this function to know what is happening.

        :param str desired_freq:
            Frequency to determine number of elements in processed dataframe.
            Options are for example:
            - s: second-based
            - 5s: Every 5 seconds
            - 6min: Every 6 minutes
            This also works for h, d, m, y, ms etc.
        :param bool inplace:
            Deprecated, has no effect. Always use the return value!
        :return: pd.DataFrame
            Cleaned and equally spaced data-frame
        """
        df = preprocessing.clean_and_space_equally_time_series(
            df=self._obj,
            desired_freq=desired_freq
        )
        if inplace:
            warnings.warn(
                "inplace=True on clean_and_space_equally has no effect when called "
                "via the .tsd accessor. Use the return value instead: "
                "df = df.tsd.clean_and_space_equally(freq, inplace=False)",
                FutureWarning,
                stacklevel=2,
            )
        return df

    def low_pass_filter(self, crit_freq, filter_order, variable):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.low_pass_filter()
        See the docstring of this function to know what is happening.

        :param float crit_freq:
            The critical frequency or frequencies.
        :param int filter_order:
            The order of the filter
        :param str variable:
            The variable name to apply the filter to
        :return: numpy.ndarray
            Filtered data
        """
        return preprocessing.low_pass_filter(
            data=self._obj[variable].to_numpy(),
            filter_order=filter_order,
            crit_freq=crit_freq
        )

    def moving_average(self, window, variable):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.moving_average()
        See the docstring of this function to know what is happening.

        :param int window:
            sample rate of input
        :param str variable:
            The variable name to apply the filter to
        :return: numpy.ndarray
            Moving average result
        """
        return preprocessing.moving_average(
            data=self._obj[variable].to_numpy(),
            window=window,
        )

    def get_variable_names(self, patterns: Union[str, List[str]] = None) -> List[str]:
        """
        Return an alphabetically sorted list of variable names, optionally filtered by patterns.

        By default, returns all column names found in the DataFrame, sorted alphabetically.
        If `patterns` is provided, only names matching one or more of the given
        literal strings or glob-style patterns (where `*` matches any sequence of characters)
        will be returned.

        :param patterns:
            - A single string or list of strings.
            - Each entry may be an exact variable name, or a pattern containing `*` as a wildcard.
            - If None, all variable names are returned.
        :return:
            A list of matching variable names, in alphabetical order.
        :raises KeyError:
            If any literal name or pattern does not match at least one variable in the DataFrame.

        Example:
            # return all wall temperatures at any layer
            df.tsd.get_variable_names("*wall.layer[*].T")
            ["wall.layer[1].T", "wall.layer[2].T", "wall.layer[3].T"]
        """
        # Level 0 holds the variable names for both plain and multi-level columns
        all_names = sorted(self._obj.columns.get_level_values(0).unique())
        if patterns is None:
            return all_names
        return get_names(all_names, patterns)

    def number_lines_totally_na(self):
        """
        Returns the number of rows in the given dataframe
        that are filled with NaN-values.
        """
        return preprocessing.number_lines_totally_na(self._obj)

    @property
    def frequency(self):
        """
        The frequency of the time series data.
        Returns the mean and the standard deviation of
        the index.

        :returns:
            float: Mean value
            float: Standard deviation
        """
        return preprocessing.get_df_index_frequency_mean_and_std(
            df_index=self._obj.index
        )

298 

299 

class TimeSeriesData(pd.DataFrame):
    """
    Most data related to energy and building
    climate related problems is time-variant.

    Class for handling time series data using a pandas dataframe.
    This class works file-based and makes the import of different
    file-types into a pandas DataFrame more user-friendly.
    Furthermore, functions to support multi-indexing are provided to
    efficiently handle variable passed processing and provide easy
    visualization and preprocessing access.

    :param str,os.path.normpath,pd.DataFrame data:
        Filepath ending with either .hdf, .mat, .csv, .parquet,
        or .parquet.COMPRESSION_NAME containing
        time-dependent data to be loaded as a pandas.DataFrame.
        Alternative option is to pass a DataFrame directly.
    :param bool use_multicolumn:
        If True, single-level columns are converted to the two-level
        ("Variables", "Tags") column format. Default is False.
    :keyword str key:
        Name of the table in a .hdf-file if the file
        contains multiple tables.
    :keyword str sep:
        separator for the use of a csv file. If none is provided,
        a comma (",") is used as a default value.
        See pandas.read_csv() docs for further information.
    :keyword int, list header:
        Header columns for .csv files.
        See pandas.read_csv() docs for further information.
        Default is first row (0).
    :keyword int,str index_col:
        Column to be used as index in .csv files.
        See pandas.read_csv() docs for further information.
        Default is first column (0).
    :keyword str sheet_name:
        Name of the sheet you want to load data from. Required keyword
        argument when loading a xlsx-file.
    :keyword str default_tag:
        Which value to use as tag. Default is 'raw'
    :keyword str engine:
        Choose the engine for reading .parquet files. Default is 'pyarrow'
        Other option is 'fastparquet' (python>=3.9).
    :keyword list variable_names:
        List of variable names to load from .mat file. If you
        know which variables you want to plot, this may speed up
        loading significantly, and reduce memory size drastically.
        You can also supply wildcard patterns (e.g. "*wall.layer[*].T", etc.)
        to match multiple variables at once.

    Examples:

    First let's see the usage for a common dataframe.

    >>> import numpy as np
    >>> from ebcpy import TimeSeriesData
    >>> tsd = TimeSeriesData({"my_variable": np.random.rand(5)})
    >>> tsd.to_datetime_index()
    >>> tsd.save("my_new_data.csv")

    Now, let's load the recently created file.

    >>> tsd = TimeSeriesData("my_new_data.csv")
    """

    # Custom attributes registered through pandas' ``_metadata`` hook
    # (see the pandas documentation on subclassing linked in _constructor)
    _metadata = [
        "_filepath",
        "_loader_kwargs",
        "_default_tag",
        "_multi_col_names"
    ]

    def __init__(self, data: Union[str, Any], use_multicolumn: bool = False, **kwargs):
        """Initialize class-objects and check correct input."""
        warnings.warn(
            "TimeSeriesData will be deprecated in the next major release. "
            "Instead, use 'load_time_series_data' to load files etc. as pd.DataFrame "
            "and use the 'tsd' accessor to access useful time-series-related functions "
            "as before with TimeSeriesData.", FutureWarning
        )
        if use_multicolumn:
            warnings.warn(
                "All multicolumn support will be removed in the next major release", FutureWarning
            )
        # Initialize as default
        self._filepath = None
        self._loader_kwargs = {}
        self._multi_col_names = ["Variables", "Tags"]
        self._default_tag = kwargs.pop("default_tag", "raw")
        if not isinstance(self._default_tag, str):
            raise TypeError(f"Invalid type for default_tag! Expected 'str' but "
                            f"received {type(self._default_tag)}")

        # Two possible inputs: the first argument is either data provided by
        # pandas (kwargs hold further information) or an actual filepath.
        if isinstance(data, BlockManager):
            super().__init__(data=data)
            return

        if not isinstance(data, (str, Path)):
            _df_loaded = pd.DataFrame(data=data,
                                      index=kwargs.get("index", None),
                                      columns=kwargs.get("columns", None),
                                      dtype=kwargs.get("dtype", None),
                                      copy=kwargs.get("copy", False))
        else:
            file = Path(data)
            self._loader_kwargs = kwargs.copy()
            _df_loaded = _load_df_from_file(file=file, **self._loader_kwargs)
            _df_loaded.tsd.filepath = file
            self._filepath = file

        if _df_loaded.columns.nlevels == 1:
            # Check if first level is named Tags.
            # If so, don't create MultiIndex-DF as the method is called by the pd constructor
            if _df_loaded.columns.name != self._multi_col_names[1] and use_multicolumn:
                multi_col = pd.MultiIndex.from_product(
                    [_df_loaded.columns, [self._default_tag]],
                    names=self._multi_col_names
                )
                _df_loaded.columns = multi_col

        elif _df_loaded.columns.nlevels == 2:
            if _df_loaded.columns.names != self._multi_col_names and use_multicolumn:
                raise TypeError("Loaded dataframe has a different 2-Level "
                                "header format than it is supported by this "
                                "class. The names have to match.")
        else:
            raise TypeError("Only DataFrames with Multi-Columns with 2 "
                            "Levels are supported by this class.")

        super().__init__(_df_loaded)

    @property
    def _constructor(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeriesData

    @property
    def _constructor_sliced(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeries

    @property
    def filepath(self) -> Union[Path, None]:
        """Get the filepath associated with the time series data"""
        return self._filepath

    @filepath.setter
    def filepath(self, filepath: str):
        """Set the filepath associated with the time series data"""
        # Falsy values (None, "") reset the path, like the tsd accessor does
        self._filepath = Path(filepath) if filepath else None
        self.tsd.filepath = self._filepath

    @property
    def default_tag(self) -> str:
        """Get the default tag of the time series data object"""
        return self._default_tag

    @default_tag.setter
    def default_tag(self, tag: str) -> None:
        """Set the default_tag of the time series data object

        :param tag: new tag
        :type tag: String
        :raises TypeError: If tag is not a string
        :raises KeyError: If tag is not one of the existing tags
        """
        if not isinstance(tag, str):
            raise TypeError(f"Invalid type for default_tag! Expected 'str' but "
                            f"received {type(tag)}")
        available_tags = self.get_tags()
        if tag not in available_tags:
            raise KeyError(f"Tag '{tag}' does not exist for current data set!"
                           f"\n Available tags: {available_tags}")
        self._default_tag = tag

    def save(self, filepath: str = None, **kwargs) -> None:
        """
        Save the current time-series-data into the given file-format.
        Currently supported are .hdf, which is an easy and fast storage,
        and, .csv is supported as an easy-readable option.
        Also, .parquet, and with additional compression .parquet.COMPRESSION_NAME
        are supported. Compressions could be gzip, brotli or snappy. For all possible
        compressions see the documentation of the parquet engines.
        For a small comparison of these data formats see https://github.com/RWTH-EBC/ebcpy/issues/81

        :param str,os.path.normpath filepath:
            Filepath where to store the data. Either .hdf, .csv, .parquet
            or .parquet.COMPRESSION_NAME has to be the file-ending.
            Default is current filepath of class.
        :keyword str key:
            Necessary keyword-argument for saving a .hdf-file.
            Specifies the key of the table in the .hdf-file.
        :keyword str sep:
            Separator used for saving as .csv. Default is ','.
        :keyword str engine:
            Choose the engine for writing .parquet files. Default is 'pyarrow'
            Other option is 'fastparquet' (python>=3.9).
        :return:
        """
        # If new settings are needed, update existing ones
        self._loader_kwargs.update(kwargs)
        if filepath is None:
            # Use the path stored on this object; the accessor of this object
            # does not necessarily know the path the data was loaded from.
            # If it is None as well, the accessor falls back to its own path.
            filepath = self._filepath
        self.tsd.save(filepath, **kwargs)

    def to_df(self, force_single_index=False):
        """
        Return the dataframe version of the current TimeSeriesData object.
        If no variable has more than one tag, the tags are dropped.
        Else, the object is just converted.

        :param bool force_single_index:
            If True (not the default), an IndexError is raised when the
            tags can't be dropped because at least one variable
            contains multiple tags.
        :return: pd.DataFrame
        :raises IndexError:
            If force_single_index is True and variables with
            multiple tags exist.
        """
        multi_tag_variables = self.get_variables_with_multiple_tags()
        if not multi_tag_variables:
            if self._is_old_multicolumn_format:
                return pd.DataFrame(self.droplevel(1, axis=1))
            return pd.DataFrame(self)
        if force_single_index:
            raise IndexError(
                "Can't automatically drop all tags "
                "as the following variables contain multiple tags: "
                f"{', '.join(multi_tag_variables)}. "
            )
        return pd.DataFrame(self)

    def get_variable_names(self, patterns: Union[str, List[str]] = None) -> List[str]:
        """
        Return an alphabetically sorted list of variable names, optionally filtered by patterns.

        By default, returns all variable names found in the first level of the DataFrame's
        columns, sorted alphabetically. If `patterns` is provided, only names
        matching one or more of the given literal strings or glob-style patterns
        (where `*` matches any sequence of characters) will be returned.

        :param patterns:
            - A single string or list of strings.
            - Each entry may be an exact variable name, or a pattern containing `*` as a wildcard.
            - If None, all variable names are returned.
        :return:
            A list of matching variable names, in alphabetical order.
        :raises KeyError:
            If any literal name or pattern does not match at least one variable in the DataFrame.

        Example:
            # return all wall temperatures at any layer
            tsd.get_variable_names("*wall.layer[*].T")
            ["wall.layer[1].T", "wall.layer[2].T", "wall.layer[3].T"]
        """
        return self.tsd.get_variable_names(patterns)

    def get_variables_with_multiple_tags(self) -> List[str]:
        """
        Return an alphabetically sorted list of all variables
        that contain more than one tag.

        :return: List[str]
        """
        # A variable name occurring more than once on level 0 has several tags
        var_names = self.columns.get_level_values(0)
        return sorted(var_names[var_names.duplicated()])

    def get_tags(self, variable: str = None) -> List[str]:
        """
        Return an alphabetically sorted list of all tags

        :param str variable:
            If given, tags of this variable are returned

        :return: List[str]
        :raises KeyError:
            If the object does not use the multicolumn format.
        """
        if not self._is_old_multicolumn_format:
            raise KeyError("You can't get tags for a TimeSeriesData object created with use_multicolumn=False!")
        if variable:
            tags = self.loc[:, variable].columns
            return sorted(tags)
        return sorted(self.columns.get_level_values(1).unique())

    @property
    def _is_old_multicolumn_format(self):
        """
        Helper function to check if the old multicolumn format is used.
        """
        return isinstance(self.columns, pd.MultiIndex)

    def get_columns_by_tag(self,
                           tag: str,
                           variables: list = None,
                           return_type: str = 'pandas',
                           drop_level: bool = False):
        """
        Return all columns with the defined tag.

        :param str tag:
            Define the tag which return columns have to
            match.
        :param list variables:
            Besides the given tag, specify the
            variables names matching the return criteria as well.
        :param boolean drop_level:
            Passed on as ``drop_level`` to the pandas ``xs`` call
            selecting the tag. Default is False.
        :param str return_type:
            Return format. Options are:
            - pandas (pandas object)
            - numpy, scipy, sp, and np (np.array)
            - control (transposed np.array)
        :return: Selected columns in the requested format
        :raises KeyError:
            If the object does not use the multicolumn format.
        :raises TypeError:
            If return_type is unknown.
        """
        if not self._is_old_multicolumn_format:
            raise KeyError("You can't get tags for a TimeSeriesData object created with use_multicolumn=False!")

        # Extract columns
        if variables:
            _ret = self.loc[:, variables]
        else:
            _ret = self

        _ret = _ret.xs(tag, axis=1, level=1, drop_level=drop_level)

        # Return based on the given return_type
        requested = return_type.lower()
        if requested == 'pandas':
            return _ret
        if requested in ['numpy', 'scipy', 'sp', 'np']:
            return _ret.to_numpy()
        if requested == 'control':
            return _ret.to_numpy().transpose()
        raise TypeError("Unknown return type")

    def to_datetime_index(self, unit_of_index="s", origin=None, inplace: bool = True):
        """
        Convert the current index to a datetime index using
        ebcpy.preprocessing.convert_index_to_datetime_index()

        :param str unit_of_index: default 's'
            The unit of the given index. Used to convert to
            total_seconds later on.
        :param datetime.datetime origin:
            The reference datetime object for the first index.
            Default (None) is the system time at the moment of the call.
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: df
            Copy of DataFrame with correct index for usage in this
            framework.

        """
        # Resolve the default here: a ``datetime.now()`` default in the
        # signature would be evaluated only once, when the module is imported.
        if origin is None:
            origin = datetime.now()
        return self.tsd.to_datetime_index(unit_of_index, origin, inplace)

    def to_float_index(self, offset=0, inplace: bool = True):
        """
        Convert the current index to a float based index using
        ebcpy.preprocessing.convert_datetime_index_to_float_index()

        :param float offset:
            Offset in seconds
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: pd.DataFrame df:
            DataFrame with correct index.
        """
        return self.tsd.to_float_index(offset, inplace)

    def clean_and_space_equally(self, desired_freq, inplace: bool = False):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.clean_and_space_equally_time_series()
        See the docstring of this function to know what is happening.

        :param str desired_freq:
            Frequency to determine number of elements in processed dataframe.
            Options are for example:
            - s: second-based
            - 5s: Every 5 seconds
            - 6min: Every 6 minutes
            This also works for h, d, m, y, ms etc.
        :param bool inplace:
            Deprecated. Always use the return value!
        :return: pd.DataFrame
            Cleaned and equally spaced data-frame
        """
        return self.tsd.clean_and_space_equally(desired_freq, inplace)

    def low_pass_filter(self, crit_freq, filter_order, variable,
                        tag=None, new_tag="low_pass_filter"):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.low_pass_filter()
        See the docstring of this function to know what is happening.
        If the old multicolumn format is used, the result is stored in the
        multicolumn header with the `new_tag` and nothing is returned.
        Otherwise, the filtered data is returned.

        :param float crit_freq:
            The critical frequency or frequencies.
        :param int filter_order:
            The order of the filter
        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :param str new_tag:
            The new tag to pass to the variable.
            Default is 'low_pass_filter'
        """
        column = self._possibly_get_variable_and_tag(variable, tag)
        result = self.tsd.low_pass_filter(crit_freq, filter_order, column)
        if self._is_old_multicolumn_format:
            self.loc[:, (variable, new_tag)] = result
            return None
        return result

    def moving_average(self, window, variable,
                       tag=None, new_tag="moving_average"):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.moving_average()
        See the docstring of this function to know what is happening.
        If the old multicolumn format is used, the result is stored in the
        multicolumn header with the `new_tag` and nothing is returned.
        Otherwise, the moving average is returned.

        :param int window:
            sample rate of input
        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :param str new_tag:
            The new tag to pass to the variable.
            Default is 'moving_average'
        """
        column = self._possibly_get_variable_and_tag(variable, tag)
        result = self.tsd.moving_average(window, column)
        if self._is_old_multicolumn_format:
            self.loc[:, (variable, new_tag)] = result
            return None
        return result

    def _possibly_get_variable_and_tag(self, variable: str, tag: str = None):
        """
        Helper function to get the column key based on variable and possible tag name,
        depending on whether multicolumn is used or not.

        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :return:
            ``(variable, tag)`` if a tag is given and the multicolumn
            format is used, else just ``variable``.
        """
        if tag is not None and self._is_old_multicolumn_format:
            return (variable, tag)
        return variable

    def number_lines_totally_na(self):
        """
        Returns the number of rows in the given dataframe
        that are filled with NaN-values.
        """
        return self.tsd.number_lines_totally_na()

    @property
    def frequency(self):
        """
        The frequency of the time series data.
        Returns the mean and the standard deviation of
        the index.

        :returns:
            float: Mean value
            float: Standard deviation
        """
        return self.tsd.frequency

768 

769 

class TimeSeries(pd.Series):
    """
    pd.Series subclass used together with TimeSeriesData so that
    slicing and expansion yield the matching classes.

    .. versionadded:: 0.1.7
    """

    @property
    def _constructor(self):
        """
        Class pandas uses for results of the same dimension, see:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas
        """
        return TimeSeries

    @property
    def _constructor_expanddim(self):
        """
        Class pandas uses when expanding to a frame, see:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas
        """
        return TimeSeriesData

788 

789 

def get_keys_of_hdf_file(filepath):
    """
    List the keys stored in a hdf-file.

    h5py is imported lazily. If it is not installed, a single-item
    list holding an error text is returned instead of raising.

    :param str,os.path.normpath filepath:
        Path to the .hdf-file
    :return: list
        List with all keys in the given file.
    """
    # pylint: disable=import-outside-toplevel
    try:
        import h5py
        with h5py.File(filepath, 'r') as hdf_handle:
            found_keys = list(hdf_handle.keys())
    except ImportError:
        found_keys = ["ERROR: Could not obtain keys as h5py is not installed"]
    return found_keys

806 

807 

def load_time_series_data(data: Union[str, Any], **kwargs) -> pd.DataFrame:
    """
    Load time series data from various sources into a pandas DataFrame with
    custom time series accessor methods available via .tsd property.

    Three kinds of input are handled: a DataFrame is copied, a
    str or Path is loaded from file, and anything else is
    passed to the pd.DataFrame constructor.

    :param str,os.path.normpath,pd.DataFrame data:
        Filepath ending with either .hdf, .mat, .csv, .parquet,
        or .parquet.COMPRESSION_NAME containing
        time-dependent data to be loaded as a pandas.DataFrame.
        Alternative option is to pass a DataFrame directly.
    :keyword str key:
        Name of the table in a .hdf-file if the file
        contains multiple tables.
    :keyword str sep:
        separator for the use of a csv file. If none is provided,
        a comma (",") is used as a default value.
        See pandas.read_csv() docs for further information.
    :keyword int, list header:
        Header columns for .csv files.
        See pandas.read_csv() docs for further information.
        Default is first row (0).
    :keyword int,str index_col:
        Column to be used as index in .csv files.
        See pandas.read_csv() docs for further information.
        Default is first column (0).
    :keyword str sheet_name:
        Name of the sheet you want to load data from. Required keyword
        argument when loading a xlsx-file.
    :keyword str engine:
        Choose the engine for reading .parquet files. Default is 'pyarrow'
        Other option is 'fastparquet' (python>=3.9).
    :keyword list variable_names:
        List of variable names to load from .mat file. If you
        know which variables you want to plot, this may speed up
        loading significantly, and reduce memory size drastically.
        You can also supply wildcard patterns (e.g. "*wall.layer[*].T", etc.)
        to match multiple variables at once.
    :return: pd.DataFrame
        DataFrame with custom .tsd accessor containing time series functionality

    Examples:

    Create a DataFrame with random data:

    >>> import numpy as np
    >>> from ebcpy import load_time_series_data
    >>> df = load_time_series_data({"my_variable": np.random.rand(5)})
    >>> df.tsd.to_datetime_index()
    >>> df.tsd.save("my_new_data.csv")

    Now, let's load the recently created file:

    >>> df = load_time_series_data("my_new_data.csv")
    """
    # A DataFrame is only copied; kwargs are not used for it
    if isinstance(data, pd.DataFrame):
        return data.copy()

    # Paths are loaded from file and remembered in the accessor
    if isinstance(data, (str, Path)):
        source_file = Path(data)
        loaded = _load_df_from_file(file=source_file, **kwargs)
        loaded.tsd.filepath = source_file
        return loaded

    # Everything else goes to the DataFrame constructor
    constructor_options = {
        "index": kwargs.get("index", None),
        "columns": kwargs.get("columns", None),
        "dtype": kwargs.get("dtype", None),
        "copy": kwargs.get("copy", False),
    }
    return pd.DataFrame(data=data, **constructor_options)

877 

878 

879def _load_df_from_file(file, **kwargs): 

880 """ 

881 Function to load a given filepath into a dataframe 

882 

883 :param Path file: File path to load 

884 :param kwargs: Additional loading parameters 

885 :return: pd.DataFrame 

886 """ # Check whether the file exists 

887 if not os.path.isfile(file): 

888 raise FileNotFoundError( 

889 f"The given filepath {file} could not be opened") 

890 

891 # Open based on file suffix. 

892 # Currently, hdf, csv, and Modelica result files (mat) are supported. 

893 if file.suffix == ".hdf": 

894 # Load the current file as a hdf to a dataframe. 

895 # As specifying the key can be a problem, the user will 

896 # get all keys of the file if one is necessary but not provided. 

897 key = kwargs.get("key") 

898 if key == "": 

899 key = None # Avoid cryptic error in pandas by converting empty string to None 

900 try: 

901 df = pd.read_hdf(file, key=key) 

902 except (ValueError, KeyError) as error: 

903 keys = ", ".join(get_keys_of_hdf_file(file)) 

904 raise KeyError(f"key must be provided when HDF5 file contains multiple datasets. " 

905 f"Here are all keys in the given hdf-file: {keys}") from error 

906 elif file.suffix == ".csv": 

907 # Check if file was previously a TimeSeriesData object 

908 with open(file, "r") as _f: 

909 lines = [_f.readline() for _ in range(2)] 

910 # Backwards compatible assumption: Users never changed '_multi_col_names' 

911 if (lines[0].startswith("Variables") and 

912 lines[1].startswith("Tags")): 

913 _hea_def = [0, 1] 

914 else: 

915 _hea_def = 0 

916 

917 df = pd.read_csv( 

918 file, 

919 sep=kwargs.get("sep", ","), 

920 index_col=kwargs.get("index_col", 0), 

921 header=kwargs.get("header", _hea_def) 

922 ) 

923 elif file.suffix == ".mat": 

924 df = sr.mat_to_pandas( 

925 fname=file, 

926 with_unit=False, 

927 names=kwargs.get("variable_names") 

928 ) 

929 elif file.suffix in ['.xlsx', '.xls', '.odf', '.ods', '.odt']: 

930 sheet_name = kwargs.get("sheet_name") 

931 if sheet_name is None: 

932 raise KeyError("sheet_name is a required keyword argument to load xlsx-files." 

933 "Please pass a string to specify the name " 

934 "of the sheet you want to load.") 

935 df = pd.read_excel(io=file, sheet_name=sheet_name) 

936 elif ".parquet" in file.name: 

937 df = pd.read_parquet(path=file, engine=kwargs.get('engine', 'pyarrow')) 

938 else: 

939 raise TypeError("Only .hdf, .csv, .xlsx and .mat are supported!") 

940 if not isinstance(df.index, tuple(datetime_indexes)) and not index_is_numeric(df.index): 

941 try: 

942 df.index = pd.DatetimeIndex(df.index) 

943 except Exception as err: 

944 raise IndexError( 

945 f"Given data has index of type {type(df.index)}. " 

946 f"Currently only numeric indexes and the following are supported:" 

947 f"{' ,'.join([str(idx) for idx in [pd.RangeIndex] + datetime_indexes])} " 

948 f"Automatic conversion to pd.DateTimeIndex failed" 

949 f"see error above." 

950 ) from err 

951 return df