Coverage for ebcpy/data_types.py: 96%

249 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2026-04-20 13:20 +0000

"""
This module provides useful classes for all ebcpy.

Every data_type class should include every parameter
other classes like optimization etc. may need. The checking
of correct input is especially relevant here, as the correct
format of data types will prevent errors during simulations,
optimization etc.
"""

9 

10import os 

11import warnings 

12from pathlib import Path 

13from typing import List, Union, Any, TYPE_CHECKING 

14from datetime import datetime 

15from pandas.core.internals import BlockManager 

16import pandas as pd 

17import numpy as np 

18import ebcpy.modelica.simres as sr 

19 

20from ebcpy.utils import get_names 

21from ebcpy import preprocessing 

22 

23# pylint: disable=I1101 

24# pylint: disable=too-many-ancestors 

25 

# Public API of this module.
__all__ = [
    'TimeSeries',
    'TimeSeriesData',
    'numeric_index_dtypes',
    'load_time_series_data',
    'index_is_numeric',
    'datetime_indexes',
]

32 

# All numeric dtypes a pandas Index may carry; built once by instantiating
# an empty Index per dtype so the comparison below works on dtype objects.
numeric_index_dtypes = [
    pd.Index([], dtype=_dtype).dtype
    for _dtype in (
        "int8", "int16", "int32", "int64",
        "uint8", "uint16", "uint32", "uint64",
        "float32", "float64",
    )
]

# Index classes treated as datetime-based indexes.
datetime_indexes = [
    pd.DatetimeIndex
]


def index_is_numeric(index: pd.Index):
    """Check if pandas Index is numeric"""
    if isinstance(index, pd.RangeIndex):
        return True
    return index.dtype in numeric_index_dtypes

48 

49 

@pd.api.extensions.register_dataframe_accessor("tsd")
class TimeSeriesAccessor:
    """
    Pandas DataFrame accessor for time series functionality.
    Access using df.tsd.*
    """

    def __init__(self, pandas_obj):
        # DataFrame this accessor instance is attached to (pandas caches
        # one accessor instance per DataFrame object).
        self._obj = pandas_obj
        self._filepath = None

    @property
    def filepath(self):
        """Get the filepath associated with the time series data"""
        return self._filepath

    @filepath.setter
    def filepath(self, filepath):
        """Set the filepath associated with the time series data"""
        # Falsy values (None, empty string) clear the stored filepath.
        self._filepath = Path(filepath) if filepath else None

    def save(self, filepath: str = None, **kwargs) -> None:
        """
        Save the current time-series-data into the given file-format.
        Currently supported are .hdf, which is an easy and fast storage,
        and, .csv is supported as an easy-readable option.
        Also, .parquet, and with additional compression .parquet.COMPRESSION_NAME
        are supported.

        :param str,os.path.normpath filepath:
            Filepath where to store the data. Either .hdf, .csv, .parquet
            or .parquet.COMPRESSION_NAME has to be the file-ending.
            Default is current filepath of class.
        :keyword str key:
            Necessary keyword-argument for saving a .hdf-file.
            Specifies the key of the table in the .hdf-file.
        :keyword str sep:
            Separator used for saving as .csv. Default is ','.
        :keyword str engine:
            Chose the engine for reading .parquet files. Default is 'pyarrow'
            Other option is 'fastparquet' (python>=3.9).
        :raises FileNotFoundError: If neither an argument nor a stored
            filepath is available.
        :raises TypeError: If the file suffix is not supported.
        """
        # Set filepath if not given
        if filepath is None:
            if self.filepath is None:
                raise FileNotFoundError(
                    "TimeSeriesData has neither a filepath stored in tsd "
                    "accessor nor did you provide a filepath were to store the data."
                )
            filepath = self.filepath
        else:
            filepath = Path(filepath)

        # Save based on file suffix
        if filepath.suffix == ".hdf":
            if "key" not in kwargs:
                raise KeyError("Argument 'key' must be specified to save a .hdf file")
            self._obj.to_hdf(filepath, key=kwargs.get("key"))
        elif filepath.suffix == ".csv":
            self._obj.to_csv(filepath, sep=kwargs.get("sep", ","))
        elif ".parquet" in filepath.name:
            # Anything after ".parquet" in the name (e.g. ".gzip") selects
            # the parquet compression codec; empty means no compression.
            parquet_split = filepath.name.split(".parquet")
            self._obj.to_parquet(
                filepath, engine=kwargs.get('engine', 'pyarrow'),
                compression=parquet_split[-1][1:] if parquet_split[-1] else None,
                index=True
            )
        else:
            raise TypeError("Given file-format is not supported."
                            "You can only store time series data as .hdf, .csv, .parquet, "
                            "and .parquet.COMPRESSION_NAME with additional compression options")

    def to_datetime_index(self, unit_of_index="s", origin=None, inplace=True):
        """
        Convert the current index to a datetime index using
        ebcpy.preprocessing.convert_index_to_datetime_index()

        :param str unit_of_index: default 's'
            The unit of the given index. Used to convert to
            total_seconds later on.
        :param datetime.datetime origin:
            The reference datetime object for the first index.
            Default is the current system time at the moment of the call.
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: df
            Copy of DataFrame with correct index for usage in this
            framework.
        """
        # Resolve the default at call time. A default argument of
        # datetime.now() would be evaluated only once at import time.
        if origin is None:
            origin = datetime.now()
        return preprocessing.convert_index_to_datetime_index(
            df=self._obj,
            unit_of_index=unit_of_index,
            origin=origin,
            inplace=inplace
        )

    def to_float_index(self, offset=0, inplace=True):
        """
        Convert the current index to a float based index using
        ebcpy.preprocessing.convert_datetime_index_to_float_index()

        :param float offset:
            Offset in seconds
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: pd.DataFrame df:
            DataFrame with correct index.
        """
        # Nothing to convert if the index is not datetime-based;
        # mirror the inplace/return contract of the conversion itself.
        if not isinstance(self._obj.index, pd.DatetimeIndex):
            if inplace:
                return None
            return self._obj

        return preprocessing.convert_datetime_index_to_float_index(
            df=self._obj,
            offset=offset,
            inplace=inplace
        )

    def clean_and_space_equally(self, desired_freq, inplace=False):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.clean_and_space_equally_time_series()
        See the docstring of this function to know what is happening.

        :param str desired_freq:
            Frequency to determine number of elements in processed dataframe.
            Options are for example:
            - s: second-based
            - 5s: Every 5 seconds
            - 6min: Every 6 minutes
            This also works for h, d, m, y, ms etc.
        :param bool inplace:
            Deprecated; always use the return value instead.
        :return: pd.DataFrame
            Cleaned and equally spaced data-frame
        """
        df = preprocessing.clean_and_space_equally_time_series(
            df=self._obj,
            desired_freq=desired_freq
        )
        if inplace:
            warnings.warn(
                "inplace=True on clean_and_space_equally has no effect when called "
                "via the .tsd accessor. Use the return value instead: "
                "df = df.tsd.clean_and_space_equally(freq, inplace=False)",
                FutureWarning,
                stacklevel=2,
            )
        return df

    def low_pass_filter(self, crit_freq, filter_order, variable):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.low_pass_filter()
        See the docstring of this function to know what is happening.

        :param float crit_freq:
            The critical frequency or frequencies.
        :param int filter_order:
            The order of the filter
        :param str variable:
            The variable name to apply the filter to
        :return: numpy.ndarray
            Filtered data
        """
        return preprocessing.low_pass_filter(
            data=self._obj[variable].to_numpy(),
            filter_order=filter_order,
            crit_freq=crit_freq
        )

    def moving_average(self, window, variable):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.moving_average()
        See the docstring of this function to know what is happening.

        :param int window:
            sample rate of input
        :param str variable:
            The variable name to apply the filter to
        :return: numpy.ndarray
            Moving average result
        """
        return preprocessing.moving_average(
            data=self._obj[variable].to_numpy(),
            window=window,
        )

    def get_variable_names(self, patterns: Union[str, List[str]] = None) -> List[str]:
        """
        Return an alphabetically sorted list of variable names, optionally filtered by patterns.

        By default, returns all column names found in the DataFrame, sorted alphabetically.
        If `patterns` is provided, only names matching one or more of the given
        literal strings or glob-style patterns (where `*` matches any sequence of characters)
        will be returned.

        :param patterns:
            - A single string or list of strings.
            - Each entry may be an exact variable name, or a pattern containing `*` as a wildcard.
            - If None, all variable names are returned.
        :return:
            A list of matching variable names, in alphabetical order.
        :raises KeyError:
            If any literal name or pattern does not match at least one variable in the DataFrame.

        Example:
            # return all wall temperatures at any layer
            df.tsd.get_variable_names("*wall.layer[*].T")
            ["wall.layer[1].T", "wall.layer[2].T", "wall.layer[3].T"]
        """
        # Level 0 covers both flat columns and the legacy multi-column format.
        all_names = sorted(self._obj.columns.get_level_values(0).unique())
        if patterns is None:
            return all_names
        return get_names(all_names, patterns)

    def number_lines_totally_na(self):
        """
        Returns the number of rows in the given dataframe
        that are filled with NaN-values.
        """
        return preprocessing.number_lines_totally_na(self._obj)

    @property
    def frequency(self):
        """
        The frequency of the time series data.
        Returns the mean and the standard deviation of
        the index.

        :returns:
            float: Mean value
            float: Standard deviation
        """
        return preprocessing.get_df_index_frequency_mean_and_std(
            df_index=self._obj.index
        )

294 

class TimeSeriesData(pd.DataFrame):
    """
    Most data related to energy and building
    climate related problems is time-variant.

    Class for handling time series data using a pandas dataframe.
    This class works file-based and makes the import of different
    file-types into a pandas DataFrame more user-friendly.
    Furthermore, functions to support multi-indexing are provided to
    efficiently handle variable passed processing and provide easy
    visualization and preprocessing access.

    :param str,os.path.normpath,pd.DataFrame data:
        Filepath ending with either .hdf, .mat, .csv, .parquet,
        or .parquet.COMPRESSION_NAME containing
        time-dependent data to be loaded as a pandas.DataFrame.
        Alternative option is to pass a DataFrame directly.
    :keyword str key:
        Name of the table in a .hdf-file if the file
        contains multiple tables.
    :keyword str sep:
        separator for the use of a csv file. If none is provided,
        a comma (",") is used as a default value.
        See pandas.read_csv() docs for further information.
    :keyword int, list header:
        Header columns for .csv files.
        See pandas.read_csv() docs for further information.
        Default is first row (0).
    :keyword int,str index_col:
        Column to be used as index in .csv files.
        See pandas.read_csv() docs for further information.
        Default is first column (0).
    :keyword str sheet_name:
        Name of the sheet you want to load data from. Required keyword
        argument when loading a xlsx-file.
    :keyword str default_tag:
        Which value to use as tag. Default is 'raw'
    :keyword str engine:
        Chose the engine for reading .parquet files. Default is 'pyarrow'
        Other option is 'fastparquet' (python>=3.9).
    :keyword list variable_names:
        List of variable names to load from .mat file. If you
        know which variables you want to plot, this may speed up
        loading significantly, and reduce memory size drastically.
        You can also supply wildcard patterns (e.g. "*wall.layer[*].T", etc.)
        to match multiple variables at once.

    Examples:

    First let's see the usage for a common dataframe.

    >>> import numpy as np
    >>> from ebcpy import TimeSeriesData
    >>> tsd = TimeSeriesData({"my_variable": np.random.rand(5)})
    >>> tsd.to_datetime_index()
    >>> tsd.save("my_new_data.csv")

    Now, let's load the recently created file.

    >>> tsd = TimeSeriesData("my_new_data.csv")
    """

    # Attributes preserved across pandas operations that return new
    # objects (see pd.DataFrame._metadata).
    _metadata = [
        "_filepath",
        "_loader_kwargs",
        "_default_tag",
        "_multi_col_names"
    ]

    def __init__(self, data: Union[str, Any], use_multicolumn: bool = False, **kwargs):
        """Initialize class-objects and check correct input."""
        warnings.warn(
            "TimeSeriesData will be deprecated in the next major release. "
            "Instead, use 'load_time_series_data' to load files etc. as pd.DataFrame "
            "and use the 'tsd' accessor to access useful time-series-related functions "
            "as before with TimeSeriesData.", FutureWarning
        )
        if use_multicolumn:
            warnings.warn(
                "All multicolumn support will be removed in the next major release", FutureWarning
            )
        # Initialize as default
        self._filepath = None
        self._loader_kwargs = {}
        self._multi_col_names = ["Variables", "Tags"]
        self._default_tag = kwargs.pop("default_tag", "raw")
        if not isinstance(self._default_tag, str):
            raise TypeError(f"Invalid type for default_tag! Expected 'str' but "
                            f"received {type(self._default_tag)}")

        # Two possibles inputs. first argument is actually data provided by pandas
        # and kwargs hold further information or is it an actual filepath.
        # pandas internally re-invokes this constructor with a BlockManager;
        # in that case just forward it and skip all file handling.
        if isinstance(data, BlockManager):
            super().__init__(data=data)
            return

        if not isinstance(data, (str, Path)):
            _df_loaded = pd.DataFrame(data=data,
                                      index=kwargs.get("index", None),
                                      columns=kwargs.get("columns", None),
                                      dtype=kwargs.get("dtype", None),
                                      copy=kwargs.get("copy", False))
        else:
            file = Path(data)
            self._loader_kwargs = kwargs.copy()
            _df_loaded = _load_df_from_file(file=file, **self._loader_kwargs)
            _df_loaded.tsd.filepath = file
            self._filepath = file

        if _df_loaded.columns.nlevels == 1:
            # Check if first level is named Tags.
            # If so, don't create MultiIndex-DF as the method is called by the pd constructor
            if _df_loaded.columns.name != self._multi_col_names[1] and use_multicolumn:
                multi_col = pd.MultiIndex.from_product(
                    [_df_loaded.columns, [self._default_tag]],
                    names=self._multi_col_names
                )
                _df_loaded.columns = multi_col

        elif _df_loaded.columns.nlevels == 2:
            if _df_loaded.columns.names != self._multi_col_names and use_multicolumn:
                raise TypeError("Loaded dataframe has a different 2-Level "
                                "header format than it is supported by this "
                                "class. The names have to match.")
        else:
            raise TypeError("Only DataFrames with Multi-Columns with 2 "
                            "Levels are supported by this class.")

        super().__init__(_df_loaded)

    @property
    def _constructor(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeriesData

    @property
    def _constructor_sliced(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeries

    @property
    def filepath(self) -> Path:
        """Get the filepath associated with the time series data"""
        return self._filepath

    @filepath.setter
    def filepath(self, filepath: str):
        """Set the filepath associated with the time series data"""
        self._filepath = Path(filepath)
        # Keep the accessor's copy in sync with the class attribute.
        self.tsd.filepath = self._filepath

    @property
    def default_tag(self) -> str:
        """Get the default of time series data object"""
        return self._default_tag

    @default_tag.setter
    def default_tag(self, tag: str) -> None:
        """Set the default_tag of the time series data object
        :param tag: new tag
        :type tag: String
        """
        if not isinstance(tag, str):
            raise TypeError(f"Invalid type for default_tag! Expected 'str' but "
                            f"received {type(tag)}")
        if tag not in self.get_tags():
            raise KeyError(f"Tag '{tag}' does not exist for current data set!"
                           f"\n Available tags: {self.get_tags()}")
        self._default_tag = tag

    def save(self, filepath: str = None, **kwargs) -> None:
        """
        Save the current time-series-data into the given file-format.
        Currently supported are .hdf, which is an easy and fast storage,
        and, .csv is supported as an easy-readable option.
        Also, .parquet, and with additional compression .parquet.COMPRESSION_NAME
        are supported. Compressions could be gzip, brotli or snappy. For all possible
        compressions see the documentation of the parquet engines.
        For a small comparison of these data formats see https://github.com/RWTH-EBC/ebcpy/issues/81

        :param str,os.path.normpath filepath:
            Filepath where to store the data. Either .hdf, .csv, .parquet
            or .parquet.COMPRESSION_NAME has to be the file-ending.
            Default is current filepath of class.
        :keyword str key:
            Necessary keyword-argument for saving a .hdf-file.
            Specifies the key of the table in the .hdf-file.
        :keyword str sep:
            Separator used for saving as .csv. Default is ','.
        :keyword str engine:
            Chose the engine for reading .parquet files. Default is 'pyarrow'
            Other option is 'fastparquet' (python>=3.9).
        :return:
        """
        # If new settings are needed, update existing ones
        self._loader_kwargs.update(kwargs)
        self.tsd.save(filepath, **kwargs)

    def to_df(self, force_single_index=False):
        """
        Return the dataframe version of the current TimeSeriesData object.
        If all tags are equal, the tags are dropped.
        Else, the object is just converted.

        :param bool force_single_index:
            If True (not the default), a standard DataFrame with a single
            index column (only variable names) is enforced; this raises
            if any variable carries multiple tags.
        :raises IndexError:
            If force_single_index is True but some variable has
            multiple tags, so tags cannot be dropped safely.
        """
        if len(self.get_variables_with_multiple_tags()) == 0:
            if self._is_old_multicolumn_format:
                # Each variable has exactly one tag: drop the tag level.
                return pd.DataFrame(self.droplevel(1, axis=1))
            return pd.DataFrame(self)
        if force_single_index:
            raise IndexError(
                "Can't automatically drop all tags "
                "as the following variables contain multiple tags: "
                f"{' ,'.join(self.get_variables_with_multiple_tags())}. "
            )
        return pd.DataFrame(self)

    def get_variable_names(self, patterns: Union[str, List[str]] = None) -> List[str]:
        """
        Return an alphabetically sorted list of variable names, optionally filtered by patterns.

        By default, returns all variable names found in the first level of the DataFrame's
        column MultiIndex, sorted alphabetically. If `patterns` is provided, only names
        matching one or more of the given literal strings or glob-style patterns
        (where `*` matches any sequence of characters) will be returned.

        :param patterns:
            - A single string or list of strings.
            - Each entry may be an exact variable name, or a pattern containing `*` as a wildcard.
            - If None, all variable names are returned.
        :return:
            A list of matching variable names, in alphabetical order.
        :raises KeyError:
            If any literal name or pattern does not match at least one variable in the DataFrame.

        Example:
            # return all wall temperatures at any layer
            tsd.get_variable_names("*wall.layer[*].T")
            ["wall.layer[1].T", "wall.layer[2].T", "wall.layer[3].T"]
        """
        return self.tsd.get_variable_names(patterns)

    def get_variables_with_multiple_tags(self) -> List[str]:
        """
        Return an alphabetically sorted list of all variables
        that contain more than one tag.

        :return: List[str]
        """
        var_names = self.columns.get_level_values(0)
        return sorted(var_names[var_names.duplicated()])

    def get_tags(self, variable: str = None) -> List[str]:
        """
        Return an alphabetically sorted list of all tags

        :param str variable:
            If given, tags of this variable are returned

        :return: List[str]
        :raises KeyError:
            If the object was not created with use_multicolumn=True.
        """
        if not self._is_old_multicolumn_format:
            raise KeyError("You can't get tags for a TimeSeriesData object created with use_multicolumn=False!")
        if variable:
            tags = self.loc[:, variable].columns
            return sorted(tags)
        return sorted(self.columns.get_level_values(1).unique())

    @property
    def _is_old_multicolumn_format(self):
        """
        Helper function to check if the old multicolumn format is used.
        """
        return isinstance(self.columns, pd.MultiIndex)

    def get_columns_by_tag(self,
                           tag: str,
                           variables: list = None,
                           return_type: str = 'pandas',
                           drop_level: bool = False):
        """
        Returning all columns with defined tag in the form of ndarray.

        :param str tag:
            Define the tag which return columns have to
            match.
        :param list variables:
            Besides the given tag, specify the
            variables names matching the return criteria as well.
        :param boolean drop_level:
            If tag should be included in the response.
            Default is False.
        :param str return_type:
            Return format. Options are:
            - pandas (pd.series)
            - numpy, scipy, sp, and np (np.array)
            - control (transposed np.array)
        :return: ndarray of input signals
        :raises KeyError:
            If the object was not created with use_multicolumn=True.
        :raises TypeError:
            If return_type is unknown.
        """
        if not self._is_old_multicolumn_format:
            raise KeyError("You can't get tags for a TimeSeriesData object created with use_multicolumn=False!")

        # Extract columns
        if variables:
            _ret = self.loc[:, variables]
        else:
            _ret = self

        _ret = _ret.xs(tag, axis=1, level=1, drop_level=drop_level)

        # Return based on the given return_type
        if return_type.lower() == 'pandas':
            return _ret
        if return_type.lower() in ['numpy', 'scipy', 'sp', 'np']:
            return _ret.to_numpy()
        if return_type.lower() == 'control':
            return _ret.to_numpy().transpose()
        raise TypeError("Unknown return type")

    def to_datetime_index(self, unit_of_index="s", origin=None, inplace: bool = True):
        """
        Convert the current index to a datetime index using
        ebcpy.preprocessing.convert_index_to_datetime_index()

        :param str unit_of_index: default 's'
            The unit of the given index. Used to convert to
            total_seconds later on.
        :param datetime.datetime origin:
            The reference datetime object for the first index.
            Default is the current system time at the moment of the call.
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: df
            Copy of DataFrame with correct index for usage in this
            framework.

        """
        # Resolve the default at call time. A default argument of
        # datetime.now() would be evaluated only once at import time.
        if origin is None:
            origin = datetime.now()
        return self.tsd.to_datetime_index(unit_of_index, origin, inplace)

    def to_float_index(self, offset=0, inplace: bool = True):
        """
        Convert the current index to a float based index using
        ebcpy.preprocessing.convert_datetime_index_to_float_index()

        :param float offset:
            Offset in seconds
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: pd.DataFrame df:
            DataFrame with correct index.
        """
        return self.tsd.to_float_index(offset, inplace)

    def clean_and_space_equally(self, desired_freq, inplace: bool = False):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.clean_and_space_equally_time_series()
        See the docstring of this function to know what is happening.

        :param str desired_freq:
            Frequency to determine number of elements in processed dataframe.
            Options are for example:
            - s: second-based
            - 5s: Every 5 seconds
            - 6min: Every 6 minutes
            This also works for h, d, m, y, ms etc.
        :param bool inplace:
            Deprecated; always use the return value instead.
        :return: pd.DataFrame
            Cleaned and equally spaced data-frame
        """
        return self.tsd.clean_and_space_equally(desired_freq, inplace)

    def low_pass_filter(self, crit_freq, filter_order, variable,
                        tag=None, new_tag="low_pass_filter"):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.low_pass_filter()
        See the docstring of this function to know what is happening.
        If the old multicolumn format is used, the result is stored in the
        multicolumn header with the `new_tag`; otherwise the filtered
        array is returned.

        :param float crit_freq:
            The critical frequency or frequencies.
        :param int filter_order:
            The order of the filter
        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :param str new_tag:
            The new tag to pass to the variable.
            Default is 'low_pass_filter'
        """
        result = self.tsd.low_pass_filter(crit_freq, filter_order, self._possibly_get_variable_and_tag(variable, tag))
        if self._is_old_multicolumn_format:
            self.loc[:, (variable, new_tag)] = result
        else:
            return result

    def moving_average(self, window, variable,
                       tag=None, new_tag="moving_average"):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.moving_average()
        See the docstring of this function to know what is happening.
        If the old multicolumn format is used, the result is stored in the
        multicolumn header with the `new_tag`; otherwise the averaged
        array is returned.

        :param int window:
            sample rate of input
        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :param str new_tag:
            The new tag to pass to the variable.
            Default is 'moving_average'
        """
        result = self.tsd.moving_average(window, self._possibly_get_variable_and_tag(variable, tag))
        if self._is_old_multicolumn_format:
            self.loc[:, (variable, new_tag)] = result
        else:
            return result

    def _possibly_get_variable_and_tag(self, variable: str, tag: str = None):
        """
        Helper function to build the column key for a variable and an
        optional tag, depending on whether multicolumn is used or not.

        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one

        """
        if tag is None:
            return variable
        if self._is_old_multicolumn_format:
            return (variable, tag)
        # Without the multicolumn format a tag cannot address a column.
        return variable

    def number_lines_totally_na(self):
        """
        Returns the number of rows in the given dataframe
        that are filled with NaN-values.
        """
        return self.tsd.number_lines_totally_na()

    @property
    def frequency(self):
        """
        The frequency of the time series data.
        Returns the mean and the standard deviation of
        the index.

        :returns:
            float: Mean value
            float: Standard deviation
        """
        return self.tsd.frequency

764 

class TimeSeries(pd.Series):
    """Overwrites pd.Series to enable correct slicing
    and expansion in the TimeSeriesData class

    .. versionadded:: 0.1.7
    """

    @property
    def _constructor(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeries

    @property
    def _constructor_expanddim(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeriesData

783 

784 

def get_keys_of_hdf_file(filepath):
    """
    Find all keys in a given hdf-file.

    :param str,os.path.normpath filepath:
        Path to the .hdf-file
    :return: list
        List with all keys in the given file.
    """
    # pylint: disable=import-outside-toplevel
    # h5py is an optional dependency; degrade gracefully when missing.
    try:
        import h5py
    except ImportError:
        return ["ERROR: Could not obtain keys as h5py is not installed"]
    with h5py.File(filepath, 'r') as hdf_file:
        return list(hdf_file.keys())

801 

802 

def load_time_series_data(data: Union[str, Any], **kwargs) -> pd.DataFrame:
    """
    Load time series data from various sources into a pandas DataFrame with
    custom time series accessor methods available via .tsd property.

    :param str,os.path.normpath,pd.DataFrame data:
        Filepath ending with either .hdf, .mat, .csv, .parquet,
        or .parquet.COMPRESSION_NAME containing
        time-dependent data to be loaded as a pandas.DataFrame.
        Alternative option is to pass a DataFrame directly.
    :keyword str key:
        Name of the table in a .hdf-file if the file
        contains multiple tables.
    :keyword str sep:
        separator for the use of a csv file. If none is provided,
        a comma (",") is used as a default value.
        See pandas.read_csv() docs for further information.
    :keyword int, list header:
        Header columns for .csv files.
        See pandas.read_csv() docs for further information.
        Default is first row (0).
    :keyword int,str index_col:
        Column to be used as index in .csv files.
        See pandas.read_csv() docs for further information.
        Default is first column (0).
    :keyword str sheet_name:
        Name of the sheet you want to load data from. Required keyword
        argument when loading a xlsx-file.
    :keyword str engine:
        Chose the engine for reading .parquet files. Default is 'pyarrow'
        Other option is 'fastparquet' (python>=3.9).
    :keyword list variable_names:
        List of variable names to load from .mat file. If you
        know which variables you want to plot, this may speed up
        loading significantly, and reduce memory size drastically.
        You can also supply wildcard patterns (e.g. "*wall.layer[*].T", etc.)
        to match multiple variables at once.
    :return: pd.DataFrame
        DataFrame with custom .tsd accessor containing time series functionality

    Examples:

    Create a DataFrame with random data:

    >>> import numpy as np
    >>> from ebcpy import load_time_series_data
    >>> df = load_time_series_data({"my_variable": np.random.rand(5)})
    >>> df.tsd.to_datetime_index()
    >>> df.tsd.save("my_new_data.csv")

    Now, let's load the recently created file:

    >>> df = load_time_series_data("my_new_data.csv")
    """
    # An existing DataFrame is copied so the caller's object stays untouched.
    if isinstance(data, pd.DataFrame):
        return data.copy()

    # A path-like argument means "load from file" and the resulting frame
    # remembers where it came from via the .tsd accessor.
    if isinstance(data, (str, Path)):
        source = Path(data)
        frame = _load_df_from_file(file=source, **kwargs)
        frame.tsd.filepath = source
        return frame

    # Anything else (dict, ndarray, ...) goes through the plain
    # DataFrame constructor with the usual construction keywords.
    return pd.DataFrame(
        data=data,
        index=kwargs.get("index", None),
        columns=kwargs.get("columns", None),
        dtype=kwargs.get("dtype", None),
        copy=kwargs.get("copy", False),
    )

872 

873 

874def _load_df_from_file(file, **kwargs): 

875 """ 

876 Function to load a given filepath into a dataframe 

877 

878 :param Path file: File path to load 

879 :param kwargs: Additional loading parameters 

880 :return: pd.DataFrame 

881 """ # Check whether the file exists 

882 if not os.path.isfile(file): 

883 raise FileNotFoundError( 

884 f"The given filepath {file} could not be opened") 

885 

886 # Open based on file suffix. 

887 # Currently, hdf, csv, and Modelica result files (mat) are supported. 

888 if file.suffix == ".hdf": 

889 # Load the current file as a hdf to a dataframe. 

890 # As specifying the key can be a problem, the user will 

891 # get all keys of the file if one is necessary but not provided. 

892 key = kwargs.get("key") 

893 if key == "": 

894 key = None # Avoid cryptic error in pandas by converting empty string to None 

895 try: 

896 df = pd.read_hdf(file, key=key) 

897 except (ValueError, KeyError) as error: 

898 keys = ", ".join(get_keys_of_hdf_file(file)) 

899 raise KeyError(f"key must be provided when HDF5 file contains multiple datasets. " 

900 f"Here are all keys in the given hdf-file: {keys}") from error 

901 elif file.suffix == ".csv": 

902 # Check if file was previously a TimeSeriesData object 

903 with open(file, "r") as _f: 

904 lines = [_f.readline() for _ in range(2)] 

905 # Backwards compatible assumption: Users never changed '_multi_col_names' 

906 if (lines[0].startswith("Variables") and 

907 lines[1].startswith("Tags")): 

908 _hea_def = [0, 1] 

909 else: 

910 _hea_def = 0 

911 

912 df = pd.read_csv( 

913 file, 

914 sep=kwargs.get("sep", ","), 

915 index_col=kwargs.get("index_col", 0), 

916 header=kwargs.get("header", _hea_def) 

917 ) 

918 elif file.suffix == ".mat": 

919 df = sr.mat_to_pandas( 

920 fname=file, 

921 with_unit=False, 

922 names=kwargs.get("variable_names") 

923 ) 

924 elif file.suffix in ['.xlsx', '.xls', '.odf', '.ods', '.odt']: 

925 sheet_name = kwargs.get("sheet_name") 

926 if sheet_name is None: 

927 raise KeyError("sheet_name is a required keyword argument to load xlsx-files." 

928 "Please pass a string to specify the name " 

929 "of the sheet you want to load.") 

930 df = pd.read_excel(io=file, sheet_name=sheet_name) 

931 elif ".parquet" in file.name: 

932 df = pd.read_parquet(path=file, engine=kwargs.get('engine', 'pyarrow')) 

933 else: 

934 raise TypeError("Only .hdf, .csv, .xlsx and .mat are supported!") 

935 if not isinstance(df.index, tuple(datetime_indexes)) and not index_is_numeric(df.index): 

936 try: 

937 df.index = pd.DatetimeIndex(df.index) 

938 except Exception as err: 

939 raise IndexError( 

940 f"Given data has index of type {type(df.index)}. " 

941 f"Currently only numeric indexes and the following are supported:" 

942 f"{' ,'.join([str(idx) for idx in [pd.RangeIndex] + datetime_indexes])} " 

943 f"Automatic conversion to pd.DateTimeIndex failed" 

944 f"see error above." 

945 ) from err 

946 return df