Coverage for ebcpy/data_types.py: 96%

250 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2025-08-26 09:12 +0000

1""" 

2This module provides useful classes for all ebcpy. 

3Every data_type class should include every parameter 

4other classes like optimization etc. may need. The checking 

5of correct input is especially relevant here as the correct 

6format of data-types will prevent errors during simulations, 

7optimization etc. 

8""" 

9 

10import os 

11import warnings 

12from pathlib import Path 

13from typing import List, Union, Any, TYPE_CHECKING 

14from datetime import datetime 

15from pandas.core.internals import BlockManager 

16import pandas as pd 

17import numpy as np 

18import ebcpy.modelica.simres as sr 

19 

20from ebcpy.utils import get_names 

21from ebcpy import preprocessing 

22 

23# pylint: disable=I1101 

24# pylint: disable=too-many-ancestors 

25 

26__all__ = ['TimeSeries', 

27 'TimeSeriesData', 

28 'numeric_index_dtypes', 

29 'load_time_series_data', 

30 'index_is_numeric', 

31 'datetime_indexes'] 

32 

# Names of all integer and float dtypes whose pandas Index counterparts
# this module treats as numeric.
_NUMERIC_DTYPE_NAMES = (
    "int8", "int16", "int32", "int64",
    "uint8", "uint16", "uint32", "uint64",
    "float32", "float64",
)

# Concrete pandas dtype objects built once from the names above.
numeric_index_dtypes = [
    pd.Index([], dtype=name).dtype for name in _NUMERIC_DTYPE_NAMES
]

# Index classes considered datetime-based.
datetime_indexes = [
    pd.DatetimeIndex
]


def index_is_numeric(index: pd.Index):
    """Check if pandas Index is numeric"""
    # A RangeIndex is numeric by construction; anything else is compared
    # against the known numeric dtypes.
    if isinstance(index, pd.RangeIndex):
        return True
    return index.dtype in numeric_index_dtypes

48 

49 

50@pd.api.extensions.register_dataframe_accessor("tsd") 

51class TimeSeriesAccessor: 

52 """ 

53 Pandas DataFrame accessor for time series functionality. 

54 Access using df.tsd.* 

55 """ 

56 

57 def __init__(self, pandas_obj): 

58 self._obj = pandas_obj 

59 self._filepath = None 

60 

61 @property 

62 def filepath(self): 

63 """Get the filepath associated with the time series data""" 

64 return self._filepath 

65 

66 @filepath.setter 

67 def filepath(self, filepath): 

68 """Set the filepath associated with the time series data""" 

69 self._filepath = Path(filepath) if filepath else None 

70 

71 def save(self, filepath: str = None, **kwargs) -> None: 

72 """ 

73 Save the current time-series-data into the given file-format. 

74 Currently supported are .hdf, which is an easy and fast storage, 

75 and, .csv is supported as an easy-readable option. 

76 Also, .parquet, and with additional compression .parquet.COMPRESSION_NAME 

77 are supported. 

78 

79 :param str,os.path.normpath filepath: 

80 Filepath were to store the data. Either .hdf, .csv, .parquet 

81 or .parquet.COMPRESSION_NAME has to be the file-ending. 

82 Default is current filepath of class. 

83 :keyword str key: 

84 Necessary keyword-argument for saving a .hdf-file. 

85 Specifies the key of the table in the .hdf-file. 

86 :keyword str sep: 

87 Separator used for saving as .csv. Default is ','. 

88 :keyword str engine: 

89 Chose the engine for reading .parquet files. Default is 'pyarrow' 

90 Other option is 'fastparquet' (python>=3.9). 

91 """ 

92 # Set filepath if not given 

93 if filepath is None: 

94 if self.filepath is None: 

95 raise FileNotFoundError( 

96 "TimeSeriesData has neither a filepath stored in tsd " 

97 "accessor nor did you provide a filepath were to store the data." 

98 ) 

99 filepath = self.filepath 

100 else: 

101 filepath = Path(filepath) 

102 

103 # Check if filepath is still None 

104 if filepath is None: 

105 raise ValueError("No filepath specified and no default filepath is set.") 

106 

107 # Save based on file suffix 

108 if filepath.suffix == ".hdf": 

109 if "key" not in kwargs: 

110 raise KeyError("Argument 'key' must be specified to save a .hdf file") 

111 self._obj.to_hdf(filepath, key=kwargs.get("key")) 

112 elif filepath.suffix == ".csv": 

113 self._obj.to_csv(filepath, sep=kwargs.get("sep", ",")) 

114 elif ".parquet" in filepath.name: 

115 parquet_split = filepath.name.split(".parquet") 

116 self._obj.to_parquet( 

117 filepath, engine=kwargs.get('engine', 'pyarrow'), 

118 compression=parquet_split[-1][1:] if parquet_split[-1] else None, 

119 index=True 

120 ) 

121 else: 

122 raise TypeError("Given file-format is not supported." 

123 "You can only store time series data as .hdf, .csv, .parquet, " 

124 "and .parquet.COMPRESSION_NAME with additional compression options") 

125 

126 def to_datetime_index(self, unit_of_index="s", origin=datetime.now(), inplace=True): 

127 """ 

128 Convert the current index to a datetime index using 

129 ebcpy.preprocessing.convert_index_to_datetime_index() 

130 

131 :param str unit_of_index: default 's' 

132 The unit of the given index. Used to convert to 

133 total_seconds later on. 

134 :param datetime.datetime origin: 

135 The reference datetime object for the first index. 

136 Default is the current system time. 

137 :param bool inplace: 

138 If True, performs operation inplace and returns None. 

139 :return: df 

140 Copy of DataFrame with correct index for usage in this 

141 framework. 

142 """ 

143 return preprocessing.convert_index_to_datetime_index( 

144 df=self._obj, 

145 unit_of_index=unit_of_index, 

146 origin=origin, 

147 inplace=inplace 

148 ) 

149 

150 def to_float_index(self, offset=0, inplace=True): 

151 """ 

152 Convert the current index to a float based index using 

153 ebcpy.preprocessing.convert_datetime_index_to_float_index() 

154 

155 :param float offset: 

156 Offset in seconds 

157 :param bool inplace: 

158 If True, performs operation inplace and returns None. 

159 :return: pd.DataFrame df: 

160 DataFrame with correct index. 

161 """ 

162 if not isinstance(self._obj.index, pd.DatetimeIndex): 

163 if inplace: 

164 return None 

165 return self._obj 

166 

167 return preprocessing.convert_datetime_index_to_float_index( 

168 df=self._obj, 

169 offset=offset, 

170 inplace=inplace 

171 ) 

172 

173 def clean_and_space_equally(self, desired_freq, inplace=True): 

174 """ 

175 Call to the preprocessing function 

176 ebcpy.preprocessing.clean_and_space_equally_time_series() 

177 See the docstring of this function to know what is happening. 

178 

179 :param str desired_freq: 

180 Frequency to determine number of elements in processed dataframe. 

181 Options are for example: 

182 - s: second-based 

183 - 5s: Every 5 seconds 

184 - 6min: Every 6 minutes 

185 This also works for h, d, m, y, ms etc. 

186 :param bool inplace: 

187 If True, performs operation inplace and returns None. 

188 :return: pd.DataFrame 

189 Cleaned and equally spaced data-frame 

190 """ 

191 df = preprocessing.clean_and_space_equally_time_series( 

192 df=self._obj, 

193 desired_freq=desired_freq 

194 ) 

195 if inplace: 

196 self._obj = df 

197 return None 

198 return df 

199 

200 def low_pass_filter(self, crit_freq, filter_order, variable): 

201 """ 

202 Call to the preprocessing function 

203 ebcpy.preprocessing.low_pass_filter() 

204 See the docstring of this function to know what is happening. 

205 

206 :param float crit_freq: 

207 The critical frequency or frequencies. 

208 :param int filter_order: 

209 The order of the filter 

210 :param str variable: 

211 The variable name to apply the filter to 

212 :return: numpy.ndarray 

213 Filtered data 

214 """ 

215 return preprocessing.low_pass_filter( 

216 data=self._obj[variable].to_numpy(), 

217 filter_order=filter_order, 

218 crit_freq=crit_freq 

219 ) 

220 

221 def moving_average(self, window, variable): 

222 """ 

223 Call to the preprocessing function 

224 ebcpy.preprocessing.moving_average() 

225 See the docstring of this function to know what is happening. 

226 

227 :param int window: 

228 sample rate of input 

229 :param str variable: 

230 The variable name to apply the filter to 

231 :return: numpy.ndarray 

232 Moving average result 

233 """ 

234 return preprocessing.moving_average( 

235 data=self._obj[variable].to_numpy(), 

236 window=window, 

237 ) 

238 

239 def get_variable_names(self, patterns: Union[str, List[str]] = None) -> List[str]: 

240 """ 

241 Return an alphabetically sorted list of variable names, optionally filtered by patterns. 

242 

243 By default, returns all column names found in the DataFrame, sorted alphabetically. 

244 If `patterns` is provided, only names matching one or more of the given 

245 literal strings or glob-style patterns (where `*` matches any sequence of characters) 

246 will be returned. 

247 

248 :param patterns: 

249 - A single string or list of strings. 

250 - Each entry may be an exact variable name, or a pattern containing `*` as a wildcard. 

251 - If None, all variable names are returned. 

252 :return: 

253 A list of matching variable names, in alphabetical order. 

254 :raises KeyError: 

255 If any literal name or pattern does not match at least one variable in the DataFrame. 

256 

257 Example: 

258 # return all wall temperatures at any layer 

259 df.tsd.get_variable_names("*wall.layer[*].T") 

260 ["wall.layer[1].T", "wall.layer[2].T", "wall.layer[3].T"] 

261 """ 

262 all_names = sorted(self._obj.columns.get_level_values(0).unique()) 

263 if patterns is None: 

264 return all_names 

265 return get_names(all_names, patterns) 

266 

267 def number_lines_totally_na(self): 

268 """ 

269 Returns the number of rows in the given dataframe 

270 that are filled with NaN-values. 

271 """ 

272 return preprocessing.number_lines_totally_na(self._obj) 

273 

274 @property 

275 def frequency(self): 

276 """ 

277 The frequency of the time series data. 

278 Returns's the mean and the standard deviation of 

279 the index. 

280 

281 :returns: 

282 float: Mean value 

283 float: Standard deviation 

284 """ 

285 return preprocessing.get_df_index_frequency_mean_and_std( 

286 df_index=self._obj.index 

287 ) 

288 

289 

class TimeSeriesData(pd.DataFrame):
    """
    Most data related to energy and building
    climate related problems is time-variant.

    Class for handling time series data using a pandas dataframe.
    This class works file-based and makes the import of different
    file-types into a pandas DataFrame more user-friendly.
    Furthermore, functions to support multi-indexing are provided to
    efficiently handle variable passed processing and provide easy
    visualization and preprocessing access.

    :param str,os.path.normpath,pd.DataFrame data:
        Filepath ending with either .hdf, .mat, .csv, .parquet,
        or .parquet.COMPRESSION_NAME containing
        time-dependent data to be loaded as a pandas.DataFrame.
        Alternative option is to pass a DataFrame directly.
    :keyword str key:
        Name of the table in a .hdf-file if the file
        contains multiple tables.
    :keyword str sep:
        separator for the use of a csv file. If none is provided,
        a comma (",") is used as a default value.
        See pandas.read_csv() docs for further information.
    :keyword int, list header:
        Header columns for .csv files.
        See pandas.read_csv() docs for further information.
        Default is first row (0).
    :keyword int,str index_col:
        Column to be used as index in .csv files.
        See pandas.read_csv() docs for further information.
        Default is first column (0).
    :keyword str sheet_name:
        Name of the sheet you want to load data from. Required keyword
        argument when loading a xlsx-file.
    :keyword str default_tag:
        Which value to use as tag. Default is 'raw'
    :keyword str engine:
        Chose the engine for reading .parquet files. Default is 'pyarrow'
        Other option is 'fastparquet' (python>=3.9).
    :keyword list variable_names:
        List of variable names to load from .mat file. If you
        know which variables you want to plot, this may speed up
        loading significantly, and reduce memory size drastically.
        You can also supply wildcard patterns (e.g. "*wall.layer[*].T", etc.)
        to match multiple variables at once.

    Examples:

    First let's see the usage for a common dataframe.

    >>> import numpy as np
    >>> from ebcpy import TimeSeriesData
    >>> tsd = TimeSeriesData({"my_variable": np.random.rand(5)})
    >>> tsd.to_datetime_index()
    >>> tsd.save("my_new_data.csv")

    Now, let's load the recently created file.

    >>> tsd = TimeSeriesData("my_new_data.csv")
    """

    # normal properties
    _metadata = [
        "_filepath",
        "_loader_kwargs",
        "_default_tag",
        "_multi_col_names"
    ]

    def __init__(self, data: Union[str, Any], use_multicolumn: bool = False, **kwargs):
        """Initialize class-objects and check correct input."""
        warnings.warn(
            "TimeSeriesData will be deprecated in the next major release. "
            "Instead, use 'load_time_series_data' to load files etc. as pd.DataFrame "
            "and use the 'tsd' accessor to access useful time-series-related functions "
            "as before with TimeSeriesData.", FutureWarning
        )
        if use_multicolumn:
            warnings.warn(
                "All multicolumn support will be removed in the next major release", FutureWarning
            )
        # Initialize as default
        self._filepath = None
        self._loader_kwargs = {}
        self._multi_col_names = ["Variables", "Tags"]
        self._default_tag = kwargs.pop("default_tag", "raw")
        if not isinstance(self._default_tag, str):
            raise TypeError(f"Invalid type for default_tag! Expected 'str' but "
                            f"received {type(self._default_tag)}")

        # Two possibles inputs. first argument is actually data provided by pandas
        # and kwargs hold further information or is it an actual filepath.
        if isinstance(data, BlockManager):
            super().__init__(data=data)
            return

        if not isinstance(data, (str, Path)):
            _df_loaded = pd.DataFrame(data=data,
                                      index=kwargs.get("index", None),
                                      columns=kwargs.get("columns", None),
                                      dtype=kwargs.get("dtype", None),
                                      copy=kwargs.get("copy", False))
        else:
            file = Path(data)
            self._loader_kwargs = kwargs.copy()
            _df_loaded = _load_df_from_file(file=file, **self._loader_kwargs)
            _df_loaded.tsd.filepath = file
            self._filepath = file

        if _df_loaded.columns.nlevels == 1:
            # Check if first level is named Tags.
            # If so, don't create MultiIndex-DF as the method is called by the pd constructor
            if _df_loaded.columns.name != self._multi_col_names[1] and use_multicolumn:
                multi_col = pd.MultiIndex.from_product(
                    [_df_loaded.columns, [self._default_tag]],
                    names=self._multi_col_names
                )
                _df_loaded.columns = multi_col

        elif _df_loaded.columns.nlevels == 2:
            if _df_loaded.columns.names != self._multi_col_names and use_multicolumn:
                raise TypeError("Loaded dataframe has a different 2-Level "
                                "header format than it is supported by this "
                                "class. The names have to match.")
        else:
            raise TypeError("Only DataFrames with Multi-Columns with 2 "
                            "Levels are supported by this class.")

        super().__init__(_df_loaded)

    @property
    def _constructor(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeriesData

    @property
    def _constructor_sliced(self):
        """Overwrite constructor method according to:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeries

    @property
    def filepath(self) -> Path:
        """Get the filepath associated with the time series data"""
        return self._filepath

    @filepath.setter
    def filepath(self, filepath: str):
        """Set the filepath associated with the time series data"""
        self._filepath = Path(filepath)
        # Keep the accessor's stored path in sync with this object.
        self.tsd.filepath = self._filepath

    @property
    def default_tag(self) -> str:
        """Get the default of time series data object"""
        return self._default_tag

    @default_tag.setter
    def default_tag(self, tag: str) -> None:
        """Set the default_tag of the time series data object
        :param tag: new tag
        :type tag: String
        """
        if not isinstance(tag, str):
            raise TypeError(f"Invalid type for default_tag! Expected 'str' but "
                            f"received {type(tag)}")
        if tag not in self.get_tags():
            raise KeyError(f"Tag '{tag}' does not exist for current data set!"
                           f"\n Available tags: {self.get_tags()}")
        self._default_tag = tag

    def save(self, filepath: str = None, **kwargs) -> None:
        """
        Save the current time-series-data into the given file-format.
        Currently supported are .hdf, which is an easy and fast storage,
        and, .csv is supported as an easy-readable option.
        Also, .parquet, and with additional compression .parquet.COMPRESSION_NAME
        are supported. Compressions could be gzip, brotli or snappy. For all possible
        compressions see the documentation of the parquet engines.
        For a small comparison of these data formats see https://github.com/RWTH-EBC/ebcpy/issues/81

        :param str,os.path.normpath filepath:
            Filepath were to store the data. Either .hdf, .csv, .parquet
            or .parquet.COMPRESSION_NAME has to be the file-ending.
            Default is current filepath of class.
        :keyword str key:
            Necessary keyword-argument for saving a .hdf-file.
            Specifies the key of the table in the .hdf-file.
        :keyword str sep:
            Separator used for saving as .csv. Default is ','.
        :keyword str engine:
            Chose the engine for reading .parquet files. Default is 'pyarrow'
            Other option is 'fastparquet' (python>=3.9).
        :return:
        """
        # If new settings are needed, update existing ones
        self._loader_kwargs.update(kwargs)
        self.tsd.save(filepath, **kwargs)

    def to_df(self, force_single_index=False):
        """
        Return the dataframe version of the current TimeSeriesData object.
        If all tags are equal, the tags are dropped.
        Else, the object is just converted.

        :param bool force_single_index:
            If True (not the default), the conversion to a standard
            DataFrame with a single index column (only variable names)
            is only done if no variable contains multiple tags.
        """
        if len(self.get_variables_with_multiple_tags()) == 0:
            if self._is_old_multicolumn_format:
                # Single tag per variable: the tag level carries no
                # information and can safely be dropped.
                return pd.DataFrame(self.droplevel(1, axis=1))
            return pd.DataFrame(self)
        if force_single_index:
            raise IndexError(
                "Can't automatically drop all tags "
                "as the following variables contain multiple tags: "
                f"{' ,'.join(self.get_variables_with_multiple_tags())}. "
            )
        return pd.DataFrame(self)

    def get_variable_names(self, patterns: Union[str, List[str]] = None) -> List[str]:
        """
        Return an alphabetically sorted list of variable names, optionally filtered by patterns.

        By default, returns all variable names found in the first level of the DataFrame's
        column MultiIndex, sorted alphabetically. If `patterns` is provided, only names
        matching one or more of the given literal strings or glob-style patterns
        (where `*` matches any sequence of characters) will be returned.

        :param patterns:
            - A single string or list of strings.
            - Each entry may be an exact variable name, or a pattern containing `*` as a wildcard.
            - If None, all variable names are returned.
        :return:
            A list of matching variable names, in alphabetical order.
        :raises KeyError:
            If any literal name or pattern does not match at least one variable in the DataFrame.

        Example:
            # return all wall temperatures at any layer
            tsd.get_variable_names("*wall.layer[*].T")
            ["wall.layer[1].T", "wall.layer[2].T", "wall.layer[3].T"]
        """
        return self.tsd.get_variable_names(patterns)

    def get_variables_with_multiple_tags(self) -> List[str]:
        """
        Return an alphabetically sorted list of all variables
        that contain more than one tag.

        :return: List[str]
        """
        var_names = self.columns.get_level_values(0)
        return sorted(var_names[var_names.duplicated()])

    def get_tags(self, variable: str = None) -> List[str]:
        """
        Return an alphabetically sorted list of all tags

        :param str variable:
            If given, tags of this variable are returned

        :return: List[str]
        :raises KeyError:
            If the object was not created with the old multicolumn format.
        """
        if not self._is_old_multicolumn_format:
            raise KeyError("You can't get tags for a TimeSeriesData object created with use_multicolumn=False!")
        if variable:
            tags = self.loc[:, variable].columns
            return sorted(tags)
        return sorted(self.columns.get_level_values(1).unique())

    @property
    def _is_old_multicolumn_format(self):
        """
        Helper function to check if the old multicolumn format is used.
        """
        return isinstance(self.columns, pd.MultiIndex)

    def get_columns_by_tag(self,
                           tag: str,
                           variables: list = None,
                           return_type: str = 'pandas',
                           drop_level: bool = False):
        """
        Returning all columns with defined tag in the form of ndarray.

        :param str tag:
            Define the tag which return columns have to
            match.
        :param list variables:
            Besides the given tag, specify the
            variables names matching the return criteria as well.
        :param boolean drop_level:
            If tag should be included in the response.
            Default is False.
        :param str return_type:
            Return format. Options are:
            - pandas (pd.series)
            - numpy, scipy, sp, and np (np.array)
            - control (transposed np.array)
        :return: ndarray of input signals
        """
        if not self._is_old_multicolumn_format:
            raise KeyError("You can't get tags for a TimeSeriesData object created with use_multicolumn=False!")

        # Extract columns
        if variables:
            _ret = self.loc[:, variables]
        else:
            _ret = self

        _ret = _ret.xs(tag, axis=1, level=1, drop_level=drop_level)

        # Return based on the given return_type
        if return_type.lower() == 'pandas':
            return _ret
        if return_type.lower() in ['numpy', 'scipy', 'sp', 'np']:
            return _ret.to_numpy()
        if return_type.lower() == 'control':
            return _ret.to_numpy().transpose()
        raise TypeError("Unknown return type")

    def to_datetime_index(self, unit_of_index="s", origin=None, inplace: bool = True):
        """
        Convert the current index to a datetime index using
        ebcpy.preprocessing.convert_index_to_datetime_index()

        :param str unit_of_index: default 's'
            The unit of the given index. Used to convert to
            total_seconds later on.
        :param datetime.datetime origin:
            The reference datetime object for the first index.
            Default is the current system time.
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: df
            Copy of DataFrame with correct index for usage in this
            framework.

        """
        # Resolve the default here (call time) instead of in the signature:
        # a ``datetime.now()`` default would be evaluated only once, at
        # class-definition time, freezing the "current" time for all calls.
        if origin is None:
            origin = datetime.now()
        return self.tsd.to_datetime_index(unit_of_index, origin, inplace)

    def to_float_index(self, offset=0, inplace: bool = True):
        """
        Convert the current index to a float based index using
        ebcpy.preprocessing.convert_datetime_index_to_float_index()

        :param float offset:
            Offset in seconds
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: pd.DataFrame df:
            DataFrame with correct index.
        """
        return self.tsd.to_float_index(offset, inplace)

    def clean_and_space_equally(self, desired_freq, inplace: bool = True):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.clean_and_space_equally_time_series()
        See the docstring of this function to know what is happening.

        :param str desired_freq:
            Frequency to determine number of elements in processed dataframe.
            Options are for example:
            - s: second-based
            - 5s: Every 5 seconds
            - 6min: Every 6 minutes
            This also works for h, d, m, y, ms etc.
        :param bool inplace:
            If True, performs operation inplace and returns None.
        :return: pd.DataFrame
            Cleaned and equally spaced data-frame
        """
        return self.tsd.clean_and_space_equally(desired_freq, inplace)

    def low_pass_filter(self, crit_freq, filter_order, variable,
                        tag=None, new_tag="low_pass_filter"):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.low_pass_filter()
        See the docstring of this function to know what is happening.
        If the old multicolumn format is used, the result is stored in the
        multicolumn header with the `new_tag`.

        :param float crit_freq:
            The critical frequency or frequencies.
        :param int filter_order:
            The order of the filter
        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :param str new_tag:
            The new tag to pass to the variable.
            Default is 'low_pass_filter'
        """
        result = self.tsd.low_pass_filter(crit_freq, filter_order, self._possibly_get_variable_and_tag(variable, tag))
        if self._is_old_multicolumn_format:
            self.loc[:, (variable, new_tag)] = result
        else:
            return result

    def moving_average(self, window, variable,
                       tag=None, new_tag="moving_average"):
        """
        Call to the preprocessing function
        ebcpy.preprocessing.moving_average()
        See the docstring of this function to know what is happening.
        If the old multicolumn format is used, the result is stored in the
        multicolumn header with the `new_tag`.

        :param int window:
            sample rate of input
        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one
        :param str new_tag:
            The new tag to pass to the variable.
            Default is 'moving_average'
        """
        result = self.tsd.moving_average(window, self._possibly_get_variable_and_tag(variable, tag))
        if self._is_old_multicolumn_format:
            self.loc[:, (variable, new_tag)] = result
        else:
            return result

    def _possibly_get_variable_and_tag(self, variable: str, tag: str = None):
        """
        Helper function to get numpy array based on variable and possible tag name,
        depending on whether multicolumn is used or not.

        :param str variable:
            The variable name to apply the filter to
        :param str tag:
            If this variable has more than one tag, specify which one

        """
        if tag is None:
            return variable
        if self._is_old_multicolumn_format:
            return (variable, tag)
        # Without multicolumns a tag cannot be addressed; fall back to
        # the plain variable name.
        return variable

    def number_lines_totally_na(self):
        """
        Returns the number of rows in the given dataframe
        that are filled with NaN-values.
        """
        return self.tsd.number_lines_totally_na()

    @property
    def frequency(self):
        """
        The frequency of the time series data.
        Returns's the mean and the standard deviation of
        the index.

        :returns:
            float: Mean value
            float: Standard deviation
        """
        return self.tsd.frequency

758 

759 

class TimeSeries(pd.Series):
    """Overwrites pd.Series to enable correct slicing
    and expansion in the TimeSeriesData class

    .. versionadded:: 0.1.7
    """

    @property
    def _constructor(self):
        """Keep one-dimensional results of pandas operations as
        TimeSeries objects, following the pandas subclassing guide:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeries

    @property
    def _constructor_expanddim(self):
        """Expand a TimeSeries back to two dimensions as a
        TimeSeriesData object, following the pandas subclassing guide:
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas"""
        return TimeSeriesData

778 

779 

def get_keys_of_hdf_file(filepath):
    """
    Find all keys in a given hdf-file.

    :param str,os.path.normpath filepath:
        Path to the .hdf-file
    :return: list
        List with all keys in the given file.
    """
    # h5py is an optional dependency; import lazily and degrade
    # gracefully when it is missing.
    # pylint: disable=import-outside-toplevel
    try:
        import h5py
    except ImportError:
        return ["ERROR: Could not obtain keys as h5py is not installed"]
    with h5py.File(filepath, 'r') as hdf_file:
        return list(hdf_file.keys())

796 

797 

def load_time_series_data(data: Union[str, Any], **kwargs) -> pd.DataFrame:
    """
    Load time series data from various sources into a pandas DataFrame with
    custom time series accessor methods available via .tsd property.

    :param str,os.path.normpath,pd.DataFrame data:
        Filepath ending with either .hdf, .mat, .csv, .parquet,
        or .parquet.COMPRESSION_NAME containing
        time-dependent data to be loaded as a pandas.DataFrame.
        Alternative option is to pass a DataFrame directly.
    :keyword str key:
        Name of the table in a .hdf-file if the file
        contains multiple tables.
    :keyword str sep:
        separator for the use of a csv file. If none is provided,
        a comma (",") is used as a default value.
        See pandas.read_csv() docs for further information.
    :keyword int, list header:
        Header columns for .csv files.
        See pandas.read_csv() docs for further information.
        Default is first row (0).
    :keyword int,str index_col:
        Column to be used as index in .csv files.
        See pandas.read_csv() docs for further information.
        Default is first column (0).
    :keyword str sheet_name:
        Name of the sheet you want to load data from. Required keyword
        argument when loading a xlsx-file.
    :keyword str engine:
        Chose the engine for reading .parquet files. Default is 'pyarrow'
        Other option is 'fastparquet' (python>=3.9).
    :keyword list variable_names:
        List of variable names to load from .mat file. If you
        know which variables you want to plot, this may speed up
        loading significantly, and reduce memory size drastically.
        You can also supply wildcard patterns (e.g. "*wall.layer[*].T", etc.)
        to match multiple variables at once.
    :return: pd.DataFrame
        DataFrame with custom .tsd accessor containing time series functionality

    Examples:

    Create a DataFrame with random data:

    >>> import numpy as np
    >>> from ebcpy import load_time_series_data
    >>> df = load_time_series_data({"my_variable": np.random.rand(5)})
    >>> df.tsd.to_datetime_index()
    >>> df.tsd.save("my_new_data.csv")

    Now, let's load the recently created file:

    >>> df = load_time_series_data("my_new_data.csv")
    """
    # Guard clause: an existing DataFrame is simply copied.
    if isinstance(data, pd.DataFrame):
        return data.copy()
    # A string or Path is interpreted as a file to load; remember the
    # source on the accessor for later saving.
    if isinstance(data, (str, Path)):
        source = Path(data)
        df = _load_df_from_file(file=source, **kwargs)
        df.tsd.filepath = source
        return df
    # Anything else is handed to the plain DataFrame constructor.
    return pd.DataFrame(
        data=data,
        index=kwargs.get("index", None),
        columns=kwargs.get("columns", None),
        dtype=kwargs.get("dtype", None),
        copy=kwargs.get("copy", False),
    )

867 

868 

def _load_df_from_file(file, **kwargs):
    """
    Function to load a given filepath into a dataframe

    :param Path file: File path to load
    :param kwargs: Additional loading parameters
    :return: pd.DataFrame
    :raises FileNotFoundError: If the file does not exist.
    :raises KeyError: For .hdf files with multiple tables and no key,
        or excel files without a sheet_name.
    :raises TypeError: If the file suffix is not supported.
    :raises IndexError: If the index is neither numeric nor convertible
        to a DatetimeIndex.
    """
    # Check whether the file exists
    if not os.path.isfile(file):
        raise FileNotFoundError(
            f"The given filepath {file} could not be opened")

    # Open based on file suffix.
    # Currently, hdf, csv, excel, parquet and Modelica result files (mat)
    # are supported.
    if file.suffix == ".hdf":
        # Load the current file as a hdf to a dataframe.
        # As specifying the key can be a problem, the user will
        # get all keys of the file if one is necessary but not provided.
        key = kwargs.get("key")
        if key == "":
            key = None  # Avoid cryptic error in pandas by converting empty string to None
        try:
            df = pd.read_hdf(file, key=key)
        except (ValueError, KeyError) as error:
            keys = ", ".join(get_keys_of_hdf_file(file))
            raise KeyError(f"key must be provided when HDF5 file contains multiple datasets. "
                           f"Here are all keys in the given hdf-file: {keys}") from error
    elif file.suffix == ".csv":
        # Check if file was previously a TimeSeriesData object
        with open(file, "r") as _f:
            lines = [_f.readline() for _ in range(2)]
        # Backwards compatible assumption: Users never changed '_multi_col_names'
        if (lines[0].startswith("Variables") and
                lines[1].startswith("Tags")):
            _hea_def = [0, 1]
        else:
            _hea_def = 0

        df = pd.read_csv(
            file,
            sep=kwargs.get("sep", ","),
            index_col=kwargs.get("index_col", 0),
            header=kwargs.get("header", _hea_def)
        )
    elif file.suffix == ".mat":
        df = sr.mat_to_pandas(
            fname=file,
            with_unit=False,
            names=kwargs.get("variable_names")
        )
    elif file.suffix in ['.xlsx', '.xls', '.odf', '.ods', '.odt']:
        sheet_name = kwargs.get("sheet_name")
        if sheet_name is None:
            raise KeyError("sheet_name is a required keyword argument to load xlsx-files."
                           "Please pass a string to specify the name "
                           "of the sheet you want to load.")
        df = pd.read_excel(io=file, sheet_name=sheet_name)
    elif ".parquet" in file.name:
        df = pd.read_parquet(path=file, engine=kwargs.get('engine', 'pyarrow'))
    else:
        # Message fixed to list all formats actually handled above.
        raise TypeError("Only .hdf, .csv, .xlsx/.xls/.odf/.ods/.odt, "
                        ".parquet(.COMPRESSION_NAME) and .mat are supported!")
    # Enforce a supported index: numeric or datetime. Anything else is
    # converted to a DatetimeIndex if possible.
    if not isinstance(df.index, tuple(datetime_indexes)) and not index_is_numeric(df.index):
        try:
            df.index = pd.DatetimeIndex(df.index)
        except Exception as err:
            raise IndexError(
                f"Given data has index of type {type(df.index)}. "
                f"Currently only numeric indexes and the following are supported:"
                f"{' ,'.join([str(idx) for idx in [pd.RangeIndex] + datetime_indexes])} "
                f"Automatic conversion to pd.DateTimeIndex failed"
                f"see error above."
            ) from err
    return df