Coverage for ebcpy/preprocessing.py: 92%

177 statements  


1""" 

2This general overview may help you find the function you need: 

3 

4- Remove duplicate rows by averaging the values 

5 (``build_average_on_duplicate_rows``) 

6- Convert any integer or float index into a datetime index 

7 (``convert_index_to_datetime_index``) 

8- Resample a given time-series on a given frequency 

9 (``clean_and_space_equally_time_series``) 

10- Apply a low-pass-filter (``low_pass_filter``) 

11- Apply a moving average to flatten disturbances 

12 in your measured data (``moving_average``) 

13- Convert e.g. an electrical power signal into a binary 

14 control signal (on-off) based on a threshold (``create_on_off_signal``) 

15- Find the number of lines without any values in it (``number_lines_totally_na``) 

16- Split a data-set into training and test set according to 

17 cross-validation (``cross_validation``) 

18 

19All functions in the pre-processing module should have a doctest. We refer to the example 

20in this doctest for a better understanding of the functions. If you don't understand 

21the behaviour of a function or the meaning, please raise an issue. 
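
A typical chain of operations might look like this (illustrative sketch
with random data, not a prescribed workflow):

>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"val": np.random.rand(100)})
>>> df = convert_index_to_datetime_index(df)
>>> df = clean_and_space_equally_time_series(df, desired_freq="1s")
>>> df["val_smoothed"] = moving_average(df["val"].values, window=10)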

22""" 

23import warnings 

24import logging 

25from typing import Union, TYPE_CHECKING 

26 

27from datetime import datetime 

28from scipy import signal 

29from sklearn import model_selection 

30from pandas.tseries.frequencies import to_offset 

31import numpy as np 

32import pandas as pd 

33import scipy.stats as st 

34 

35if TYPE_CHECKING: 

36 from ebcpy import TimeSeriesData 

37 

38logger = logging.getLogger(__name__) 

39 

40 

def build_average_on_duplicate_rows(df: Union[pd.DataFrame, "TimeSeriesData"]) -> pd.DataFrame:
    """
    If the dataframe has duplicate indexes, the average
    value of all those indexes is calculated and given to
    the first occurrence of this duplicate index. Therefore,
    the DataFrame should already be sorted before calling this
    function.

    :param pd.DataFrame df:
        DataFrame with the data to process
    :return: pd.DataFrame
        The processed DataFrame

    Example:

    >>> df = pd.DataFrame({"idx": np.ones(5), "val": np.arange(5)}).set_index("idx")
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> print(df)
                         val
    idx
    2007-01-01 00:00:01    0
    2007-01-01 00:00:01    1
    2007-01-01 00:00:01    2
    2007-01-01 00:00:01    3
    2007-01-01 00:00:01    4
    >>> print(build_average_on_duplicate_rows(df))
                         val
    idx
    2007-01-01 00:00:01  2.0
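
    For plain DataFrames, a vectorized one-liner gives the same result
    (illustrative alternative, not used internally):

    >>> print(df.groupby(level=0).mean())
                         val
    idx
    2007-01-01 00:00:01  2.0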

70 """ 

71 # Find entries that are exactly the same timestamp 

72 double_ind = df.index[df.index.duplicated()].unique() 

73 # Calculate the mean value 

74 mean_values = [] 

75 for item in double_ind: 

76 mean_values.append(df.loc[item].values.mean(axis=0)) 

77 # Delete duplicate indices 

78 df_dropped = df[~df.index.duplicated(keep='first')].copy() 

79 

80 # Set mean values in rows that were duplicates before 

81 for idx, values in zip(double_ind, mean_values): 

82 df_dropped.loc[idx] = values 

83 

84 return df_dropped 

85 

86 

def convert_index_to_datetime_index(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        unit_of_index: str = "s",
        origin: datetime = datetime.now(),
        inplace: bool = False
) -> pd.DataFrame:
    """
    Converts the index of the given DataFrame to a
    pandas.core.indexes.datetimes.DatetimeIndex.

    :param pd.DataFrame,TimeSeriesData df:
        dataframe with an index that is not a DateTime.
        Only numeric indexes are supported. Every value
        is interpreted with the given unit, which defaults
        to seconds.
    :param str unit_of_index: default 's'
        The unit of the given index. Used to convert to
        total_seconds later on.
    :param datetime.datetime origin:
        The reference datetime object for the first index.
        Default is the current system time.
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: df
        Copy of the DataFrame with the correct index for usage in this
        framework.

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(df)
         A    B    C    D
    0  1.0  1.0  1.0  1.0
    1  1.0  1.0  1.0  1.0
    2  1.0  1.0  1.0  1.0
    >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 00:00:01  1.0  1.0  1.0  1.0
    2007-01-01 00:00:02  1.0  1.0  1.0  1.0
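
    For data in other units, pass ``unit_of_index``; e.g. for an
    hour-based index (illustrative):

    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(convert_index_to_datetime_index(df, unit_of_index="h",
    ...                                       origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 01:00:00  1.0  1.0  1.0  1.0
    2007-01-01 02:00:00  1.0  1.0  1.0  1.0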

    """
    # Check for unit of given index. Maybe one uses hour-based data.
    # The dict maps each unit to the number of that unit per second;
    # dividing the index by the factor converts it to seconds.
    _unit_conversion_to_seconds = {"ms": 1e3,
                                   "s": 1,
                                   "min": 1 / 60,
                                   "h": 1 / 3600,
                                   "d": 1 / 86400}
    if unit_of_index not in _unit_conversion_to_seconds:
        raise ValueError("Given unit_of_index is not supported.")
    _unit_factor_to_seconds = _unit_conversion_to_seconds.get(unit_of_index)

    # Convert
    old_index = df.index.copy()
    # Check if already converted:
    if isinstance(old_index, pd.DatetimeIndex):
        return df
    # Convert strings to numeric values.
    old_index = pd.to_numeric(old_index)
    # Convert to seconds.
    old_index /= _unit_factor_to_seconds
    # Alter the index
    index = pd.to_datetime(old_index, unit="s", origin=origin)
    if inplace:
        df.index = index
        return None
    df_copy = df.copy()
    df_copy.index = index
    return df_copy


def convert_datetime_index_to_float_index(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        offset: float = 0,
        inplace: bool = False
) -> pd.DataFrame:
    """
    Convert a datetime-based index to a FloatIndex (in seconds).
    Seconds are used as the standard unit, as simulation software
    (e.g. Modelica) outputs data in seconds.

    :param pd.DataFrame,TimeSeriesData df:
        DataFrame to be converted to FloatIndex
    :param float offset:
        Offset in seconds
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: pd.DataFrame df:
        DataFrame with the correct index

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 00:00:01  1.0  1.0  1.0  1.0
    2007-01-01 00:00:02  1.0  1.0  1.0  1.0
    >>> print(convert_datetime_index_to_float_index(df))
           A    B    C    D
    0.0  1.0  1.0  1.0  1.0
    1.0  1.0  1.0  1.0  1.0
    2.0  1.0  1.0  1.0  1.0
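
    A non-zero ``offset`` shifts the resulting index (illustrative):

    >>> print(convert_datetime_index_to_float_index(df, offset=10))
            A    B    C    D
    10.0  1.0  1.0  1.0  1.0
    11.0  1.0  1.0  1.0  1.0
    12.0  1.0  1.0  1.0  1.0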

192 """ 

193 # Check correct input 

194 if not isinstance(df.index, pd.DatetimeIndex): 

195 raise IndexError("Given DataFrame has no DatetimeIndex, conversion not possible") 

196 

197 new_index = np.round(pd.to_timedelta(df.index - df.index[0]).total_seconds(), 4) + offset 

198 if inplace: 

199 df.index = new_index 

200 return None 

201 df_copy = df.copy() 

202 df_copy.index = new_index 

203 return df_copy 

204 

205 

def time_based_weighted_mean(df: Union[pd.DataFrame, "TimeSeriesData"]) -> np.ndarray:
    """
    Creates the weighted mean according to a time index that does not
    need to be equidistant.
    Further info:
    https://stackoverflow.com/questions/26343252/create-a-weighted-mean-for-a-irregular-timeseries-in-pandas

    :param pd.DataFrame df:
        A pandas DataFrame with DatetimeIndex.
    :return np.array:
        A numpy array containing the weighted means of all columns

    Example:

    >>> from datetime import datetime
    >>> import numpy as np
    >>> import pandas as pd
    >>> time_vec = [datetime(2007, 1, 1, 0, 0),
    ...             datetime(2007, 1, 1, 0, 0),
    ...             datetime(2007, 1, 1, 0, 5),
    ...             datetime(2007, 1, 1, 0, 7),
    ...             datetime(2007, 1, 1, 0, 10)]
    >>> df = pd.DataFrame({'A': [1, 2, 4, 3, 6], 'B': [11, 12, 14, 13, 16]}, index=time_vec)
    >>> print(time_based_weighted_mean(df=df))
    [ 3.55 13.55]

    """

    if not isinstance(df.index, pd.DatetimeIndex):
        raise IndexError(f"df.index must be DatetimeIndex, but it is {type(df.index)}.")

    time_delta = [(x - y).total_seconds() for x, y in zip(df.index[1:], df.index[:-1])]
    # Trapezoidal weights: each sample is weighted with the sum of the
    # time steps to its left and right neighbour
    weights = [x + y for x, y in zip([0] + time_delta, time_delta + [0])]
    # Create empty numpy array
    res = np.empty(len(df.columns))
    res[:] = np.nan
    for i, col_name in enumerate(df.columns):
        res[i] = np.average(df[col_name], weights=weights)
    return res


def clean_and_space_equally_time_series(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        desired_freq: str,
        confidence_warning: float = 0.95
) -> pd.DataFrame:
    """
    Function for cleaning the given DataFrame and interpolating
    based on the given desired frequency. Linear interpolation
    is used.

    :param pd.DataFrame,TimeSeriesData df:
        Unclean DataFrame. Needs to have a pd.DateTimeIndex
    :param str desired_freq:
        Frequency to determine the number of elements in the processed dataframe.
        Options are for example:
        - s: second-based
        - 5s: Every 5 seconds
        - 6min: Every 6 minutes
        This also works for h, d, m, y, ms etc.
    :param float confidence_warning:
        Value to check the confidence interval of input data without
        a defined frequency. If the desired frequency is outside of
        the resulting confidence interval, a warning is issued.
    :return: pd.DataFrame
        Cleaned and equally spaced data-frame

    Example:
    **Note:** The example uses random data. Try out different sampling
    frequencies. You will be warned if the sampling rate is too high or too low.

    >>> df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
    ...                   columns=list('ABCD')).set_index("A").sort_index()
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> clean_and_space_equally_time_series(df, "30s")
    >>> import matplotlib.pyplot as plt
    >>> plt.plot(df["B"], label="Raw data")
    >>> df = clean_and_space_equally_time_series(df.copy(), "1500ms")
    >>> plt.plot(df["B"], label="Cleaned and spaced equally")
    >>> plt.legend()
    >>> plt.show()

    .. versionchanged:: 0.1.7
    """
    from ebcpy import TimeSeriesData

    # Convert indexes to datetime_index:
    if not isinstance(df.index, pd.DatetimeIndex):
        if isinstance(df, TimeSeriesData):
            raise TypeError("TimeSeriesData needs a DateTimeIndex for executing this function. "
                            "Call to_datetime_index() to convert any index to "
                            "a DateTimeIndex")
        # Else
        raise TypeError("DataFrame needs a DateTimeIndex for executing this function. "
                        "Call convert_index_to_datetime_index() to convert any index to "
                        "a DateTimeIndex")
    # %% Check DataFrame for NaNs
    # Create a pandas Series with the number of invalid values for each column of df
    series_with_na = df.isnull().sum()
    for name in series_with_na.index:
        if series_with_na.loc[name] > 0:
            # Log only columns with invalid values
            logger.info("%s has the following number of invalid "
                        "values\n %s", name, series_with_na.loc[name])
    # Drop all rows where at least one NA exists
    df_temp = df.dropna(how='any')

    # Check if the DataFrame still has non-numeric values:
    if not all(df_temp.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all())):
        raise ValueError("Given DataFrame contains non-numeric values.")

    # Merge duplicate rows using mean.
    df_temp = build_average_on_duplicate_rows(df_temp)

    # Warn the user in two cases: upsampling and data input without a fixed freq.
    # Check if the frequency differs
    old_freq, old_freq_std, old_freq_sem, time_steps = get_df_index_frequency_mean_and_std(
        df_index=df_temp.index,
        verbose=True)
    if old_freq_std > 0:
        _ns_to_s = 1e9
        # Calculate the confidence interval of the mean value of the old frequency
        cfd_int = st.t.interval(confidence_warning,
                                time_steps - 1,
                                loc=old_freq,
                                scale=old_freq_sem)
        # Convert to timedelta
        cfd_int = pd.to_timedelta((cfd_int[0] * _ns_to_s, cfd_int[1] * _ns_to_s))
        _td_freq = pd.to_timedelta(desired_freq)
        if (_td_freq < cfd_int[0]) or (_td_freq > cfd_int[1]):
            in_seconds = np.array(cfd_int.values.tolist()) / _ns_to_s  # From nanoseconds
            warnings.warn(f"Input data has no fixed frequency, and the desired frequency "
                          f"{_td_freq.value / _ns_to_s} seconds is outside the resulting "
                          f"confidence interval {in_seconds} (in seconds). "
                          "Carefully check the result to see if you "
                          "introduced errors to the data.")

    # %% Re-sampling to new frequency with linear interpolation
    # Create new equally spaced DatetimeIndex. The last entry is always <= df.index[-1]
    time_index = pd.date_range(start=df.index[0], end=df.index[-1], freq=desired_freq)
    new_freq, _ = get_df_index_frequency_mean_and_std(df_index=time_index)

    # Check if the user is trying to upsample the data
    # (i.e. the new time step is smaller than the old one):
    if old_freq_std == 0:
        if new_freq < old_freq:
            warnings.warn("You are upsampling your data. This may be dangerous. "
                          "Carefully check the result to see if you introduced errors to the data.")

    # Create an empty data frame
    # If multi-columns are used, first get the old index and make it empty:
    multi_cols = df_temp.columns
    if isinstance(multi_cols, pd.MultiIndex):
        empty_multi_cols = pd.MultiIndex.from_product([[] for _ in range(multi_cols.nlevels)],
                                                      names=multi_cols.names)
        df_time_temp = pd.DataFrame(index=time_index, columns=empty_multi_cols)
    else:
        df_time_temp = pd.DataFrame(index=time_index)

    # Insert the temporary time_index into df. fill_value = 0 can only be used
    # since all NaNs were eliminated prior to this step
    df_temp = df_temp.radd(df_time_temp, axis='index', fill_value=0)
    del df_time_temp

    # Interpolate linearly according to time index
    df_temp.interpolate(method='time', axis=0, inplace=True)
    # Determine the Timedelta between the current first index entry
    # in df and the first index entry that would be created
    # when applying df.resample() without loffset
    delta_time = df.index[0] - \
                 df_temp.resample(rule=desired_freq).first().first(desired_freq).index[0]
    # Resample to the equally spaced index.
    # All fields should already have a value. Thus NaNs and maybe +/- infs
    # should have been filtered beforehand.

    # Check if the given dataframe was a TimeSeriesData object and, if so, keep that type
    if isinstance(df_temp, TimeSeriesData):
        df_temp = df_temp.resample(rule=desired_freq).first()
        df_temp.index = df_temp.index + to_offset(delta_time)
        df_temp = TimeSeriesData(df_temp)
    else:
        df_temp = df_temp.resample(rule=desired_freq).first()
        df_temp.index = df_temp.index + to_offset(delta_time)
    del delta_time

    return df_temp


def low_pass_filter(data: np.ndarray, crit_freq: float, filter_order: int) -> np.ndarray:
    """
    Create a low pass filter with given order and frequency.

    :param numpy.ndarray data:
        For a dataframe, e.g. df['a_col_name'].values
    :param float crit_freq:
        The normalized critical frequency (0 < crit_freq < 1, where 1
        corresponds to the Nyquist frequency), passed to scipy.signal.butter.
    :param int filter_order:
        The order of the filter
    :return: numpy.ndarray

    Example:

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> rand_series = np.random.rand(100)
    >>> plt.plot(rand_series, label="reference")
    >>> plt.plot(low_pass_filter(rand_series, 0.2, 2), label="filtered")
    >>> plt.legend()
    >>> plt.show()

    """
    if len(data.shape) > 1:  # Check if given data has multiple dimensions
        if data.shape[1] == 1:
            data = data[:, 0]  # Resize to 1D-Array
        else:
            raise ValueError("Given data has multiple dimensions. "
                             "Only one-dimensional arrays are supported in this function.")
    _filter_order = int(filter_order)
    numerator, denominator = signal.butter(N=_filter_order, Wn=crit_freq,
                                           btype='low', analog=False, output='ba')
    # filtfilt applies the filter forward and backward, i.e. with zero phase shift
    output = signal.filtfilt(numerator, denominator, data)
    return output


def moving_average(data: np.ndarray, window: int) -> np.ndarray:
    """
    Create a moving average of the input array.

    :param np.ndarray data:
        For a dataframe, e.g. df['a_col_name'].values
    :param int window:
        Window size (number of samples) used for averaging
    :return: numpy.array
        Array of shape (len(data),). The first and last points of the
        input are extrapolated as constant values (hold first and last point).

    Example:

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> series = np.sin(np.linspace(-30, 30, 1000))
    >>> plt.plot(series, label="reference")
    >>> plt.plot(moving_average(series, 10), label="window=10")
    >>> plt.plot(moving_average(series, 50), label="window=50")
    >>> plt.plot(moving_average(series, 100), label="window=100")
    >>> plt.legend()
    >>> plt.show()

    """
    if len(data.shape) > 1:  # Check if given data has multiple dimensions
        if data.shape[1] == 1:
            data = data[:, 0]  # Resize to 1D-Array
        else:
            raise ValueError("Given data has multiple dimensions. "
                             "Only one-dimensional arrays are supported in this function.")
    window = int(window)
    weights = np.repeat(1.0, window) / window
    sma = np.convolve(data, weights, 'valid')
    # Pad the start with window/2 copies of the first moving-average value
    fill_start = np.full((int(np.floor(window / 2)), 1), sma[0])
    # Same with the last value of -data-
    fill_end = np.full((int(np.ceil(window / 2)) - 1, 1), sma[-1])
    # Stack the arrays
    sma = np.concatenate((fill_start[:, 0], sma, fill_end[:, 0]), axis=0)
    return sma


def create_on_off_signal(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        col_names: list,
        threshold: Union[float, list],
        col_names_new: list,
        tags: Union[list, str] = "raw",
        new_tag: str = "converted_signal"
):
    """
    Create on and off signals based on the given threshold for all column names.

    :param pd.DataFrame,TimeSeriesData df:
        DataFrame with the data to process
    :param list col_names:
        Column names of variables to convert to signals
    :param float,list threshold:
        Threshold for all column-names (single float) or
        a list with specific thresholds for specific columns.
    :param list col_names_new:
        New names for the signal-columns
    :param str,list tags:
        If a 2-level DataFrame for TimeSeriesData is used, one has to
        specify the tag of the variables. The default is the "raw"
        tag set in the TimeSeriesData class. However, one can specify a list
        (a different tag for each variable) or pass a string
        (the same tag for all given variables).
    :param str new_tag:
        The tag the newly created variable will hold. This can be used to
        indicate where the signal was converted from.
    :return: pd.DataFrame
        Copy of the DataFrame with the created signals added.

    Example:

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> df = pd.DataFrame({"P_el": np.sin(np.linspace(-20, 20, 10000))*100})
    >>> df = create_on_off_signal(df, col_names=["P_el"],
    ...                           threshold=25, col_names_new=["Device On"])
    >>> plt.plot(df)
    >>> plt.show()

512 """ 

513 if len(col_names) != len(col_names_new): 

514 raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, " 

515 f"col_names_new: {len(col_names_new)}") 

516 if isinstance(threshold, list): 

517 if len(col_names) != len(threshold): 

518 raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, " 

519 f"threshold: {len(threshold)}") 

520 else: 

521 threshold = [threshold for _ in enumerate(col_names)] 

522 # Do on_off signal creation for all desired columns 

523 df_copy = df.copy() 

524 if isinstance(df.columns, pd.MultiIndex): 

525 # Convert given tags to a list 

526 if isinstance(tags, str): 

527 tags = [tags for _ in enumerate(col_names)] 

528 

529 for i, _ in enumerate(col_names): 

530 # Create zero-array 

531 df_copy.loc[:, (col_names_new[i], new_tag)] = 0.0 

532 # Change all values to 1.0 according to threshold 

533 df_copy.loc[ 

534 df_copy[col_names[i], tags[i]] >= threshold[i], (col_names_new[i], new_tag)] = 1.0 

535 else: 

536 for i, _ in enumerate(col_names): 

537 # Create zero-array 

538 df_copy.loc[:, col_names_new[i]] = 0.0 

539 # Change all values to 1.0 according to threshold 

540 df_copy.loc[df_copy[col_names[i]] >= threshold[i], col_names_new[i]] = 1.0 

541 return df_copy 

542 

543 

def number_lines_totally_na(df: Union[pd.DataFrame, "TimeSeriesData"]) -> int:
    """
    Returns the number of rows in the given dataframe
    that are filled only with NaN-values.

    :param pd.DataFrame,TimeSeriesData df:
        Given dataframe to process
    :return: int
        Number of NaN-rows.

    Example:

    >>> import numpy as np
    >>> import pandas as pd
    >>> dim = np.random.randint(100) + 10
    >>> nan_col = [np.nan for i in range(dim)]
    >>> col = [i for i in range(dim)]
    >>> df_nan = pd.DataFrame({"col_1": nan_col, "col_2": nan_col})
    >>> df_normal = pd.DataFrame({"col_1": nan_col, "col_2": col})
    >>> print(number_lines_totally_na(df_nan) - dim)
    0
    >>> print(number_lines_totally_na(df_normal))
    0
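
    A vectorized one-liner gives the same count (illustrative alternative):

    >>> print(int(df_nan.isnull().all(axis=1).sum()) - dim)
    0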

567 """ 

568 if not isinstance(df, pd.DataFrame): 

569 raise TypeError('Input must be a pandas data frame') 

570 counter = 0 

571 for _, row in df.iterrows(): 

572 # Check if the whole row is filled with NaNs. 

573 if all(row.isnull()): 

574 counter += 1 

575 return counter 

576 

577 

def z_score(x: np.ndarray, limit=3) -> np.ndarray:
    """
    Calculate the z-score using the mean
    and standard deviation of the given data.

    :param np.array x:
        For a dataframe, e.g. df['a_col_name'].values
    :param float limit: default 3
        Absolute z-score above which a value is considered an outlier
    :return: np.array:
        Indices of the values whose absolute z-score exceeds the limit

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]

    """
    mean = np.mean(x)
    standard_deviation = np.std(x)
    z_score_value = (x - mean) / standard_deviation
    return np.where(np.abs(z_score_value) > limit)[0]


def modified_z_score(x: np.ndarray, limit: float = 3.5) -> np.ndarray:
    """
    Calculate the modified z-score using the median
    and median absolute deviation of the given data.

    :param np.array x:
        For a dataframe, e.g. df['a_col_name'].values
    :param float limit: default 3.5
        Absolute modified z-score above which a value is considered an outlier
    :return: np.array:
        Indices of the values whose absolute modified z-score exceeds the limit

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = modified_z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]

    """
    median = np.median(x)
    median_absolute_deviation = np.median(np.abs(x - median))
    # 0.6745 scales the MAD to the standard deviation of normally distributed data
    z_score_mod = 0.6745 * (x - median) / median_absolute_deviation
    return np.where(np.abs(z_score_mod) > limit)[0]


def interquartile_range(x: np.ndarray) -> np.ndarray:
    """
    Calculate the interquartile range of the given array.
    Returns the indices of values outside of the interquartile-range fences.

    :param np.array x:
        For a dataframe, e.g. df['a_col_name'].values
    :return: np.array:
        Indices of the values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = interquartile_range(normal_dis)
    >>> values = normal_dis[res]

    """
    quartile_1, quartile_3 = np.percentile(x, [25, 75])
    iqr = quartile_3 - quartile_1
    lower = quartile_1 - (iqr * 1.5)
    upper = quartile_3 + (iqr * 1.5)
    return np.where((x > upper) | (x < lower))[0]


def cross_validation(x, y, test_size=0.3):
    """
    Split the data set randomly with test_size
    (if test_size = 0.30, 70 % are training data).
    You can use this function for segmentation tasks.
    Time-series data should not be split with this function,
    as the results would not be coherent in time.

    :param x:
        Indexables with same length / shape[0] as y.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    :param list,np.ndarray,pd.DataFrame y:
        Indexables with same length / shape[0] as x.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    :param float test_size:
        Value between 0 and 1 specifying what percentage of the data
        will be used for testing.
    :return: list
        Split data into 4 objects. The order is:
        x_train, x_test, y_train, y_test

    Example:

    >>> import numpy as np
    >>> x = np.random.rand(100)
    >>> y = np.random.rand(100)
    >>> ret = cross_validation(x, y)
    >>> len(ret)
    4
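
    The four objects can be unpacked directly:

    >>> x_train, x_test, y_train, y_test = ret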

686 """ 

687 return model_selection.train_test_split(x, y, test_size=test_size) 

688 

689 

def get_df_index_frequency_mean_and_std(df_index: pd.Index, verbose: bool = False):
    """
    Function to get the mean and std of the index-frequency.
    If the index is a DatetimeIndex, the time steps are converted from
    nanoseconds to seconds.
    Else, the values are assumed to be in seconds.

    :param pd.Index df_index:
        Time index.
    :param bool verbose:
        Default False. If True, the standard error of the mean and the
        number of time steps are returned in addition to the mean value
        and standard deviation.

    :returns:
        float: Mean value
        float: Standard deviation
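
    Example (equidistant one-second index, illustrative):

    >>> import pandas as pd
    >>> index = pd.date_range("2007-01-01", periods=4, freq="1s")
    >>> mean, std = get_df_index_frequency_mean_and_std(df_index=index)
    >>> print(mean, std)
    1.0 0.0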

706 """ 

707 

708 if isinstance(df_index, pd.DatetimeIndex): 

709 index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64) * 1e-9 

710 else: 

711 index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64) 

712 if verbose: 

713 return np.mean(index_in_s), np.std(index_in_s), st.sem(index_in_s), len(index_in_s) 

714 else: 

715 return np.mean(index_in_s), np.std(index_in_s)