Coverage for ebcpy/preprocessing.py: 95%

173 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-09-19 12:21 +0000

1""" 

2This general overview may help you find the function you need: 

3 

4- Remove duplicate rows by averaging the values 

5 (``build_average_on_duplicate_rows``) 

6- Convert any integer or float index into a datetime index 

7 (``convert_index_to_datetime_index``) 

8- Resample a given time-series on a given frequency 

9 (``clean_and_space_equally_time_series``) 

10- Apply a low-pass-filter (``low_pass_filter``) 

11- Apply a moving average to flatten disturbances 

12 in your measured data (``moving_average``) 

13- Convert e.g. an electrical power signal into a binary 

14 control signal (on-off) based on a threshold (``create_on_off_signal``) 

15- Find the number of lines without any values in it (``number_lines_totally_na``) 

16- Split a data-set into training and test set according to 

17 cross-validation (``cross_validation``) 

18 

19All functions in the pre-processing module should have a doctest. We refer to the example 

20in this doctest for a better understanding of the functions. If you don't understand 

21the behaviour of a function or the meaning, please raise an issue. 

22""" 

23import warnings 

24import logging 

25from datetime import datetime 

26from scipy import signal 

27from sklearn import model_selection 

28from pandas.tseries.frequencies import to_offset 

29import numpy as np 

30import pandas as pd 

31import scipy.stats as st 

32from ebcpy import data_types 

33 

34logger = logging.getLogger(__name__) 

35 

36 

def build_average_on_duplicate_rows(df):
    """
    Collapse rows that share an index label into one averaged row.

    For every index label that occurs more than once, the column-wise
    mean over all rows with that label is written into the first
    occurrence; the remaining occurrences are dropped. Because only the
    first occurrence keeps its position, the DataFrame should already
    be sorted before calling this function.

    :param pd.DataFrame df:
        DataFrame with the data to process. It is not modified.
    :return: pd.DataFrame
        Copy of ``df`` without duplicate index labels.

    Example:

    >>> df = pd.DataFrame({"val": [0.0, 1.0, 2.0]}, index=[1, 1, 2])
    >>> build_average_on_duplicate_rows(df)["val"].tolist()
    [0.5, 2.0]
    """
    # True for every occurrence of a label except the first one
    is_repeated = df.index.duplicated()
    repeated_labels = df.index[is_repeated].unique()

    # All averages are taken from the untouched input before rows are dropped
    averages = [df.loc[label].values.mean(axis=0) for label in repeated_labels]

    # Keep only the first occurrence of each label
    deduplicated = df[~is_repeated].copy()

    # Overwrite the kept rows with the averaged values
    for label, average in zip(repeated_labels, averages):
        deduplicated.loc[label] = average

    return deduplicated

81 

82 

def convert_index_to_datetime_index(df, unit_of_index="s", origin=None,
                                    inplace: bool = False):
    """
    Converts the index of the given DataFrame to a
    pandas.core.indexes.datetimes.DatetimeIndex.

    :param pd.DataFrame df:
        dataframe with index not being a DateTime.
        Only numeric indexes (or strings convertible to numbers)
        are supported. Every value is interpreted with the given
        unit, standard form is in seconds.
    :param str unit_of_index: default 's'
        The unit of the given index. One of 'ms', 's', 'min', 'h', 'd'.
        Used to convert the index values to seconds.
    :param datetime.datetime origin:
        The reference datetime object for the first index.
        If None (default), the system time at the moment of the
        call is used.
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: df
        Copy of DataFrame with correct index for usage in this
        framework. If the index already is a DatetimeIndex, ``df``
        itself is returned unchanged, regardless of ``inplace``.
    :raises ValueError:
        If ``unit_of_index`` is not supported.

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> df_dt = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> [str(stamp) for stamp in df_dt.index]
    ['2007-01-01 00:00:00', '2007-01-01 00:00:01', '2007-01-01 00:00:02']

    """
    # Number of index units per second; dividing by it yields seconds.
    _unit_conversion_to_seconds = {"ms": 1e3,
                                   "s": 1,
                                   "min": 1 / 60,
                                   "h": 1 / 3600,
                                   "d": 1 / 86400}
    if unit_of_index not in _unit_conversion_to_seconds:
        raise ValueError("Given unit_of_index is not supported.")
    _unit_factor_to_seconds = _unit_conversion_to_seconds[unit_of_index]

    # Resolve the default at call time. A ``datetime.now()`` default in the
    # signature would be evaluated only once, when the module is imported.
    if origin is None:
        origin = datetime.now()

    # Nothing to do if the index was already converted
    if isinstance(df.index, pd.DatetimeIndex):
        return df

    # Convert strings to numeric values and scale them to seconds
    index_in_seconds = pd.to_numeric(df.index) / _unit_factor_to_seconds
    index = pd.to_datetime(index_in_seconds, unit="s", origin=origin)

    if inplace:
        df.index = index
        return None
    df_copy = df.copy()
    df_copy.index = index
    return df_copy

149 

150 

151def convert_datetime_index_to_float_index(df, offset=0, inplace: bool = False): 

152 """ 

153 Convert a datetime-based index to FloatIndex (in seconds). 

154 Seconds are used as a standard unit as simulation software 

155 outputs data in seconds (e.g. Modelica) 

156 

157 :param pd.DataFrame df: 

158 DataFrame to be converted to FloatIndex 

159 :param float offset: 

160 Offset in seconds 

161 :param bool inplace: 

162 If True, performs operation inplace and returns None. 

163 :return: pd.DataFrame df: 

164 DataFrame with correct index 

165 

166 Example: 

167 

168 >>> import pandas as pd 

169 >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD')) 

170 >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))) 

171 A B C D 

172 2007-01-01 00:00:00 1.0 1.0 1.0 1.0 

173 2007-01-01 00:00:01 1.0 1.0 1.0 1.0 

174 2007-01-01 00:00:02 1.0 1.0 1.0 1.0 

175 >>> print(convert_datetime_index_to_float_index(df)) 

176 A B C D 

177 0.0 1.0 1.0 1.0 1.0 

178 1.0 1.0 1.0 1.0 1.0 

179 2.0 1.0 1.0 1.0 1.0 

180 """ 

181 # Check correct input 

182 if not isinstance(df.index, pd.DatetimeIndex): 

183 raise IndexError("Given DataFrame has no DatetimeIndex, conversion not possible") 

184 

185 new_index = np.round(pd.to_timedelta(df.index - df.index[0]).total_seconds(), 4) + offset 

186 if inplace: 

187 df.index = new_index 

188 return None 

189 df_copy = df.copy() 

190 df_copy.index = new_index 

191 return df_copy 

192 

193 

def time_based_weighted_mean(df):
    """
    Compute the time-weighted mean of every column. The time index
    does not need to be equidistant.

    Each row is weighted by the sum of the time spans (in seconds) to
    its previous and its next row.
    Further info:
    https://stackoverflow.com/questions/26343252/create-a-weighted-mean-for-a-irregular-timeseries-in-pandas

    :param pd.DataFrame df:
        A pandas DataFrame with DatetimeIndex.
    :return np.array:
        A numpy array containing weighted means of all columns
    :raises IndexError:
        If the index of ``df`` is not a DatetimeIndex.

    Example:

    >>> from datetime import datetime
    >>> import numpy as np
    >>> import pandas as pd
    >>> time_vec = [datetime(2007,1,1,0,0),
    ...             datetime(2007,1,1,0,0),
    ...             datetime(2007,1,1,0,5),
    ...             datetime(2007,1,1,0,7),
    ...             datetime(2007,1,1,0,10)]
    >>> df = pd.DataFrame({'A': [1,2,4,3,6], 'B': [11,12,14,13,16]}, index=time_vec)
    >>> print(time_based_weighted_mean(df=df))
    [ 3.55 13.55]
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise IndexError(f"df.index must be DatetimeIndex, but it is {type(df.index)}.")

    # Seconds between each pair of consecutive timestamps
    gaps = [
        (later - earlier).total_seconds()
        for earlier, later in zip(df.index[:-1], df.index[1:])
    ]
    # Weight of a row = gap to the previous row + gap to the next row
    weights = [before + after for before, after in zip([0] + gaps, gaps + [0])]

    return np.array(
        [np.average(df[col_name], weights=weights) for col_name in df.columns],
        dtype=float,
    )

231 

232 

def clean_and_space_equally_time_series(df, desired_freq, confidence_warning=0.95):
    """
    Function for cleaning of the given dataFrame and interpolating
    based on the given desired frequency. Linear interpolation
    is used.

    Steps: rows containing NaN are dropped, duplicate index entries are
    averaged, the data is interpolated linearly in time onto an equally
    spaced index starting at ``df.index[0]`` and finally resampled.

    :param pd.DataFrame df:
        Unclean DataFrame. Needs to have a pd.DateTimeIndex
    :param str desired_freq:
        Frequency to determine number of elements in processed dataframe.
        Options are for example:
        - s: second-based
        - 5s: Every 5 seconds
        - 6min: Every 6 minutes
        This also works for h, d, m, y, ms etc.
    :param float confidence_warning:
        Value to check the confidence interval of input data without
        a defined frequency. If the desired frequency is outside of
        the resulting confidence interval, a warning is issued.
    :return: pd.DataFrame
        Cleaned and equally spaced data-frame
    :raises TypeError:
        If the index of ``df`` is not a DatetimeIndex.
    :raises ValueError:
        If ``df`` contains non-numeric values.

    Example:
    **Note:** The example is for random data. Try out different sampling
    frequencies. You will be warned if the sampling rate is too high or too low.

    >>> df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)),
    ...                   columns=list('ABCD')).set_index("A").sort_index()
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> df_clean = clean_and_space_equally_time_series(df.copy(), "1500ms")

    .. versionchanged:: 0.1.7
    """
    # Only datetime-indexed data can be resampled
    if not isinstance(df.index, pd.DatetimeIndex):
        container = "TimeSeriesData" if isinstance(df, data_types.TimeSeriesData) else "DataFrame"
        raise TypeError(f"{container} needs a DateTimeIndex for executing this function. "
                        "Call convert_index_to_datetime_index() to convert any index to "
                        "a DateTimeIndex")

    # %% Check DataFrame for NANs
    # Log the number of invalid values, but only for columns that have any
    for name, n_invalid in df.isnull().sum().items():
        if n_invalid > 0:
            logger.info("%s has following number of invalid "
                        "values\n %s", name, n_invalid)
    # Drop all rows where at least one NA exists
    df_temp = df.dropna(how='any')

    # Check if DataFrame still has non-numeric-values:
    if not all(df_temp.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all())):
        raise ValueError("Given DataFrame contains non-numeric values.")

    # Merge duplicate rows using mean.
    df_temp = build_average_on_duplicate_rows(df_temp)

    # Make user warning for two cases: Upsampling and data input without a freq:
    # The "frequency" values below are mean time steps in seconds.
    old_freq, old_freq_std, old_freq_sem, time_steps = get_df_index_frequency_mean_and_std(
        df_index=df_temp.index,
        verbose=True)
    if old_freq_std > 0:
        _ns_to_s = 1e9
        # Calculate confidence interval of the mean value of the old frequency
        cfd_int = st.t.interval(confidence_warning,
                                time_steps - 1,
                                loc=old_freq,
                                scale=old_freq_sem)
        # Convert to timedelta (pd.to_timedelta interprets plain numbers as nanoseconds)
        cfd_int = pd.to_timedelta((cfd_int[0] * _ns_to_s, cfd_int[1] * _ns_to_s))
        _td_freq = pd.to_timedelta(desired_freq)
        if (_td_freq < cfd_int[0]) or (_td_freq > cfd_int[1]):
            in_seconds = np.array(cfd_int.values.tolist()) / _ns_to_s  # From nanoseconds
            warnings.warn(f"Input data has no frequency, but the desired frequency "
                          f"{_td_freq.value / _ns_to_s} seconds is outside the given "
                          f"confidence interval {in_seconds} (in seconds) "
                          "Carefully check the result to see if you "
                          "introduced errors to the data.")

    # %% Re-sampling to new frequency with linear interpolation
    # Create new equally spaced DatetimeIndex. Last entry is always <= df.index[-1]
    time_index = pd.date_range(start=df.index[0], end=df.index[-1], freq=desired_freq)
    new_freq, _ = get_df_index_frequency_mean_and_std(df_index=time_index)

    # Check if the user is trying to upsample the data:
    # NOTE(review): both values are mean time steps, so ``new_freq > old_freq``
    # means a coarser grid. Confirm this is the intended condition for the
    # upsampling warning; behaviour is deliberately left unchanged here.
    if old_freq_std == 0:
        if new_freq > old_freq:
            warnings.warn("You are upsampling your data. This may be dangerous. "
                          "Carefully check the result to see if you introduced errors to the data.")

    # Create an empty data frame
    # If multi-columns is used, first get the old index and make it empty:
    multi_cols = df_temp.columns
    if isinstance(multi_cols, pd.MultiIndex):
        empty_multi_cols = pd.MultiIndex.from_product([[] for _ in range(multi_cols.nlevels)],
                                                      names=multi_cols.names)
        df_time_temp = pd.DataFrame(index=time_index, columns=empty_multi_cols)
    else:
        df_time_temp = pd.DataFrame(index=time_index)

    # Insert temporary time_index into df. fill_value = 0 can only be used,
    # since all NaNs should be eliminated prior
    df_temp = df_temp.radd(df_time_temp, axis='index', fill_value=0)

    # Interpolate linearly according to time index
    df_temp.interpolate(method='time', axis=0, inplace=True)

    # Remember the container type before resampling, so that the result
    # can be converted back afterwards.
    was_time_series_data = isinstance(df_temp, data_types.TimeSeriesData)

    # Resample to equally spaced index.
    # All fields should already have a value. Thus NaNs and maybe +/- infs
    # should have been filtered beforehand.
    df_temp = df_temp.resample(rule=desired_freq).first()

    # Shift the resampled index so that it starts at the first timestamp of
    # the input instead of the bin edge chosen by resample(). The first label
    # is read directly from the resampled index; the former extra
    # ``.first(desired_freq)`` selection is deprecated in pandas and did not
    # change which label comes first.
    delta_time = df.index[0] - df_temp.index[0]
    df_temp.index = df_temp.index + to_offset(delta_time)

    if was_time_series_data:
        df_temp = data_types.TimeSeriesData(df_temp)

    return df_temp

371 

372 

def low_pass_filter(data, crit_freq, filter_order):
    """
    Apply a digital Butterworth low-pass filter forwards and backwards
    (``scipy.signal.filtfilt``) to one-dimensional data.

    :param numpy.ndarray data:
        For dataframe e.g. df['a_col_name'].values
        An array with a single column is reduced to that column.
    :param float crit_freq:
        The critical frequency or frequencies.
    :param int filter_order:
        The order of the filter. Converted with ``int()``.
    :return: numpy.ndarray
        The filtered data.
    :raises ValueError:
        If ``data`` has more than one column.

    Example:

    >>> import numpy as np
    >>> rand_series = np.random.rand(100)
    >>> low_pass_filter(rand_series, 0.2, 2).shape
    (100,)

    """
    has_several_dims = len(data.shape) > 1
    if has_several_dims and data.shape[1] != 1:
        raise ValueError("Given data has multiple dimensions. "
                         "Only one-dimensional arrays are supported in this function.")
    if has_several_dims:
        # Single column: reduce to a 1D-Array
        data = data[:, 0]

    order = int(filter_order)
    coeff_b, coeff_a = signal.butter(N=order, Wn=crit_freq,
                                     btype='low', analog=False, output='ba')
    return signal.filtfilt(coeff_b, coeff_a, data)

407 

408 

def moving_average(data, window):
    """
    Compute the simple moving average of a one-dimensional series.

    :param pd.Series data:
        For dataframe e.g. df['a_col_name'].values
        An array with a single column is reduced to that column.
    :param int window:
        Number of samples that are averaged. Converted with ``int()``.
    :return: numpy.array
        shape has (###,). The averaged values are padded at the start and
        at the end with the first and last averaged value (hold first and
        last point), so that the length matches the input.
    :raises ValueError:
        If ``data`` has more than one column.

    Example:

    >>> import numpy as np
    >>> moving_average(np.arange(10.0), 3).tolist()
    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 8.0]

    """
    has_several_dims = len(data.shape) > 1
    if has_several_dims and data.shape[1] != 1:
        raise ValueError("Given data has multiple dimensions. "
                         "Only one-dimensional arrays are supported in this function.")
    if has_several_dims:
        # Single column: reduce to a 1D-Array
        data = data[:, 0]

    window = int(window)
    kernel = np.repeat(1.0, window) / window
    averaged = np.convolve(data, kernel, 'valid')

    # 'valid' shortens the result by window - 1 samples:
    # floor(window / 2) are restored in front, ceil(window / 2) - 1 at the end.
    n_front = window // 2
    n_back = (window + 1) // 2 - 1
    front = np.full(n_front, averaged[0])
    back = np.full(n_back, averaged[-1])

    return np.concatenate((front, averaged, back), axis=0)

450 

451 

def create_on_off_signal(df, col_names, threshold, col_names_new,
                         tags="raw", new_tag="converted_signal"):
    """
    Derive binary on/off signals from the given columns: 1.0 where the
    value is greater than or equal to the threshold, 0.0 otherwise.

    :param pd.DataFame df:
        DataFrame with the data to process
    :param list col_names:
        Column names of variables to convert to signals
    :param float,list threshold:
        Threshold for all column-names (single float) or
        a list with specific thresholds for specific columns.
    :param list col_names_new:
        New name for the signal-column
    :param str,list tags:
        If a 2-Level DataFrame for TimeSeriesData is used, one has to
        specify the tag of the variables. Default value is to use the "raw"
        tag set in the TimeSeriesClass. However, one can specify a list
        (Different tag for each variable), or one can pass a string
        (same tags for all given variables)
    :param str new_tag:
        The tag the newly created variable will hold. This can be used to
        indicate where the signal was converted from.
    :return: pd.DataFrame
        Copy of DataFrame with the created signals added.
    :raises IndexError:
        If ``col_names`` differs in length from ``col_names_new`` or
        from a given list of thresholds.

    Example:

    >>> import numpy as np
    >>> df = pd.DataFrame({"P_el": [0, 30, 10, 50]})
    >>> df = create_on_off_signal(df, col_names=["P_el"],
    ...                           threshold=25, col_names_new=["Device On"])
    >>> df["Device On"].tolist()
    [0.0, 1.0, 0.0, 1.0]
    """
    if len(col_names) != len(col_names_new):
        raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, "
                         f"col_names_new: {len(col_names_new)}")
    if not isinstance(threshold, list):
        # One common threshold for every column
        limits = [threshold for _ in col_names]
    elif len(col_names) != len(threshold):
        raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, "
                         f"threshold: {len(threshold)}")
    else:
        limits = threshold

    df_copy = df.copy()
    triples = zip(col_names, col_names_new, limits)

    if not isinstance(df.columns, pd.MultiIndex):
        for source, target, limit in triples:
            # Start with "off" everywhere, then switch "on" where the limit is reached
            df_copy.loc[:, target] = 0.0
            is_on = df_copy[source] >= limit
            df_copy.loc[is_on, target] = 1.0
        return df_copy

    # Two-level columns: every variable is addressed by (name, tag)
    if isinstance(tags, str):
        tags = [tags for _ in col_names]
    for position, (source, target, limit) in enumerate(triples):
        new_key = (target, new_tag)
        df_copy.loc[:, new_key] = 0.0
        is_on = df_copy[source, tags[position]] >= limit
        df_copy.loc[is_on, new_key] = 1.0
    return df_copy

517 

518 

def number_lines_totally_na(df):
    """
    Count the rows of the given dataframe in which
    every value is NaN.

    :param pd.DataFrame df:
        Given dataframe to process
    :return: int
        Number of NaN-Rows.
    :raises TypeError:
        If ``df`` is not a pandas DataFrame.

    Example:

    >>> import numpy as np
    >>> import pandas as pd
    >>> dim = np.random.randint(100) + 10
    >>> nan_col = [np.nan for i in range(dim)]
    >>> col = [i for i in range(dim)]
    >>> df_nan = pd.DataFrame({"col_1":nan_col, "col_2":nan_col})
    >>> df_normal = pd.DataFrame({"col_1":nan_col, "col_2":col})
    >>> print(number_lines_totally_na(df_nan)-dim)
    0
    >>> print(number_lines_totally_na(df_normal))
    0
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Input must be a pandas data frame')
    # A row counts only if all of its entries are null
    row_is_all_na = df.isnull().all(axis=1)
    return int(row_is_all_na.sum())

551 

552 

def z_score(x, limit=3):
    """
    Find outliers by their z-score, calculated with the mean
    and standard deviation of the given data.

    :param np.array x:
        For dataframe e.g. df['a_col_name'].values
    :param float limit: default 3
        Values whose absolute z-score is greater than this limit
        are reported.
    :return: np.array
        Indices of all values whose absolute z-score exceeds ``limit``.

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]

    """
    centered = x - np.mean(x)
    scores = centered / np.std(x)
    exceeds_limit = np.abs(scores) > limit
    return np.where(exceeds_limit)[0]

577 

578 

def modified_z_score(x, limit=3.5):
    """
    Find outliers by their modified z-score, calculated with the median
    and the median absolute deviation of the given data.

    :param np.array x:
        For dataframe e.g. df['a_col_name'].values
    :param float limit: default 3.5
        Values whose absolute modified z-score is greater than this
        limit are reported.
    :return: np.array
        Indices of all values whose absolute modified z-score
        exceeds ``limit``.

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = modified_z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]

    """
    center = np.median(x)
    distance_to_center = x - center
    # Median of the absolute deviations from the median
    median_abs_deviation = np.median(np.abs(distance_to_center))
    scores = 0.6745 * distance_to_center / median_abs_deviation
    exceeds_limit = np.abs(scores) > limit
    return np.where(exceeds_limit)[0]

603 

604 

def interquartile_range(x):
    """
    Find outliers with the interquartile range (IQR) of the given array.
    Returns the indices of values that lie more than 1.5 times the IQR
    below the first or above the third quartile.

    :param np.array x:
        For dataframe e.g. df['a_col_name'].values
    :return: np.array
        Indices of the values outside of the accepted range.

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = interquartile_range(normal_dis)
    >>> values = normal_dis[res]

    """
    first_quartile, third_quartile = np.percentile(x, [25, 75])
    margin = (third_quartile - first_quartile) * 1.5
    lower_bound = first_quartile - margin
    upper_bound = third_quartile + margin
    is_outside = (x > upper_bound) | (x < lower_bound)
    return np.where(is_outside)[0]

628 

629 

def cross_validation(x, y, test_size=0.3):
    """
    Randomly split ``x`` and ``y`` into a training and a test set
    (if test_size = 0.30 --> 70 % are training data).
    You can use this function for segmentation tasks.
    Time-series-data should not be split with this function
    as the results are not coherent (time-wise).

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    :param x:
        Indexables with same length / shape[0] as y.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    :param list,np.ndarray,pd.DataFrame y:
        Indexables with same length / shape[0] as x.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    :param float test_size:
        Value between 0 and 1 specifying what percentage of the data
        will be used for testing.
    :return: list
        Split data into 4 objects. The order is:
        x_train, x_test, y_train, y_test

    Example:

    >>> import numpy as np
    >>> x = np.random.rand(100)
    >>> y = np.random.rand(100)
    >>> ret = cross_validation(x, y)
    >>> len(ret)
    4
    """
    split_data = model_selection.train_test_split(x, y, test_size=test_size)
    return split_data

663 

664 

def get_df_index_frequency_mean_and_std(df_index: pd.Index, verbose: bool = False):
    """
    Get the mean and standard deviation of the step size between
    consecutive index entries.
    If the index is a DatetimeIndex, the steps are converted from
    nanoseconds to seconds.
    Else, seconds are assumed as values.

    :param pd.Index df_index:
        Time index.
    :param bool verbose:
        Default false. If true, additional to the mean value and standard deviation,
        the standard error of the mean and number of time steps are returned.

    :returns:
        float: Mean value
        float: Standard deviation
        float: Standard error of the mean (only if ``verbose``)
        int: Number of time steps (only if ``verbose``)
    """
    steps = df_index.to_series().diff().dropna().values.astype(np.float64)
    if isinstance(df_index, pd.DatetimeIndex):
        # Datetime differences are given in nanoseconds
        steps = steps * 1e-9

    mean_step = np.mean(steps)
    std_step = np.std(steps)
    if not verbose:
        return mean_step, std_step
    return mean_step, std_step, st.sem(steps), len(steps)