Coverage for ebcpy/preprocessing.py: 92%

177 statements  


1""" 

2This general overview may help you find the function you need: 

3 

4- Remove duplicate rows by averaging the values 

5 (``build_average_on_duplicate_rows``) 

6- Convert any integer or float index into a datetime index 

7 (``convert_index_to_datetime_index``) 

8- Resample a given time-series on a given frequency 

9 (``clean_and_space_equally_time_series``) 

10- Apply a low-pass-filter (``low_pass_filter``) 

11- Apply a moving average to flatten disturbances 

12 in your measured data (``moving_average``) 

13- Convert e.g. an electrical power signal into a binary 

14 control signal (on-off) based on a threshold (``create_on_off_signal``) 

15- Find the number of lines without any values in it (``number_lines_totally_na``) 

16- Split a data-set into training and test set according to 

17 cross-validation (``cross_validation``) 

18 

19All functions in the pre-processing module should have a doctest. We refer to the example 

20in this doctest for a better understanding of the functions. If you don't understand 

21the behaviour of a function or the meaning, please raise an issue. 
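
A typical chain of operations might look like this (illustrative sketch
with random data, not a prescribed workflow):

>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"val": np.random.rand(100)})
>>> df = convert_index_to_datetime_index(df)
>>> df = clean_and_space_equally_time_series(df, desired_freq="1s")
>>> df["val_smoothed"] = moving_average(df["val"].values, window=10)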

22""" 

23import warnings 

24import logging 

25from typing import Union, TYPE_CHECKING 

26 

27from datetime import datetime 

28from scipy import signal 

29from sklearn import model_selection 

30from pandas.tseries.frequencies import to_offset 

31import numpy as np 

32import pandas as pd 

33import scipy.stats as st 

34 

35if TYPE_CHECKING: 

36 from ebcpy import TimeSeriesData 

37 

38logger = logging.getLogger(__name__) 

39 

40 

def build_average_on_duplicate_rows(df: Union[pd.DataFrame, "TimeSeriesData"]) -> pd.DataFrame:
    """
    If the dataframe has duplicate indexes, the average
    value of all those indexes is calculated and given to
    the first occurrence of this duplicate index. Therefore,
    the DataFrame should already be sorted before calling this
    function.

    :param pd.DataFrame df:
        DataFrame with the data to process
    :return: pd.DataFrame
        The processed DataFrame

    Example:

    >>> df = pd.DataFrame({"idx": np.ones(5), "val": np.arange(5)}).set_index("idx")
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> print(df)
                         val
    idx
    2007-01-01 00:00:01    0
    2007-01-01 00:00:01    1
    2007-01-01 00:00:01    2
    2007-01-01 00:00:01    3
    2007-01-01 00:00:01    4
    >>> print(build_average_on_duplicate_rows(df))
                         val
    idx
    2007-01-01 00:00:01  2.0
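
    For plain DataFrames, a vectorized one-liner gives the same result
    (illustrative alternative, not used internally):

    >>> print(df.groupby(level=0).mean())
                         val
    idx
    2007-01-01 00:00:01  2.0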

70 """ 

71 # Find entries that are exactly the same timestamp 

72 double_ind = df.index[df.index.duplicated()].unique() 

73 # Calculate the mean value 

74 mean_values = [] 

75 for item in double_ind: 

76 mean_values.append(df.loc[item].values.mean(axis=0)) 

77 # Delete duplicate indices 

78 df_dropped = df[~df.index.duplicated(keep='first')].copy() 

79 

80 # Set mean values in rows that were duplicates before 

81 for idx, values in zip(double_ind, mean_values): 

82 df_dropped.loc[idx] = values 

83 

84 return df_dropped 

85 

86 

def convert_index_to_datetime_index(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        unit_of_index: str = "s",
        origin: datetime = datetime.now(),
        inplace: bool = False
) -> pd.DataFrame:
    """
    Converts the index of the given DataFrame to a
    pandas.core.indexes.datetimes.DatetimeIndex.

    :param pd.DataFrame,TimeSeriesData df:
        dataframe with an index that is not a DateTime.
        Only numeric indexes are supported. Every value
        is interpreted with the given unit, which defaults
        to seconds.
    :param str unit_of_index: default 's'
        The unit of the given index. Used to convert to
        total_seconds later on.
    :param datetime.datetime origin:
        The reference datetime object for the first index.
        Default is the current system time.
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: df
        Copy of the DataFrame with the correct index for usage in this
        framework.

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(df)
         A    B    C    D
    0  1.0  1.0  1.0  1.0
    1  1.0  1.0  1.0  1.0
    2  1.0  1.0  1.0  1.0
    >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 00:00:01  1.0  1.0  1.0  1.0
    2007-01-01 00:00:02  1.0  1.0  1.0  1.0
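
    For data in other units, pass ``unit_of_index``; e.g. for an
    hour-based index (illustrative):

    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(convert_index_to_datetime_index(df, unit_of_index="h",
    ...                                       origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 01:00:00  1.0  1.0  1.0  1.0
    2007-01-01 02:00:00  1.0  1.0  1.0  1.0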

    """
    # Check for unit of given index. Maybe one uses hour-based data.
    # The dict maps each unit to the number of that unit per second;
    # dividing the index by the factor converts it to seconds.
    _unit_conversion_to_seconds = {"ms": 1e3,
                                   "s": 1,
                                   "min": 1 / 60,
                                   "h": 1 / 3600,
                                   "d": 1 / 86400}
    if unit_of_index not in _unit_conversion_to_seconds:
        raise ValueError("Given unit_of_index is not supported.")
    _unit_factor_to_seconds = _unit_conversion_to_seconds.get(unit_of_index)

    # Convert
    old_index = df.index.copy()
    # Check if already converted:
    if isinstance(old_index, pd.DatetimeIndex):
        return df
    # Convert strings to numeric values.
    old_index = pd.to_numeric(old_index)
    # Convert to seconds.
    old_index /= _unit_factor_to_seconds
    # Alter the index
    index = pd.to_datetime(old_index, unit="s", origin=origin)
    if inplace:
        df.index = index
        return None
    df_copy = df.copy()
    df_copy.index = index
    return df_copy


def convert_datetime_index_to_float_index(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        offset: float = 0,
        inplace: bool = False
) -> pd.DataFrame:
    """
    Convert a datetime-based index to a FloatIndex (in seconds).
    Seconds are used as the standard unit, as simulation software
    (e.g. Modelica) outputs data in seconds.

    :param pd.DataFrame,TimeSeriesData df:
        DataFrame to be converted to FloatIndex
    :param float offset:
        Offset in seconds
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: pd.DataFrame df:
        DataFrame with the correct index

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 00:00:01  1.0  1.0  1.0  1.0
    2007-01-01 00:00:02  1.0  1.0  1.0  1.0
    >>> print(convert_datetime_index_to_float_index(df))
           A    B    C    D
    0.0  1.0  1.0  1.0  1.0
    1.0  1.0  1.0  1.0  1.0
    2.0  1.0  1.0  1.0  1.0
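
    A non-zero ``offset`` shifts the resulting index (illustrative):

    >>> print(convert_datetime_index_to_float_index(df, offset=10))
            A    B    C    D
    10.0  1.0  1.0  1.0  1.0
    11.0  1.0  1.0  1.0  1.0
    12.0  1.0  1.0  1.0  1.0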

192 """ 

193 # Check correct input 

194 if not isinstance(df.index, pd.DatetimeIndex): 

195 raise IndexError("Given DataFrame has no DatetimeIndex, conversion not possible") 

196 

197 new_index = np.round(pd.to_timedelta(df.index - df.index[0]).total_seconds(), 4) + offset 

198 if inplace: 

199 df.index = new_index 

200 return None 

201 df_copy = df.copy() 

202 df_copy.index = new_index 

203 return df_copy 

204 

205 

def time_based_weighted_mean(df: Union[pd.DataFrame, "TimeSeriesData"]) -> np.ndarray:
    """
    Creates the weighted mean according to a time index that does not
    need to be equidistant.
    Further info:
    https://stackoverflow.com/questions/26343252/create-a-weighted-mean-for-a-irregular-timeseries-in-pandas

    :param pd.DataFrame df:
        A pandas DataFrame with DatetimeIndex.
    :return np.array:
        A numpy array containing the weighted means of all columns

    Example:

    >>> from datetime import datetime
    >>> import numpy as np
    >>> import pandas as pd
    >>> time_vec = [datetime(2007, 1, 1, 0, 0),
    ...             datetime(2007, 1, 1, 0, 0),
    ...             datetime(2007, 1, 1, 0, 5),
    ...             datetime(2007, 1, 1, 0, 7),
    ...             datetime(2007, 1, 1, 0, 10)]
    >>> df = pd.DataFrame({'A': [1, 2, 4, 3, 6], 'B': [11, 12, 14, 13, 16]}, index=time_vec)
    >>> print(time_based_weighted_mean(df=df))
    [ 3.55 13.55]

    """

    if not isinstance(df.index, pd.DatetimeIndex):
        raise IndexError(f"df.index must be DatetimeIndex, but it is {type(df.index)}.")

    time_delta = [(x - y).total_seconds() for x, y in zip(df.index[1:], df.index[:-1])]
    # Trapezoidal weights: each sample is weighted with the sum of the
    # time steps to its left and right neighbour
    weights = [x + y for x, y in zip([0] + time_delta, time_delta + [0])]
    # Create empty numpy array
    res = np.empty(len(df.columns))
    res[:] = np.nan
    for i, col_name in enumerate(df.columns):
        res[i] = np.average(df[col_name], weights=weights)
    return res


def clean_and_space_equally_time_series(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        desired_freq: str,
        confidence_warning: float = 0.95
) -> pd.DataFrame:
    """
    Function for cleaning the given DataFrame and interpolating
    based on the given desired frequency. Linear interpolation
    is used.

    :param pd.DataFrame,TimeSeriesData df:
        Unclean DataFrame. Needs to have a pd.DateTimeIndex
    :param str desired_freq:
        Frequency to determine the number of elements in the processed dataframe.
        Options are for example:
        - s: second-based
        - 5s: Every 5 seconds
        - 6min: Every 6 minutes
        This also works for h, d, m, y, ms etc.
    :param float confidence_warning:
        Value to check the confidence interval of input data without
        a defined frequency. If the desired frequency is outside of
        the resulting confidence interval, a warning is issued.
    :return: pd.DataFrame
        Cleaned and equally spaced data-frame

    Example:
    **Note:** The example uses random data. Try out different sampling
    frequencies. You will be warned if the sampling rate is too high or too low.

    >>> df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
    ...                   columns=list('ABCD')).set_index("A").sort_index()
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> clean_and_space_equally_time_series(df, "30s")
    >>> import matplotlib.pyplot as plt
    >>> plt.plot(df["B"], label="Raw data")
    >>> df = clean_and_space_equally_time_series(df.copy(), "1500ms")
    >>> plt.plot(df["B"], label="Cleaned and spaced equally")
    >>> plt.legend()
    >>> plt.show()

    .. versionchanged:: 0.1.7
    """
    from ebcpy import TimeSeriesData

    # Convert indexes to datetime_index:
    if not isinstance(df.index, pd.DatetimeIndex):
        if isinstance(df, TimeSeriesData):
            raise TypeError("TimeSeriesData needs a DateTimeIndex for executing this function. "
                            "Call to_datetime_index() to convert any index to "
                            "a DateTimeIndex")
        # Else
        raise TypeError("DataFrame needs a DateTimeIndex for executing this function. "
                        "Call convert_index_to_datetime_index() to convert any index to "
                        "a DateTimeIndex")
    # %% Check DataFrame for NaNs
    # Create a pandas Series with the number of invalid values for each column of df
    series_with_na = df.isnull().sum()
    for name in series_with_na.index:
        if series_with_na.loc[name] > 0:
            # Log only columns with invalid values
            logger.info("%s has the following number of invalid "
                        "values\n %s", name, series_with_na.loc[name])
    # Drop all rows where at least one NA exists
    df_temp = df.dropna(how='any')

    # Check if the DataFrame still has non-numeric values:
    if not all(df_temp.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all())):
        raise ValueError("Given DataFrame contains non-numeric values.")

    # Merge duplicate rows using mean.
    df_temp = build_average_on_duplicate_rows(df_temp)

    # Warn the user in two cases: upsampling and data input without a fixed freq.
    # Check if the frequency differs
    old_freq, old_freq_std, old_freq_sem, time_steps = get_df_index_frequency_mean_and_std(
        df_index=df_temp.index,
        verbose=True)
    if old_freq_std > 0:
        _ns_to_s = 1e9
        # Calculate the confidence interval of the mean value of the old frequency
        cfd_int = st.t.interval(confidence_warning,
                                time_steps - 1,
                                loc=old_freq,
                                scale=old_freq_sem)
        # Convert to timedelta
        cfd_int = pd.to_timedelta((cfd_int[0] * _ns_to_s, cfd_int[1] * _ns_to_s))
        _td_freq = pd.to_timedelta(desired_freq)
        if (_td_freq < cfd_int[0]) or (_td_freq > cfd_int[1]):
            in_seconds = np.array(cfd_int.values.tolist()) / _ns_to_s  # From nanoseconds
            warnings.warn(f"Input data has no fixed frequency, and the desired frequency "
                          f"{_td_freq.value / _ns_to_s} seconds is outside the resulting "
                          f"confidence interval {in_seconds} (in seconds). "
                          "Carefully check the result to see if you "
                          "introduced errors to the data.")

    # %% Re-sampling to new frequency with linear interpolation
    # Create new equally spaced DatetimeIndex. The last entry is always <= df.index[-1]
    time_index = pd.date_range(start=df.index[0], end=df.index[-1], freq=desired_freq)
    new_freq, _ = get_df_index_frequency_mean_and_std(df_index=time_index)

    # Check if the user is trying to upsample the data
    # (i.e. the new time step is smaller than the old one):
    if old_freq_std == 0:
        if new_freq < old_freq:
            warnings.warn("You are upsampling your data. This may be dangerous. "
                          "Carefully check the result to see if you introduced errors to the data.")

    # Create an empty data frame
    # If multi-columns are used, first get the old index and make it empty:
    multi_cols = df_temp.columns
    if isinstance(multi_cols, pd.MultiIndex):
        empty_multi_cols = pd.MultiIndex.from_product([[] for _ in range(multi_cols.nlevels)],
                                                      names=multi_cols.names)
        df_time_temp = pd.DataFrame(index=time_index, columns=empty_multi_cols)
    else:
        df_time_temp = pd.DataFrame(index=time_index)

    # Insert the temporary time_index into df. fill_value = 0 can only be used
    # since all NaNs were eliminated prior to this step
    df_temp = df_temp.radd(df_time_temp, axis='index', fill_value=0)
    del df_time_temp

    # Interpolate linearly according to time index
    df_temp.interpolate(method='time', axis=0, inplace=True)
    # Determine the Timedelta between the current first index entry
    # in df and the first index entry that would be created
    # when applying df.resample() without loffset
    delta_time = df.index[0] - \
                 df_temp.resample(rule=desired_freq).first().first(desired_freq).index[0]
    # Resample to the equally spaced index.
    # All fields should already have a value. Thus NaNs and maybe +/- infs
    # should have been filtered beforehand.

    # Check if the given dataframe was a TimeSeriesData object and, if so, keep that type
    if isinstance(df_temp, TimeSeriesData):
        df_temp = df_temp.resample(rule=desired_freq).first()
        df_temp.index = df_temp.index + to_offset(delta_time)
        df_temp = TimeSeriesData(df_temp)
    else:
        df_temp = df_temp.resample(rule=desired_freq).first()
        df_temp.index = df_temp.index + to_offset(delta_time)
    del delta_time

    return df_temp


def low_pass_filter(data: np.ndarray, crit_freq: float, filter_order: int) -> np.ndarray:
    """
    Create a low pass filter with given order and frequency.

    :param numpy.ndarray data:
        For a dataframe, e.g. df['a_col_name'].values
    :param float crit_freq:
        The normalized critical frequency (0 < crit_freq < 1, where 1
        corresponds to the Nyquist frequency), passed to scipy.signal.butter.
    :param int filter_order:
        The order of the filter
    :return: numpy.ndarray

    Example:

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> rand_series = np.random.rand(100)
    >>> plt.plot(rand_series, label="reference")
    >>> plt.plot(low_pass_filter(rand_series, 0.2, 2), label="filtered")
    >>> plt.legend()
    >>> plt.show()

    """
    if len(data.shape) > 1:  # Check if given data has multiple dimensions
        if data.shape[1] == 1:
            data = data[:, 0]  # Resize to 1D-Array
        else:
            raise ValueError("Given data has multiple dimensions. "
                             "Only one-dimensional arrays are supported in this function.")
    _filter_order = int(filter_order)
    numerator, denominator = signal.butter(N=_filter_order, Wn=crit_freq,
                                           btype='low', analog=False, output='ba')
    # filtfilt applies the filter forward and backward, i.e. with zero phase shift
    output = signal.filtfilt(numerator, denominator, data)
    return output


def moving_average(data: np.ndarray, window: int) -> np.ndarray:
    """
    Create a moving average of the input array.

    :param np.ndarray data:
        For a dataframe, e.g. df['a_col_name'].values
    :param int window:
        Window size (number of samples) used for averaging
    :return: numpy.array
        Array of shape (len(data),). The first and last points of the
        input are extrapolated as constant values (hold first and last point).

    Example:

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> series = np.sin(np.linspace(-30, 30, 1000))
    >>> plt.plot(series, label="reference")
    >>> plt.plot(moving_average(series, 10), label="window=10")
    >>> plt.plot(moving_average(series, 50), label="window=50")
    >>> plt.plot(moving_average(series, 100), label="window=100")
    >>> plt.legend()
    >>> plt.show()

    """
    if len(data.shape) > 1:  # Check if given data has multiple dimensions
        if data.shape[1] == 1:
            data = data[:, 0]  # Resize to 1D-Array
        else:
            raise ValueError("Given data has multiple dimensions. "
                             "Only one-dimensional arrays are supported in this function.")
    window = int(window)
    weights = np.repeat(1.0, window) / window
    sma = np.convolve(data, weights, 'valid')
    # Pad the start with window/2 copies of the first moving-average value
    fill_start = np.full((int(np.floor(window / 2)), 1), sma[0])
    # Same with the last value of -data-
    fill_end = np.full((int(np.ceil(window / 2)) - 1, 1), sma[-1])
    # Stack the arrays
    sma = np.concatenate((fill_start[:, 0], sma, fill_end[:, 0]), axis=0)
    return sma


def create_on_off_signal(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        col_names: list,
        threshold: Union[float, list],
        col_names_new: list,
        tags: Union[list, str] = "raw",
        new_tag: str = "converted_signal"
):
    """
    Create on and off signals based on the given threshold for all column names.

    :param pd.DataFrame,TimeSeriesData df:
        DataFrame with the data to process
    :param list col_names:
        Column names of variables to convert to signals
    :param float,list threshold:
        Threshold for all column-names (single float) or
        a list with specific thresholds for specific columns.
    :param list col_names_new:
        New names for the signal-columns
    :param str,list tags:
        If a 2-level DataFrame for TimeSeriesData is used, one has to
        specify the tag of the variables. The default is the "raw"
        tag set in the TimeSeriesData class. However, one can specify a list
        (a different tag for each variable) or pass a string
        (the same tag for all given variables).
    :param str new_tag:
        The tag the newly created variable will hold. This can be used to
        indicate where the signal was converted from.
    :return: pd.DataFrame
        Copy of the DataFrame with the created signals added.

    Example:

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> df = pd.DataFrame({"P_el": np.sin(np.linspace(-20, 20, 10000))*100})
    >>> df = create_on_off_signal(df, col_names=["P_el"],
    ...                           threshold=25, col_names_new=["Device On"])
    >>> plt.plot(df)
    >>> plt.show()

512 """ 

513 if len(col_names) != len(col_names_new): 

514 raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, " 

515 f"col_names_new: {len(col_names_new)}") 

516 if isinstance(threshold, list): 

517 if len(col_names) != len(threshold): 

518 raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, " 

519 f"threshold: {len(threshold)}") 

520 else: 

521 threshold = [threshold for _ in enumerate(col_names)] 

522 # Do on_off signal creation for all desired columns 

523 df_copy = df.copy() 

524 if isinstance(df.columns, pd.MultiIndex): 

525 # Convert given tags to a list 

526 if isinstance(tags, str): 

527 tags = [tags for _ in enumerate(col_names)] 

528 

529 for i, _ in enumerate(col_names): 

530 # Create zero-array 

531 df_copy.loc[:, (col_names_new[i], new_tag)] = 0.0 

532 # Change all values to 1.0 according to threshold 

533 df_copy.loc[ 

534 df_copy[col_names[i], tags[i]] >= threshold[i], (col_names_new[i], new_tag)] = 1.0 

535 else: 

536 for i, _ in enumerate(col_names): 

537 # Create zero-array 

538 df_copy.loc[:, col_names_new[i]] = 0.0 

539 # Change all values to 1.0 according to threshold 

540 df_copy.loc[df_copy[col_names[i]] >= threshold[i], col_names_new[i]] = 1.0 

541 return df_copy 

542 

543 

def number_lines_totally_na(df: Union[pd.DataFrame, "TimeSeriesData"]) -> int:
    """
    Returns the number of rows in the given dataframe
    that are filled only with NaN-values.

    :param pd.DataFrame,TimeSeriesData df:
        Given dataframe to process
    :return: int
        Number of NaN-rows.

    Example:

    >>> import numpy as np
    >>> import pandas as pd
    >>> dim = np.random.randint(100) + 10
    >>> nan_col = [np.nan for i in range(dim)]
    >>> col = [i for i in range(dim)]
    >>> df_nan = pd.DataFrame({"col_1": nan_col, "col_2": nan_col})
    >>> df_normal = pd.DataFrame({"col_1": nan_col, "col_2": col})
    >>> print(number_lines_totally_na(df_nan) - dim)
    0
    >>> print(number_lines_totally_na(df_normal))
    0
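
    A vectorized one-liner gives the same count (illustrative alternative):

    >>> print(int(df_nan.isnull().all(axis=1).sum()) - dim)
    0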

567 """ 

568 if not isinstance(df, pd.DataFrame): 

569 raise TypeError('Input must be a pandas data frame') 

570 counter = 0 

571 for _, row in df.iterrows(): 

572 # Check if the whole row is filled with NaNs. 

573 if all(row.isnull()): 

574 counter += 1 

575 return counter 

576 

577 

def z_score(x: np.ndarray, limit=3) -> np.ndarray:
    """
    Calculate the z-score using the mean
    and standard deviation of the given data.

    :param np.array x:
        For a dataframe, e.g. df['a_col_name'].values
    :param float limit: default 3
        Absolute z-score above which a value is considered an outlier
    :return: np.array:
        Indices of the values whose absolute z-score exceeds the limit

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]

    """
    mean = np.mean(x)
    standard_deviation = np.std(x)
    z_score_value = (x - mean) / standard_deviation
    return np.where(np.abs(z_score_value) > limit)[0]


def modified_z_score(x: np.ndarray, limit: float = 3.5) -> np.ndarray:
    """
    Calculate the modified z-score using the median
    and median absolute deviation of the given data.

    :param np.array x:
        For a dataframe, e.g. df['a_col_name'].values
    :param float limit: default 3.5
        Absolute modified z-score above which a value is considered an outlier
    :return: np.array:
        Indices of the values whose absolute modified z-score exceeds the limit

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = modified_z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]

    """
    median = np.median(x)
    median_absolute_deviation = np.median(np.abs(x - median))
    # 0.6745 scales the MAD to the standard deviation of normally distributed data
    z_score_mod = 0.6745 * (x - median) / median_absolute_deviation
    return np.where(np.abs(z_score_mod) > limit)[0]


def interquartile_range(x: np.ndarray) -> np.ndarray:
    """
    Calculate the interquartile range of the given array.
    Returns the indices of values outside of the interquartile-range fences.

    :param np.array x:
        For a dataframe, e.g. df['a_col_name'].values
    :return: np.array:
        Indices of the values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = interquartile_range(normal_dis)
    >>> values = normal_dis[res]

    """
    quartile_1, quartile_3 = np.percentile(x, [25, 75])
    iqr = quartile_3 - quartile_1
    lower = quartile_1 - (iqr * 1.5)
    upper = quartile_3 + (iqr * 1.5)
    return np.where((x > upper) | (x < lower))[0]


def cross_validation(x, y, test_size=0.3):
    """
    Split the data set randomly with test_size
    (if test_size = 0.30, 70 % are training data).
    You can use this function for segmentation tasks.
    Time-series data should not be split with this function,
    as the results would not be coherent in time.

    :param x:
        Indexables with same length / shape[0] as y.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    :param list,np.ndarray,pd.DataFrame y:
        Indexables with same length / shape[0] as x.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    :param float test_size:
        Value between 0 and 1 specifying what percentage of the data
        will be used for testing.
    :return: list
        Split data into 4 objects. The order is:
        x_train, x_test, y_train, y_test

    Example:

    >>> import numpy as np
    >>> x = np.random.rand(100)
    >>> y = np.random.rand(100)
    >>> ret = cross_validation(x, y)
    >>> len(ret)
    4
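
    The four objects can be unpacked directly:

    >>> x_train, x_test, y_train, y_test = ret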

686 """ 

687 return model_selection.train_test_split(x, y, test_size=test_size) 

688 

689 

def get_df_index_frequency_mean_and_std(df_index: pd.Index, verbose: bool = False):
    """
    Function to get the mean and std of the index-frequency.
    If the index is a DatetimeIndex, the time steps are converted from
    nanoseconds to seconds.
    Else, the values are assumed to be in seconds.

    :param pd.Index df_index:
        Time index.
    :param bool verbose:
        Default False. If True, the standard error of the mean and the
        number of time steps are returned in addition to the mean value
        and standard deviation.

    :returns:
        float: Mean value
        float: Standard deviation
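
    Example (equidistant one-second index, illustrative):

    >>> import pandas as pd
    >>> index = pd.date_range("2007-01-01", periods=4, freq="1s")
    >>> mean, std = get_df_index_frequency_mean_and_std(df_index=index)
    >>> print(mean, std)
    1.0 0.0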

706 """ 

707 

708 if isinstance(df_index, pd.DatetimeIndex): 

709 index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64) * 1e-9 

710 else: 

711 index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64) 

712 if verbose: 

713 return np.mean(index_in_s), np.std(index_in_s), st.sem(index_in_s), len(index_in_s) 

714 else: 

715 return np.mean(index_in_s), np.std(index_in_s)