1"""
2This general overview may help you find the function you need:
4- Remove duplicate rows by averaging the values
5 (``build_average_on_duplicate_rows``)
6- Convert any integer or float index into a datetime index
7 (``convert_index_to_datetime_index``)
8- Resample a given time-series on a given frequency
9 (``clean_and_space_equally_time_series``)
10- Apply a low-pass-filter (``low_pass_filter``)
11- Apply a moving average to flatten disturbances
12 in your measured data (``moving_average``)
13- Convert e.g. an electrical power signal into a binary
14 control signal (on-off) based on a threshold (``create_on_off_signal``)
15- Find the number of lines without any values in it (``number_lines_totally_na``)
16- Split a data-set into training and test set according to
17 cross-validation (``cross_validation``)
19All functions in the pre-processing module should have a doctest. We refer to the example
20in this doctest for a better understanding of the functions. If you don't understand
21the behaviour of a function or the meaning, please raise an issue.
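
A typical cleaning pipeline might chain these steps (an illustrative sketch;
the column name ``P_el`` and the chosen threshold and frequency are only
examples taken from the doctests below)::

    df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    df = clean_and_space_equally_time_series(df, desired_freq="10s")
    df = create_on_off_signal(df, col_names=["P_el"], threshold=25,
                              col_names_new=["Device On"])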
22"""
import warnings
import logging
from datetime import datetime

import numpy as np
import pandas as pd
import scipy.stats as st
from scipy import signal
from sklearn import model_selection
from pandas.tseries.frequencies import to_offset

from ebcpy import data_types

logger = logging.getLogger(__name__)


def build_average_on_duplicate_rows(df):
    """
    If the DataFrame has duplicate indexes, the average
    value of all those indexes is calculated and assigned to
    the first occurrence of each duplicate index. Therefore,
    the DataFrame should already be sorted before calling this
    function.

    :param pd.DataFrame df:
        DataFrame with the data to process
    :return: pd.DataFrame
        The processed DataFrame

    Example:

    >>> df = pd.DataFrame({"idx": np.ones(5), "val": np.arange(5)}).set_index("idx")
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> print(df)
                         val
    idx
    2007-01-01 00:00:01    0
    2007-01-01 00:00:01    1
    2007-01-01 00:00:01    2
    2007-01-01 00:00:01    3
    2007-01-01 00:00:01    4
    >>> print(build_average_on_duplicate_rows(df))
                         val
    idx
    2007-01-01 00:00:01  2.0
    """
    # Find entries that are exactly the same timestamp
    double_ind = df.index[df.index.duplicated()].unique()
    # Calculate the mean value
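    # (roughly equivalent to df.groupby(level=0).mean() restricted to the
    # duplicated labels; kept explicit here for readability)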
    mean_values = []
    for item in double_ind:
        mean_values.append(df.loc[item].values.mean(axis=0))
    # Delete duplicate indices
    df_dropped = df[~df.index.duplicated(keep='first')].copy()

    # Set mean values in rows that were duplicates before
    for idx, values in zip(double_ind, mean_values):
        df_dropped.loc[idx] = values

    return df_dropped


def convert_index_to_datetime_index(df, unit_of_index="s", origin=None,
                                    inplace: bool = False):
    """
    Converts the index of the given DataFrame to a
    pandas.core.indexes.datetimes.DatetimeIndex.

    :param pd.DataFrame df:
        DataFrame whose index is not yet a DatetimeIndex.
        Only numeric indexes are supported. Every value
        is interpreted with the given unit; the default
        unit is seconds.
    :param str unit_of_index: default 's'
        The unit of the given index. Used to convert to
        total_seconds later on.
    :param datetime.datetime origin:
        The reference datetime object for the first index.
        Default is the current system time.
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: df
        Copy of DataFrame with correct index for usage in this
        framework.

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(df)
         A    B    C    D
    0  1.0  1.0  1.0  1.0
    1  1.0  1.0  1.0  1.0
    2  1.0  1.0  1.0  1.0
    >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 00:00:01  1.0  1.0  1.0  1.0
    2007-01-01 00:00:02  1.0  1.0  1.0  1.0

    """
    # Evaluate the origin at call time; a default of datetime.now() in the
    # signature would be frozen once at import time.
    if origin is None:
        origin = datetime.now()
    # Check for unit of given index. Maybe one uses hour-based data.
    _unit_conversion_to_seconds = {"ms": 1e3,
                                   "s": 1,
                                   "min": 1 / 60,
                                   "h": 1 / 3600,
                                   "d": 1 / 86400}
    if unit_of_index not in _unit_conversion_to_seconds:
        raise ValueError("Given unit_of_index is not supported.")
    _unit_factor_to_seconds = _unit_conversion_to_seconds.get(unit_of_index)

    # Convert
    old_index = df.index.copy()
    # Check if already converted:
    if isinstance(old_index, pd.DatetimeIndex):
        return df
    # Convert strings to numeric values.
    old_index = pd.to_numeric(old_index)
    # Convert to seconds.
    old_index /= _unit_factor_to_seconds
    # Alter the index
    index = pd.to_datetime(old_index, unit="s", origin=origin)
    if inplace:
        df.index = index
        return None
    df_copy = df.copy()
    df_copy.index = index
    return df_copy


def convert_datetime_index_to_float_index(df, offset=0, inplace: bool = False):
    """
    Convert a datetime-based index to a float index (in seconds).
    Seconds are used as the standard unit because simulation software
    (e.g. Modelica) outputs data in seconds.

    :param pd.DataFrame df:
        DataFrame to be converted to a float index
    :param float offset:
        Offset in seconds
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: pd.DataFrame df:
        DataFrame with the converted index

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 00:00:01  1.0  1.0  1.0  1.0
    2007-01-01 00:00:02  1.0  1.0  1.0  1.0
    >>> print(convert_datetime_index_to_float_index(df))
           A    B    C    D
    0.0  1.0  1.0  1.0  1.0
    1.0  1.0  1.0  1.0  1.0
    2.0  1.0  1.0  1.0  1.0
    """
    # Check correct input
    if not isinstance(df.index, pd.DatetimeIndex):
        raise IndexError("Given DataFrame has no DatetimeIndex, conversion not possible")

    new_index = np.round(pd.to_timedelta(df.index - df.index[0]).total_seconds(), 4) + offset
    if inplace:
        df.index = new_index
        return None
    df_copy = df.copy()
    df_copy.index = new_index
    return df_copy


def time_based_weighted_mean(df):
    """
    Creates the time-weighted mean for a time index that does not
    need to be equidistant.
    Further info:
    https://stackoverflow.com/questions/26343252/create-a-weighted-mean-for-a-irregular-timeseries-in-pandas

    :param pd.DataFrame df:
        A pandas DataFrame with a DatetimeIndex.
    :return np.array:
        A numpy array containing the weighted means of all columns

    Example:

    >>> from datetime import datetime
    >>> import numpy as np
    >>> import pandas as pd
    >>> time_vec = [datetime(2007,1,1,0,0),
    ...             datetime(2007,1,1,0,0),
    ...             datetime(2007,1,1,0,5),
    ...             datetime(2007,1,1,0,7),
    ...             datetime(2007,1,1,0,10)]
    >>> df = pd.DataFrame({'A': [1,2,4,3,6], 'B': [11,12,14,13,16]}, index=time_vec)
    >>> print(time_based_weighted_mean(df=df))
    [ 3.55 13.55]
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise IndexError(f"df.index must be DatetimeIndex, but it is {type(df.index)}.")

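    # Each sample is weighted by the sum of the time intervals to its two
    # neighbours (trapezoidal weighting). np.average normalises the weights,
    # so the result equals the time integral divided by the total duration.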
    time_delta = [(x - y).total_seconds() for x, y in zip(df.index[1:], df.index[:-1])]
    weights = [x + y for x, y in zip([0] + time_delta, time_delta + [0])]
    # Create empty numpy array
    res = np.empty(len(df.columns))
    res[:] = np.nan
    for i, col_name in enumerate(df.columns):
        res[i] = np.average(df[col_name], weights=weights)
    return res


def clean_and_space_equally_time_series(df, desired_freq, confidence_warning=0.95):
    """
    Function for cleaning the given DataFrame and interpolating it
    on the given desired frequency. Linear interpolation is used.

    :param pd.DataFrame df:
        Unclean DataFrame. Needs to have a pd.DateTimeIndex
    :param str desired_freq:
        Frequency to determine number of elements in processed dataframe.
        Options are for example:
        - s: second-based
        - 5s: Every 5 seconds
        - 6min: Every 6 minutes
        This also works for h, d, m, y, ms etc.
    :param float confidence_warning:
        Value to check the confidence interval of input data without
        a defined frequency. If the desired frequency is outside of
        the resulting confidence interval, a warning is issued.
    :return: pd.DataFrame
        Cleaned and equally spaced data-frame

    Example:
    **Note:** The example is for random data. Try out different sampling
    frequencies. You will be warned if the sampling rate is too high or too low.

    >>> df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
    ...                   columns=list('ABCD')).set_index("A").sort_index()
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> clean_and_space_equally_time_series(df, "30s")
    >>> import matplotlib.pyplot as plt
    >>> plt.plot(df["B"], label="Raw data")
    >>> df = clean_and_space_equally_time_series(df.copy(), "1500ms")
    >>> plt.plot(df["B"], label="Cleaned and spaced equally")
    >>> plt.legend()
    >>> plt.show()

    .. versionchanged:: 0.1.7
    """
    # Convert indexes to datetime_index:
    if not isinstance(df.index, pd.DatetimeIndex):
        if isinstance(df, data_types.TimeSeriesData):
            raise TypeError("TimeSeriesData needs a DateTimeIndex for executing this function. "
                            "Call convert_index_to_datetime_index() to convert any index to "
                            "a DateTimeIndex")
        # Else
        raise TypeError("DataFrame needs a DateTimeIndex for executing this function. "
                        "Call convert_index_to_datetime_index() to convert any index to "
                        "a DateTimeIndex")
    # %% Check DataFrame for NaNs
    # Create a pandas Series with the number of invalid values for each column of df
    series_with_na = df.isnull().sum()
    for name in series_with_na.index:
        if series_with_na.loc[name] > 0:
            # Log only columns with invalid values
            logger.info("%s has the following number of invalid "
                        "values\n %s", name, series_with_na.loc[name])
    # Drop all rows where at least one NaN exists
    df_temp = df.dropna(how='any')

    # Check if DataFrame still has non-numeric values:
    if not all(df_temp.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all())):
        raise ValueError("Given DataFrame contains non-numeric values.")

    # Merge duplicate rows using mean.
    df_temp = build_average_on_duplicate_rows(df_temp)

    # Make user warning for two cases: Upsampling and data input without a freq:
    # Check if the frequency differs
    old_freq, old_freq_std, old_freq_sem, time_steps = get_df_index_frequency_mean_and_std(
        df_index=df_temp.index,
        verbose=True)
    if old_freq_std > 0:
        _ns_to_s = 1e9
        # Calculate confidence interval of the mean value of the old frequency
        cfd_int = st.t.interval(confidence_warning,
                                time_steps - 1,
                                loc=old_freq,
                                scale=old_freq_sem)
        # Convert to timedelta (seconds to nanoseconds, pandas' default unit)
        cfd_int = pd.to_timedelta((cfd_int[0] * _ns_to_s, cfd_int[1] * _ns_to_s))
        _td_freq = pd.to_timedelta(desired_freq)
        if (_td_freq < cfd_int[0]) or (_td_freq > cfd_int[1]):
            in_seconds = np.array(cfd_int.values.tolist()) / _ns_to_s  # From nanoseconds
            warnings.warn(f"Input data has no frequency, but the desired frequency "
                          f"{_td_freq.value / _ns_to_s} seconds is outside the given "
                          f"confidence interval {in_seconds} (in seconds). "
                          "Carefully check the result to see if you "
                          "introduced errors to the data.")

    # %% Re-sampling to new frequency with linear interpolation
    # Create new equally spaced DatetimeIndex. Last entry is always <= df.index[-1]
    time_index = pd.date_range(start=df.index[0], end=df.index[-1], freq=desired_freq)
    new_freq, _ = get_df_index_frequency_mean_and_std(df_index=time_index)

    # Check if the user is trying to upsample the data:
    if old_freq_std == 0:
        if new_freq > old_freq:
            warnings.warn("You are upsampling your data. This may be dangerous. "
                          "Carefully check the result to see if you introduced errors to the data.")

    # Create an empty data frame
    # If multi-columns are used, first get the old index and make it empty:
    multi_cols = df_temp.columns
    if isinstance(multi_cols, pd.MultiIndex):
        empty_multi_cols = pd.MultiIndex.from_product([[] for _ in range(multi_cols.nlevels)],
                                                      names=multi_cols.names)
        df_time_temp = pd.DataFrame(index=time_index, columns=empty_multi_cols)
    else:
        df_time_temp = pd.DataFrame(index=time_index)

    # Insert temporary time_index into df. fill_value = 0 can only be used
    # because all NaNs were eliminated above.
    df_temp = df_temp.radd(df_time_temp, axis='index', fill_value=0)
    del df_time_temp

    # Interpolate linearly according to time index
    df_temp.interpolate(method='time', axis=0, inplace=True)
    # Determine the timedelta between the current first index entry
    # in df and the first index entry that would be created
    # when applying df.resample() without loffset
    delta_time = df.index[0] - \
                 df_temp.resample(rule=desired_freq).first().first(desired_freq).index[0]
    # Resample to equally spaced index.
    # All fields should already have a value. Thus NaNs and maybe +/- infs
    # should have been filtered beforehand.

    # Check if the given dataframe was a TimeSeriesData object and, if so, keep that type
    if isinstance(df_temp, data_types.TimeSeriesData):
        df_temp = df_temp.resample(rule=desired_freq).first()
        df_temp.index = df_temp.index + to_offset(delta_time)
        df_temp = data_types.TimeSeriesData(df_temp)
    else:
        df_temp = df_temp.resample(rule=desired_freq).first()
        df_temp.index = df_temp.index + to_offset(delta_time)
    del delta_time

    return df_temp


def low_pass_filter(data, crit_freq, filter_order):
    """
    Apply a low-pass Butterworth filter with the given order and
    critical frequency to the data.

    :param numpy.ndarray data:
        For a dataframe, pass e.g. df['a_col_name'].values
    :param float crit_freq:
        The critical frequency or frequencies.
    :param int filter_order:
        The order of the filter
    :return: numpy.ndarray
        The filtered data

    Example:

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> rand_series = np.random.rand(100)
    >>> plt.plot(rand_series, label="reference")
    >>> plt.plot(low_pass_filter(rand_series, 0.2, 2), label="filtered")
    >>> plt.legend()
    >>> plt.show()

    """
    if len(data.shape) > 1:  # Check if given data has multiple dimensions
        if data.shape[1] == 1:
            data = data[:, 0]  # Resize to 1D-Array
        else:
            raise ValueError("Given data has multiple dimensions. "
                             "Only one-dimensional arrays are supported in this function.")
    _filter_order = int(filter_order)
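    # For digital filters (analog=False), scipy interprets Wn as a fraction of
    # the Nyquist frequency, i.e. crit_freq must lie in the interval (0, 1).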
    numerator, denominator = signal.butter(N=_filter_order, Wn=crit_freq,
                                           btype='low', analog=False, output='ba')
    output = signal.filtfilt(numerator, denominator, data)
    return output


def moving_average(data, window):
    """
    Creates a moving average of the input series,
    returned as a numpy array.

    :param pd.Series data:
        For a dataframe, pass e.g. df['a_col_name'].values
    :param int window:
        Width of the moving-average window in samples
    :return: numpy.array
        Shape is (len(data),). The first and last points of the input series
        are extrapolated as constant values (hold first and last point).

    Example:

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> series = np.sin(np.linspace(-30, 30, 1000))
    >>> plt.plot(series, label="reference")
    >>> plt.plot(moving_average(series, 10), label="window=10")
    >>> plt.plot(moving_average(series, 50), label="window=50")
    >>> plt.plot(moving_average(series, 100), label="window=100")
    >>> plt.legend()
    >>> plt.show()

    """
    if len(data.shape) > 1:  # Check if given data has multiple dimensions
        if data.shape[1] == 1:
            data = data[:, 0]  # Resize to 1D-Array
        else:
            raise ValueError("Given data has multiple dimensions. "
                             "Only one-dimensional arrays are supported in this function.")
    window = int(window)
    weights = np.repeat(1.0, window) / window
    sma = np.convolve(data, weights, 'valid')
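    # 'valid' convolution returns only len(data) - window + 1 points; the two
    # fill arrays below pad the result back to the original length by holding
    # the first and last filtered values constant.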
    # Create array with first entries and window/2 elements
    fill_start = np.full((int(np.floor(window / 2)), 1), sma[0])
    # Same with last value of -data-
    fill_end = np.full((int(np.ceil(window / 2)) - 1, 1), sma[-1])
    # Stack the arrays
    sma = np.concatenate((fill_start[:, 0], sma, fill_end[:, 0]), axis=0)
    return sma


def create_on_off_signal(df, col_names, threshold, col_names_new,
                         tags="raw", new_tag="converted_signal"):
    """
    Create on and off signals based on the given threshold for all column names.

    :param pd.DataFrame df:
        DataFrame with the data to process
    :param list col_names:
        Column names of variables to convert to signals
    :param float,list threshold:
        Threshold for all column names (single float) or
        a list with specific thresholds for specific columns.
    :param list col_names_new:
        New names for the signal columns
    :param str,list tags:
        If a 2-level DataFrame for TimeSeriesData is used, one has to
        specify the tag of the variables. The default is to use the "raw"
        tag set in the TimeSeriesData class. However, one can specify a list
        (a different tag for each variable) or pass a string
        (the same tag for all given variables).
    :param str new_tag:
        The tag the newly created variable will hold. This can be used to
        indicate where the signal was converted from.
    :return: pd.DataFrame
        Copy of the DataFrame with the created signals added.

    Example:

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> df = pd.DataFrame({"P_el": np.sin(np.linspace(-20, 20, 10000))*100})
    >>> df = create_on_off_signal(df, col_names=["P_el"],
    ...                           threshold=25, col_names_new=["Device On"])
    >>> plt.plot(df)
    >>> plt.show()
    """
    if len(col_names) != len(col_names_new):
        raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, "
                         f"col_names_new: {len(col_names_new)}")
    if isinstance(threshold, list):
        if len(col_names) != len(threshold):
            raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, "
                             f"threshold: {len(threshold)}")
    else:
        threshold = [threshold for _ in enumerate(col_names)]
    # Do on_off signal creation for all desired columns
    df_copy = df.copy()
    if isinstance(df.columns, pd.MultiIndex):
        # Convert given tags to a list
        if isinstance(tags, str):
            tags = [tags for _ in enumerate(col_names)]

        for i, _ in enumerate(col_names):
            # Create zero-array
            df_copy.loc[:, (col_names_new[i], new_tag)] = 0.0
            # Change all values to 1.0 according to threshold
            df_copy.loc[
                df_copy[col_names[i], tags[i]] >= threshold[i], (col_names_new[i], new_tag)] = 1.0
    else:
        for i, _ in enumerate(col_names):
            # Create zero-array
            df_copy.loc[:, col_names_new[i]] = 0.0
            # Change all values to 1.0 according to threshold
            df_copy.loc[df_copy[col_names[i]] >= threshold[i], col_names_new[i]] = 1.0
    return df_copy


def number_lines_totally_na(df):
    """
    Returns the number of rows in the given dataframe
    that are filled only with NaN values.

    :param pd.DataFrame df:
        Given dataframe to process
    :return: int
        Number of all-NaN rows.

    Example:

    >>> import numpy as np
    >>> import pandas as pd
    >>> dim = np.random.randint(100) + 10
    >>> nan_col = [np.nan for i in range(dim)]
    >>> col = [i for i in range(dim)]
    >>> df_nan = pd.DataFrame({"col_1": nan_col, "col_2": nan_col})
    >>> df_normal = pd.DataFrame({"col_1": nan_col, "col_2": col})
    >>> print(number_lines_totally_na(df_nan) - dim)
    0
    >>> print(number_lines_totally_na(df_normal))
    0
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Input must be a pandas data frame')
    counter = 0
    for _, row in df.iterrows():
        # Check if the whole row is filled with NaNs.
        if all(row.isnull()):
            counter += 1
    return counter


def z_score(x, limit=3):
    """
    Calculate the z-score using the mean
    and standard deviation of the given data.

    :param np.array x:
        For a dataframe, pass e.g. df['a_col_name'].values
    :param float limit: default 3
        Threshold for the absolute z-score above which
        a value is flagged as an outlier
    :return: np.array:
        Indices of the values whose absolute z-score exceeds the limit

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]

    """
    mean = np.mean(x)
    standard_deviation = np.std(x)
    z_score_value = (x - mean) / standard_deviation
    return np.where(np.abs(z_score_value) > limit)[0]


def modified_z_score(x, limit=3.5):
    """
    Calculate the modified z-score using the median
    and median absolute deviation of the given data.

    :param np.array x:
        For a dataframe, pass e.g. df['a_col_name'].values
    :param float limit: default 3.5
        Threshold for the absolute modified z-score above which
        a value is flagged as an outlier
    :return: np.array:
        Indices of the values whose absolute modified z-score exceeds the limit

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = modified_z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]

    """
    median = np.median(x)
    median_average_deviation = np.median(np.abs(x - median))
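    # 0.6745 is approximately the 0.75 quantile of the standard normal
    # distribution; it scales the median absolute deviation so the score is
    # comparable to the ordinary z-score for normally distributed data.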
    z_score_mod = 0.6745 * (x - median) / median_average_deviation
    return np.where(np.abs(z_score_mod) > limit)[0]


def interquartile_range(x):
    """
    Calculate the interquartile range (IQR) of the given array.
    Returns the indices of values that lie more than 1.5 * IQR
    below the first or above the third quartile.

    :param np.array x:
        For a dataframe, pass e.g. df['a_col_name'].values
    :return: np.array:
        Indices of the values outside the interquartile-range fences

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = interquartile_range(normal_dis)
    >>> values = normal_dis[res]

    """
    quartile_1, quartile_3 = np.percentile(x, [25, 75])
    iqr = quartile_3 - quartile_1
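    # Tukey's fences: values more than 1.5 * IQR below the first or above the
    # third quartile are flagged as outliers.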
    lower = quartile_1 - (iqr * 1.5)
    upper = quartile_3 + (iqr * 1.5)
    return np.where((x > upper) | (x < lower))[0]


def cross_validation(x, y, test_size=0.3):
    """
    Split the data set randomly with test_size
    (if test_size = 0.30, 70 % are training data).
    You can use this function for segmentation tasks.
    Time-series data should not be split with this function,
    as the resulting sets are not coherent in time.

    :param x:
        Indexables with same length / shape[0] as y.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    :param list,np.ndarray,pd.DataFrame y:
        Indexables with same length / shape[0] as x.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    :param float test_size:
        Value between 0 and 1 specifying what percentage of the data
        will be used for testing.
    :return: list
        Split data as 4 objects. The order is:
        x_train, x_test, y_train, y_test

    Example:

    >>> import numpy as np
    >>> x = np.random.rand(100)
    >>> y = np.random.rand(100)
    >>> ret = cross_validation(x, y)
    >>> len(ret)
    4
    """
    return model_selection.train_test_split(x, y, test_size=test_size)


def get_df_index_frequency_mean_and_std(df_index: pd.Index, verbose: bool = False):
    """
    Function to get the mean and std of the index frequency.
    If the index is a DatetimeIndex, the step sizes are converted
    from nanoseconds to seconds.
    Otherwise, the index values are assumed to be in seconds already.

    :param pd.Index df_index:
        Time index.
    :param bool verbose:
        Default False. If True, the standard error of the mean and the
        number of time steps are returned in addition to the mean value
        and standard deviation.

    :returns:
        float: Mean value
        float: Standard deviation
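
    Example:

    A minimal sketch; the plain float index is assumed to be in seconds.

    >>> import pandas as pd
    >>> mean, std = get_df_index_frequency_mean_and_std(pd.Index([0.0, 1.0, 2.0, 4.0]))
    >>> print(round(mean, 2), round(std, 2))
    1.33 0.47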
681 """

    if isinstance(df_index, pd.DatetimeIndex):
        index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64) * 1e-9
    else:
        index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64)
    if verbose:
        return np.mean(index_in_s), np.std(index_in_s), st.sem(index_in_s), len(index_in_s)
    return np.mean(index_in_s), np.std(index_in_s)