Coverage for ebcpy/preprocessing.py: 92% (177 statements)
1"""
2This general overview may help you find the function you need:
4- Remove duplicate rows by averaging the values
5 (``build_average_on_duplicate_rows``)
6- Convert any integer or float index into a datetime index
7 (``convert_index_to_datetime_index``)
8- Resample a given time-series on a given frequency
9 (``clean_and_space_equally_time_series``)
10- Apply a low-pass-filter (``low_pass_filter``)
11- Apply a moving average to flatten disturbances
12 in your measured data (``moving_average``)
13- Convert e.g. an electrical power signal into a binary
14 control signal (on-off) based on a threshold (``create_on_off_signal``)
15- Find the number of lines without any values in it (``number_lines_totally_na``)
16- Split a data-set into training and test set according to
17 cross-validation (``cross_validation``)

All functions in the pre-processing module should have a doctest. Refer to the
examples in these doctests for a better understanding of the functions. If you
don't understand the behaviour or purpose of a function, please raise an issue.
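
For instance, a typical clean-up chain might look like this (a minimal
sketch using made-up data; the assignments produce no doctest output):

>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"val": np.arange(3.0)})
>>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
>>> df = clean_and_space_equally_time_series(df, desired_freq="1s")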
22"""
import warnings
import logging
from typing import Union, TYPE_CHECKING

from datetime import datetime
from scipy import signal
from sklearn import model_selection
from pandas.tseries.frequencies import to_offset
import numpy as np
import pandas as pd
import scipy.stats as st

if TYPE_CHECKING:
    from ebcpy import TimeSeriesData

logger = logging.getLogger(__name__)


def build_average_on_duplicate_rows(df: Union[pd.DataFrame, "TimeSeriesData"]) -> pd.DataFrame:
    """
    If the DataFrame has duplicate indexes, the average
    value of all rows sharing an index is calculated and assigned to
    the first occurrence of that duplicate index. Therefore,
    the DataFrame should already be sorted before calling this
    function.

    :param pd.DataFrame df:
        DataFrame with the data to process
    :return: pd.DataFrame
        The processed DataFrame

    Example:

    >>> df = pd.DataFrame({"idx": np.ones(5), "val": np.arange(5)}).set_index("idx")
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> print(df)
                         val
    idx
    2007-01-01 00:00:01    0
    2007-01-01 00:00:01    1
    2007-01-01 00:00:01    2
    2007-01-01 00:00:01    3
    2007-01-01 00:00:01    4
    >>> print(build_average_on_duplicate_rows(df))
                         val
    idx
    2007-01-01 00:00:01  2.0
    """
    # Find entries that share exactly the same timestamp
    double_ind = df.index[df.index.duplicated()].unique()
    # Calculate the mean value for each group of duplicates
    mean_values = []
    for item in double_ind:
        mean_values.append(df.loc[item].values.mean(axis=0))
    # Delete duplicate indices
    df_dropped = df[~df.index.duplicated(keep='first')].copy()

    # Set mean values in rows that were duplicates before
    for idx, values in zip(double_ind, mean_values):
        df_dropped.loc[idx] = values

    return df_dropped


def convert_index_to_datetime_index(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        unit_of_index: str = "s",
        origin: datetime = datetime.now(),
        inplace: bool = False
) -> pd.DataFrame:
    """
    Converts the index of the given DataFrame to a
    pandas.core.indexes.datetimes.DatetimeIndex.

    :param pd.DataFrame,TimeSeriesData df:
        DataFrame whose index is not yet a DatetimeIndex.
        Only numeric indexes are supported. Every value
        is interpreted with the given unit; the default
        is seconds.
    :param str unit_of_index: default 's'
        The unit of the given index. Used to convert to
        total_seconds later on.
    :param datetime.datetime origin:
        The reference datetime object for the first index.
        Default is the current system time.
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: df
        Copy of DataFrame with correct index for usage in this
        framework.

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(df)
         A    B    C    D
    0  1.0  1.0  1.0  1.0
    1  1.0  1.0  1.0  1.0
    2  1.0  1.0  1.0  1.0
    >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 00:00:01  1.0  1.0  1.0  1.0
    2007-01-01 00:00:02  1.0  1.0  1.0  1.0

    """
    # Check the unit of the given index. Maybe one uses hour-based data.
    _unit_conversion_to_seconds = {"ms": 1e3,
                                   "s": 1,
                                   "min": 1 / 60,
                                   "h": 1 / 3600,
                                   "d": 1 / 86400}
    if unit_of_index not in _unit_conversion_to_seconds:
        raise ValueError("Given unit_of_index is not supported.")
    _unit_factor_to_seconds = _unit_conversion_to_seconds.get(unit_of_index)

    # Convert
    old_index = df.index.copy()
    # Check if already converted:
    if isinstance(old_index, pd.DatetimeIndex):
        return df
    # Convert strings to numeric values.
    old_index = pd.to_numeric(old_index)
    # Convert to seconds.
    old_index /= _unit_factor_to_seconds
    # Alter the index
    index = pd.to_datetime(old_index, unit="s", origin=origin)
    if inplace:
        df.index = index
        return None
    df_copy = df.copy()
    df_copy.index = index
    return df_copy


def convert_datetime_index_to_float_index(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        offset: float = 0,
        inplace: bool = False
) -> pd.DataFrame:
    """
    Convert a datetime-based index to a float index (in seconds).
    Seconds are used as the standard unit, as simulation software
    (e.g. Modelica) outputs data in seconds.

    :param pd.DataFrame,TimeSeriesData df:
        DataFrame to be converted to a float index
    :param float offset:
        Offset in seconds
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: pd.DataFrame df:
        DataFrame with correct index

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 00:00:01  1.0  1.0  1.0  1.0
    2007-01-01 00:00:02  1.0  1.0  1.0  1.0
    >>> print(convert_datetime_index_to_float_index(df))
           A    B    C    D
    0.0  1.0  1.0  1.0  1.0
    1.0  1.0  1.0  1.0  1.0
    2.0  1.0  1.0  1.0  1.0
    """
    # Check correct input
    if not isinstance(df.index, pd.DatetimeIndex):
        raise IndexError("Given DataFrame has no DatetimeIndex, conversion not possible")

    new_index = np.round(pd.to_timedelta(df.index - df.index[0]).total_seconds(), 4) + offset
    if inplace:
        df.index = new_index
        return None
    df_copy = df.copy()
    df_copy.index = new_index
    return df_copy


def time_based_weighted_mean(df: Union[pd.DataFrame, "TimeSeriesData"]) -> np.ndarray:
    """
    Calculate the time-weighted mean of each column; the time index
    does not need to be equidistant.
    Further info:
    https://stackoverflow.com/questions/26343252/create-a-weighted-mean-for-a-irregular-timeseries-in-pandas

    :param pd.DataFrame df:
        A pandas DataFrame with DatetimeIndex.
    :return np.array:
        A numpy array containing the weighted mean of every column

    Example:

    >>> from datetime import datetime
    >>> import numpy as np
    >>> import pandas as pd
    >>> time_vec = [datetime(2007, 1, 1, 0, 0),
    ...             datetime(2007, 1, 1, 0, 0),
    ...             datetime(2007, 1, 1, 0, 5),
    ...             datetime(2007, 1, 1, 0, 7),
    ...             datetime(2007, 1, 1, 0, 10)]
    >>> df = pd.DataFrame({'A': [1, 2, 4, 3, 6], 'B': [11, 12, 14, 13, 16]}, index=time_vec)
    >>> print(time_based_weighted_mean(df=df))
    [ 3.55 13.55]
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise IndexError(f"df.index must be DatetimeIndex, but it is {type(df.index)}.")

    time_delta = [(x - y).total_seconds() for x, y in zip(df.index[1:], df.index[:-1])]
    # Trapezoidal weights: each sample is weighted with the time it spans
    # to its neighbouring samples.
    weights = [x + y for x, y in zip([0] + time_delta, time_delta + [0])]
    # Create empty numpy array
    res = np.empty(len(df.columns))
    res[:] = np.nan
    for i, col_name in enumerate(df.columns):
        res[i] = np.average(df[col_name], weights=weights)
    return res


def clean_and_space_equally_time_series(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        desired_freq: str,
        confidence_warning: float = 0.95
) -> pd.DataFrame:
    """
    Function for cleaning the given DataFrame and interpolating
    based on the given desired frequency. Linear interpolation
    is used.

    :param pd.DataFrame,TimeSeriesData df:
        Unclean DataFrame. Needs to have a pd.DateTimeIndex
    :param str desired_freq:
        Frequency to determine number of elements in processed dataframe.
        Options are for example:
        - s: second-based
        - 5s: Every 5 seconds
        - 6min: Every 6 minutes
        This also works for h, d, m, y, ms etc.
    :param float confidence_warning:
        Value to check the confidence interval of input data without
        a defined frequency. If the desired frequency is outside of
        the resulting confidence interval, a warning is issued.
    :return: pd.DataFrame
        Cleaned and equally spaced DataFrame

    Example:
    **Note:** The example uses random data. Try out different sampling
    frequencies. You will be warned if the sampling rate is too high or too low.

    >>> df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
    ...                   columns=list('ABCD')).set_index("A").sort_index()
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> clean_and_space_equally_time_series(df, "30s")
    >>> import matplotlib.pyplot as plt
    >>> plt.plot(df["B"], label="Raw data")
    >>> df = clean_and_space_equally_time_series(df.copy(), "1500ms")
    >>> plt.plot(df["B"], label="Cleaned and equally spaced")
    >>> plt.legend()
    >>> plt.show()

    .. versionchanged:: 0.1.7
    """
    from ebcpy import TimeSeriesData

    # Convert index to a DatetimeIndex:
    if not isinstance(df.index, pd.DatetimeIndex):
        if isinstance(df, TimeSeriesData):
            raise TypeError("TimeSeriesData needs a DateTimeIndex for executing this function. "
                            "Call to_datetime_index() to convert any index to "
                            "a DateTimeIndex")
        # Else
        raise TypeError("DataFrame needs a DateTimeIndex for executing this function. "
                        "Call convert_index_to_datetime_index() to convert any index to "
                        "a DateTimeIndex")

    # %% Check DataFrame for NaNs
    # Create a pandas Series with the number of invalid values for each column of df
    series_with_na = df.isnull().sum()
    for name in series_with_na.index:
        if series_with_na.loc[name] > 0:
            # Log only columns with invalid values
            logger.info("%s has following number of invalid "
                        "values\n %s", name, series_with_na.loc[name])
    # Drop all rows where at least one NaN exists
    df_temp = df.dropna(how='any')

    # Check if DataFrame still has non-numeric values:
    if not all(df_temp.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all())):
        raise ValueError("Given DataFrame contains non-numeric values.")

    # Merge duplicate rows using mean.
    df_temp = build_average_on_duplicate_rows(df_temp)

    # Warn the user in two cases: upsampling and data input without a frequency.
    # Check if the frequency differs
    old_freq, old_freq_std, old_freq_sem, time_steps = get_df_index_frequency_mean_and_std(
        df_index=df_temp.index,
        verbose=True)
    if old_freq_std > 0:
        _ns_to_s = 1e9
        # Calculate the confidence interval of the mean value of the old frequency
        cfd_int = st.t.interval(confidence_warning,
                                time_steps - 1,
                                loc=old_freq,
                                scale=old_freq_sem)
        # Convert to timedelta
        cfd_int = pd.to_timedelta((cfd_int[0] * _ns_to_s, cfd_int[1] * _ns_to_s))
        _td_freq = pd.to_timedelta(desired_freq)
        if (_td_freq < cfd_int[0]) or (_td_freq > cfd_int[1]):
            in_seconds = np.array(cfd_int.values.tolist()) / _ns_to_s  # From nanoseconds
            warnings.warn(f"Input data has no frequency, but the desired frequency "
                          f"{_td_freq.value / _ns_to_s} seconds is outside the given "
                          f"confidence interval {in_seconds} (in seconds). "
                          "Carefully check the result to see if you "
                          "introduced errors to the data.")

    # %% Re-sampling to new frequency with linear interpolation
    # Create new equally spaced DatetimeIndex. Last entry is always < df.index[-1]
    time_index = pd.date_range(start=df.index[0], end=df.index[-1], freq=desired_freq)
    new_freq, _ = get_df_index_frequency_mean_and_std(df_index=time_index)

    # Check if the user is trying to upsample the data:
    if old_freq_std == 0:
        if new_freq > old_freq:
            warnings.warn("You are upsampling your data. This may be dangerous. "
                          "Carefully check the result to see if you introduced errors to the data.")

    # Create an empty data frame
    # If multi-columns are used, first get the old column index and make it empty:
    multi_cols = df_temp.columns
    if isinstance(multi_cols, pd.MultiIndex):
        empty_multi_cols = pd.MultiIndex.from_product([[] for _ in range(multi_cols.nlevels)],
                                                      names=multi_cols.names)
        df_time_temp = pd.DataFrame(index=time_index, columns=empty_multi_cols)
    else:
        df_time_temp = pd.DataFrame(index=time_index)

    # Insert temporary time_index into df. fill_value = 0 can only be used
    # since all NaNs were eliminated above.
    df_temp = df_temp.radd(df_time_temp, axis='index', fill_value=0)
    del df_time_temp

    # Interpolate linearly according to the time index
    df_temp.interpolate(method='time', axis=0, inplace=True)
    # Determine the timedelta between the current first index entry
    # in df and the first index entry that would be created
    # when applying df.resample() without loffset
    delta_time = df.index[0] - \
                 df_temp.resample(rule=desired_freq).first().first(desired_freq).index[0]
    # Resample to an equally spaced index.
    # All fields should already have a value. Thus NaNs and maybe +/- infs
    # should have been filtered beforehand.

    # Check if the given dataframe was a TimeSeriesData object and, if so, keep the type
    if isinstance(df_temp, TimeSeriesData):
        df_temp = df_temp.resample(rule=desired_freq).first()
        df_temp.index = df_temp.index + to_offset(delta_time)
        df_temp = TimeSeriesData(df_temp)
    else:
        df_temp = df_temp.resample(rule=desired_freq).first()
        df_temp.index = df_temp.index + to_offset(delta_time)
    del delta_time

    return df_temp


def low_pass_filter(data: np.ndarray, crit_freq: float, filter_order: int) -> np.ndarray:
    """
    Apply a low-pass (Butterworth) filter with the given order and
    critical frequency to the data.

    :param numpy.ndarray data:
        For a dataframe, e.g. df['a_col_name'].values
    :param float crit_freq:
        The critical frequency or frequencies.
    :param int filter_order:
        The order of the filter
    :return: numpy.ndarray

    Example:

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> rand_series = np.random.rand(100)
    >>> plt.plot(rand_series, label="reference")
    >>> plt.plot(low_pass_filter(rand_series, 0.2, 2), label="filtered")
    >>> plt.legend()
    >>> plt.show()
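
    As a quick numeric sanity check (an editorial sketch, not one of the
    original examples): a Butterworth low-pass has unit DC gain, so a
    constant signal passes through unchanged.

    >>> out = low_pass_filter(np.ones(10), 0.2, 2)
    >>> bool(np.allclose(out, 1.0))
    True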
414 """
    if len(data.shape) > 1:  # Check if given data has multiple dimensions
        if data.shape[1] == 1:
            data = data[:, 0]  # Resize to 1D-array
        else:
            raise ValueError("Given data has multiple dimensions. "
                             "Only one-dimensional arrays are supported in this function.")
    _filter_order = int(filter_order)
    numerator, denominator = signal.butter(N=_filter_order, Wn=crit_freq,
                                           btype='low', analog=False, output='ba')
    output = signal.filtfilt(numerator, denominator, data)
    return output


def moving_average(data: np.ndarray, window: int) -> np.ndarray:
    """
    Create a moving average of the input array.

    :param np.ndarray data:
        For a dataframe, e.g. df['a_col_name'].values
    :param int window:
        Window size of the moving average
    :return: numpy.array
        shape has (###,). First and last points of the input are extrapolated as constant
        values (hold first and last point).

    Example:

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> series = np.sin(np.linspace(-30, 30, 1000))
    >>> plt.plot(series, label="reference")
    >>> plt.plot(moving_average(series, 10), label="window=10")
    >>> plt.plot(moving_average(series, 50), label="window=50")
    >>> plt.plot(moving_average(series, 100), label="window=100")
    >>> plt.legend()
    >>> plt.show()
    """
    if len(data.shape) > 1:  # Check if given data has multiple dimensions
        if data.shape[1] == 1:
            data = data[:, 0]  # Resize to 1D-array
        else:
            raise ValueError("Given data has multiple dimensions. "
                             "Only one-dimensional arrays are supported in this function.")
    window = int(window)
    weights = np.repeat(1.0, window) / window
    sma = np.convolve(data, weights, 'valid')
    # Create an array with the first entry repeated floor(window/2) times
    fill_start = np.full((int(np.floor(window / 2)), 1), sma[0])
    # Same with the last value of the averaged data
    fill_end = np.full((int(np.ceil(window / 2)) - 1, 1), sma[-1])
    # Stack the arrays
    sma = np.concatenate((fill_start[:, 0], sma, fill_end[:, 0]), axis=0)
    return sma


def create_on_off_signal(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        col_names: list,
        threshold: Union[float, list],
        col_names_new: list,
        tags: Union[list, str] = "raw",
        new_tag: str = "converted_signal"
):
    """
    Create on and off signals based on the given threshold for all column names.

    :param pd.DataFrame,TimeSeriesData df:
        DataFrame with the data to process
    :param list col_names:
        Column names of variables to convert to signals
    :param float,list threshold:
        Threshold for all column names (single float) or
        a list with specific thresholds for specific columns.
    :param list col_names_new:
        New names for the signal columns
    :param str,list tags:
        If a 2-level DataFrame for TimeSeriesData is used, one has to
        specify the tag of the variables. The default is the "raw"
        tag set in the TimeSeriesData class. However, one can specify a list
        (a different tag for each variable), or one can pass a string
        (the same tag for all given variables).
    :param str new_tag:
        The tag the newly created variable will hold. This can be used to
        indicate where the signal was converted from.
    :return: pd.DataFrame
        Copy of DataFrame with the created signals added.

    Example:

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> df = pd.DataFrame({"P_el": np.sin(np.linspace(-20, 20, 10000))*100})
    >>> df = create_on_off_signal(df, col_names=["P_el"],
    ...                           threshold=25, col_names_new=["Device On"])
    >>> plt.plot(df)
    >>> plt.show()
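
    For 2-level (TimeSeriesData-style) columns, the tag has to be passed as
    well. A minimal sketch (the ("P_el", "raw") column layout below is
    assumed purely for illustration):

    >>> df_multi = pd.DataFrame({("P_el", "raw"): np.sin(np.linspace(-20, 20, 100)) * 100})
    >>> df_multi = create_on_off_signal(df_multi, col_names=["P_el"], threshold=25,
    ...                                 col_names_new=["Device On"], tags="raw")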
512 """
    if len(col_names) != len(col_names_new):
        raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, "
                         f"col_names_new: {len(col_names_new)}")
    if isinstance(threshold, list):
        if len(col_names) != len(threshold):
            raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, "
                             f"threshold: {len(threshold)}")
    else:
        threshold = [threshold for _ in enumerate(col_names)]
    # Create the on-off signal for all desired columns
    df_copy = df.copy()
    if isinstance(df.columns, pd.MultiIndex):
        # Convert given tags to a list
        if isinstance(tags, str):
            tags = [tags for _ in enumerate(col_names)]

        for i, _ in enumerate(col_names):
            # Create zero-array
            df_copy.loc[:, (col_names_new[i], new_tag)] = 0.0
            # Change all values to 1.0 according to threshold
            df_copy.loc[
                df_copy[col_names[i], tags[i]] >= threshold[i], (col_names_new[i], new_tag)] = 1.0
    else:
        for i, _ in enumerate(col_names):
            # Create zero-array
            df_copy.loc[:, col_names_new[i]] = 0.0
            # Change all values to 1.0 according to threshold
            df_copy.loc[df_copy[col_names[i]] >= threshold[i], col_names_new[i]] = 1.0
    return df_copy


def number_lines_totally_na(df: Union[pd.DataFrame, "TimeSeriesData"]) -> int:
    """
    Returns the number of rows in the given dataframe
    that are completely filled with NaN values.

    :param pd.DataFrame,TimeSeriesData df:
        Given dataframe to process
    :return: int
        Number of NaN-rows.

    Example:

    >>> import numpy as np
    >>> import pandas as pd
    >>> dim = np.random.randint(100) + 10
    >>> nan_col = [np.nan for i in range(dim)]
    >>> col = [i for i in range(dim)]
    >>> df_nan = pd.DataFrame({"col_1": nan_col, "col_2": nan_col})
    >>> df_normal = pd.DataFrame({"col_1": nan_col, "col_2": col})
    >>> print(number_lines_totally_na(df_nan) - dim)
    0
    >>> print(number_lines_totally_na(df_normal))
    0
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Input must be a pandas DataFrame')
    counter = 0
    for _, row in df.iterrows():
        # Check if the whole row is filled with NaNs.
        if all(row.isnull()):
            counter += 1
    return counter


def z_score(x: np.ndarray, limit=3) -> np.ndarray:
    """
    Calculate the z-score using the mean
    and standard deviation of the given data.

    :param np.array x:
        For a dataframe, e.g. df['a_col_name'].values
    :param float limit: default 3
        Lower limit for the required z-score
    :return: np.array:
        Indices of all values whose absolute z-score exceeds the limit

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]
    """
    mean = np.mean(x)
    standard_deviation = np.std(x)
    z_score_value = (x - mean) / standard_deviation
    return np.where(np.abs(z_score_value) > limit)[0]


def modified_z_score(x: np.ndarray, limit: float = 3.5) -> np.ndarray:
    """
    Calculate the modified z-score using the median
    and median absolute deviation of the given data.

    :param np.array x:
        For a dataframe, e.g. df['a_col_name'].values
    :param float limit: default 3.5
        Lower limit for the required modified z-score
    :return: np.array:
        Indices of all values whose absolute modified z-score exceeds the limit

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = modified_z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]
    """
    median = np.median(x)
    median_absolute_deviation = np.median(np.abs(x - median))
    z_score_mod = 0.6745 * (x - median) / median_absolute_deviation
    return np.where(np.abs(z_score_mod) > limit)[0]


def interquartile_range(x: np.ndarray) -> np.ndarray:
    """
    Calculate the interquartile range (IQR) of the given array.
    Returns the indices of values outside the 1.5 * IQR fences.

    :param np.array x:
        For a dataframe, e.g. df['a_col_name'].values
    :return: np.array iqr:
        Indices of all values outside the interquartile-range fences

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = interquartile_range(normal_dis)
    >>> values = normal_dis[res]
    """
    quartile_1, quartile_3 = np.percentile(x, [25, 75])
    iqr = quartile_3 - quartile_1
    lower = quartile_1 - (iqr * 1.5)
    upper = quartile_3 + (iqr * 1.5)
    return np.where((x > upper) | (x < lower))[0]


def cross_validation(x, y, test_size=0.3):
    """
    Split the data set randomly with test_size
    (if test_size = 0.30, 70 % of the data are training data).
    You can use this function for segmentation tasks.
    Time-series data should not be split with this function,
    as the results would not be coherent (time-wise).

    :param x:
        Indexables with same length / shape[0] as y.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    :param list,np.ndarray,pd.DataFrame y:
        Indexables with same length / shape[0] as x.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    :param float test_size:
        Value between 0 and 1 specifying what percentage of the data
        will be used for testing.
    :return: list
        Split data as 4 objects. The order is:
        x_train, x_test, y_train, y_test

    Example:

    >>> import numpy as np
    >>> x = np.random.rand(100)
    >>> y = np.random.rand(100)
    >>> ret = cross_validation(x, y)
    >>> len(ret)
    4
    """
    return model_selection.train_test_split(x, y, test_size=test_size)


def get_df_index_frequency_mean_and_std(df_index: pd.Index, verbose: bool = False):
    """
    Function to get the mean and standard deviation of the index frequency.
    If the index is a DatetimeIndex, the differences are converted from
    nanoseconds to seconds.
    Else, values are assumed to be in seconds already.

    :param pd.Index df_index:
        Time index.
    :param bool verbose:
        Default False. If True, the standard error of the mean and the
        number of time steps are returned in addition to the mean value
        and the standard deviation.

    :returns:
        float: Mean value
        float: Standard deviation
    """
    if isinstance(df_index, pd.DatetimeIndex):
        index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64) * 1e-9
    else:
        index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64)
    if verbose:
        return np.mean(index_in_s), np.std(index_in_s), st.sem(index_in_s), len(index_in_s)
    return np.mean(index_in_s), np.std(index_in_s)