Coverage for agentlib_flexquant/utils/data_handling.py: 75%

44 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2025-08-15 15:25 +0000

1import pandas as pd 

2from typing import Literal 

3from agentlib_mpc.utils import TimeConversionTypes, TIME_CONVERSION 

4 

5 

# Names of the supported NaN-filling strategies.
MEAN: str = "mean"
INTERPOLATE: str = "interpolate"

# Literal[...] must be parameterised with literal values, not variables, to be
# understood by static type checkers (PEP 586). Keep these strings in sync with
# the MEAN and INTERPOLATE constants above.
FillNansMethods = Literal["mean", "interpolate"]

9 

10 

def fill_nans(series: pd.Series, method: FillNansMethods) -> pd.Series:
    """Fill NaN values in the series with the given method.

    Args:
        series: the series to be filled
        method: the method to be applied, there are two predefined
            - mean: fill NaN values with the mean of the following values.
            - interpolate: interpolate missing values over the index
              (``method="index"``), filling in both directions.

    Returns:
        A pd.Series with nan filled.

    Raises:
        ValueError: if ``method`` is not one of the supported methods, or if
            NaN values are still present after the filling was applied.

    """
    if method == MEAN:
        series = _set_mean_values(series=series)
    elif method == INTERPOLATE:
        # Interpolate missing values
        series = series.interpolate(method="index", limit_direction="both")
    else:
        # Fail loudly instead of returning the series unfilled or reporting
        # that a filling which never ran left NaN values behind.
        raise ValueError(
            f"Unknown method {method!r} for filling NaN values, "
            f"expected one of {MEAN!r} or {INTERPOLATE!r}."
        )

    if series.isna().any():
        raise ValueError(f"NaN values are still present in the series after filling them with the method {method}\n{series}")
    return series

33 

34 

35def _set_mean_values(series: pd.Series) -> pd.Series: 

36 """Fill intervals including the nan with the mean of the following values before the next nan.""" 

37 def _get_intervals_for_mean(s: pd.Series) -> list[pd.Interval]: 

38 intervals = [] 

39 start = None 

40 for index, value in s.items(): 

41 if pd.isna(value): 

42 if pd.isna(start): 

43 start = index 

44 else: 

45 end = index 

46 intervals.append(pd.Interval(left=start, right=end, closed="left")) 

47 start = end 

48 return intervals 

49 

50 for interval in _get_intervals_for_mean(series): 

51 interval_index = (interval.left <= series.index) & (series.index < interval.right) 

52 series[interval.left] = series[interval_index].mean(skipna=True) 

53 

54 # remove last entry if nan, e.g. with collocation 

55 if pd.isna(series.iloc[-1]): 

56 series = series.iloc[:-1] 

57 

58 return series 

59 

60 

def strip_multi_index(series: pd.Series) -> pd.Series:
    """Replace a two-level index by its second level, converted to float.

    The index of the given series is replaced in place and the same series is
    returned.

    Args:
        series: a series whose index is either a MultiIndex or consists of
            the string representations of tuples, e.g. ``"(0, 900)"``.

    Returns:
        The given series, indexed by the float values of the second level.

    Raises:
        ValueError, SyntaxError: if a string label is not a valid Python
            literal.

    """
    import ast  # local import, only needed for string-encoded indices

    # Convert the index (communicated as string) into a MultiIndex
    if isinstance(series.index[0], str):
        # ast.literal_eval only accepts Python literals, so unlike eval() a
        # crafted label cannot execute arbitrary code.
        series.index = pd.MultiIndex.from_tuples(
            [ast.literal_eval(label) for label in series.index]
        )
    # vals is multicolumn so get rid of first value (start time of predictions)
    series.index = series.index.get_level_values(1).astype(float)
    return series

69 

70 

def convert_timescale_of_index(df: pd.DataFrame, from_unit: TimeConversionTypes, to_unit: TimeConversionTypes) -> pd.DataFrame:
    """Convert the timescale of a dataframe index from one time unit to another.

    The index is multiplied by ``TIME_CONVERSION[from_unit] / TIME_CONVERSION[to_unit]``.
    For a MultiIndex, every level is converted. The index of the given
    dataframe is replaced in place and the same dataframe is returned.

    Args:
        df: the dataframe whose index is converted
        from_unit: the time unit of the original index
        to_unit: the time unit to convert the index to

    Returns:
        A DataFrame with the converted index

    """
    time_conversion_factor = TIME_CONVERSION[from_unit] / TIME_CONVERSION[to_unit]
    if isinstance(df.index, pd.MultiIndex):
        converted_levels = [
            df.index.get_level_values(level) * time_conversion_factor
            for level in range(df.index.nlevels)
        ]
        df.index = pd.MultiIndex.from_arrays(converted_levels)
    else:
        df.index = df.index * time_conversion_factor
    return df