Coverage for aixweather/transformation_functions/auxiliary.py: 87%

54 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2025-03-04 14:19 +0000

1""" 

2includes auxiliary functions for data handling and transformation 

3""" 

4import logging 

5import pandas as pd 

6import numpy as np 

7 

8logger = logging.getLogger(__name__) 

9 

10 

def force_data_variable_convention(
    df: pd.DataFrame, format_desired: dict
) -> pd.DataFrame:
    """
    Restrict a DataFrame to the desired variables and put them in the desired order.

    Columns not named in ``format_desired`` are dropped. Desired variables that
    are absent from ``df`` are added as new columns without data (NaN).

    Args:
        df (pd.DataFrame): The DataFrame containing the data to be formatted.
        format_desired (dict): The desired format; its keys are the variable
            names, and their order defines the column order of the result.

    Returns:
        pd.DataFrame: A new DataFrame with exactly the desired columns, in order.
    """
    ordered_names = list(format_desired)

    # Drop everything that is not part of the desired format first ...
    is_wanted = df.columns.isin(ordered_names)
    kept = df.loc[:, is_wanted]

    # ... then let reindex create the missing columns and fix the order.
    return kept.reindex(columns=ordered_names)

33 

34 

def rename_columns(df: pd.DataFrame, format_dict: dict) -> pd.DataFrame:
    """
    Rename DataFrame columns to the core names given in a format dictionary.

    Args:
        df (pd.DataFrame): The DataFrame whose columns need to be renamed.
        format_dict (dict): Maps current column names to an entry dictionary;
            the ``"core_name"`` item of each entry is the new column name.

    Returns:
        pd.DataFrame: A DataFrame with renamed columns. Columns that are not
        keys of ``format_dict`` keep their name.
    """
    old_to_core = {}
    for current_name, entry in format_dict.items():
        old_to_core[current_name] = entry["core_name"]

    renamed = df.rename(columns=old_to_core)
    return renamed

49 

50 

def fill_nan_from_format_dict(df: pd.DataFrame, format_data: dict) -> pd.DataFrame:
    """
    Fill NaN values in a DataFrame with the replacement values of a format.

    Args:
        df (pd.DataFrame): The DataFrame containing the data to be processed.
            Its columns are updated in place.
        format_data (dict): Maps column names to an entry dictionary whose
            ``"nan"`` item is the value to put in place of NaN. An entry with
            ``"nan": None`` leaves the column untouched.

    Returns:
        pd.DataFrame: The same DataFrame object, with NaN values filled.

    Raises:
        KeyError: If a key of ``format_data`` is not a column of ``df`` or an
            entry has no ``"nan"`` item.
    """
    nan_key = "nan"
    for key, value in format_data.items():
        nan = value[nan_key]
        if nan is not None:
            # Assign the filled column back instead of calling
            # ``df[key].fillna(..., inplace=True)``: the chained in-place form
            # works on a temporary selection and does not update ``df`` when
            # pandas Copy-on-Write is active.
            df[key] = df[key].fillna(nan)
    return df

69 

70 

def replace_dummy_with_nan(df: pd.DataFrame, format_dict: dict) -> pd.DataFrame:
    """
    Replace specific values, or value ranges, in the DataFrame with NaN based on the given
    format dictionary. Reason: sometimes, e.g. the DWD, specifies a missing value with a
    dummy value like e.g. 99, which makes it hard to see where missing values are and might
    affect the simulation.

    Args:
        df (pd.DataFrame): The DataFrame to be processed. Its columns are updated in place.
        format_dict (dict): Maps column names to an entry dictionary. The optional
            ``"nan"`` item of an entry holds one marker or a list of markers:

            * a float or int is an exact dummy value,
            * a dictionary describes value ranges with the operator as key and the
              threshold as value, e.g. ``{'<': 0}``; every item of the dictionary is
              applied, so ``{'<': 0, '>': 100}`` removes both tails,
            * ``None`` means "no dummy value" and is skipped.

            Entries without a ``"nan"`` item and keys that are not columns of ``df``
            are ignored.

    Returns:
        pd.DataFrame: The same DataFrame object, with specified values replaced by NaN.

    Raises:
        ValueError: If a range dictionary uses an operator other than
            ``<``, ``<=``, ``>``, ``>=`` or ``==``.
    """
    # Supported range operators and the element-wise Series comparison they stand for.
    comparisons = {
        "<": pd.Series.lt,
        "<=": pd.Series.le,
        ">": pd.Series.gt,
        ">=": pd.Series.ge,
        "==": pd.Series.eq,
    }

    for key, value in format_dict.items():
        if "nan" not in value or key not in df.columns:
            continue

        nan_values = value["nan"]
        if not isinstance(nan_values, list):
            nan_values = [nan_values]

        for nan_val in nan_values:
            if nan_val is None:
                # No dummy value defined; Series.replace(None, ...) would raise.
                continue

            if not isinstance(nan_val, dict):
                # replace specified dummy value with NaN
                df[key] = df[key].replace(nan_val, np.nan)
                continue

            # replace specified value range(s) with NaN
            for operator, threshold in nan_val.items():
                if operator not in comparisons:
                    raise ValueError(f"Unsupported operator: {operator}")
                hits = comparisons[operator](df[key], threshold)
                # mask() upcasts integer columns to float as needed, which a
                # positional ``df.loc[hits, key] = np.nan`` does not do cleanly.
                df[key] = df[key].mask(hits)

    return df

117 

118 

def evaluate_transformations(core_format: dict, other_format: dict):
    """
    Check another format against the core format and log required unit transformations.

    For every variable of ``other_format`` whose core name exists in
    ``core_format``, the units are compared and a debug message is logged when
    they differ. Variables with an empty core name are skipped.

    Args:
        core_format (dict): The core format; keys are core variable names and each
            value is a dictionary with a 'unit' item.
        other_format (dict): The format to be compared with the core format; each
            value is a dictionary with 'core_name' and 'unit' items.

    Raises:
        ValueError: If a non-empty core name in other_format is not a key of core_format.
    """
    logger.debug("Evaluate format.")

    for import_name, entry in other_format.items():
        core_name = entry["core_name"]

        if core_name not in core_format:
            if core_name:
                raise ValueError(
                    f"The core variable '{core_name}' of import variable"
                    f" {import_name} does not fit the core variable format"
                )
            # empty core name: nothing to check for this variable
            continue

        # compare units
        import_unit = entry["unit"]
        core_unit = core_format[core_name]["unit"]
        if import_unit != core_unit:
            logger.debug(
                "Unit transformation required for %s from %s to %s.",
                core_name, import_unit, core_unit
            )

150 

151 

def select_entry_by_core_name(format_dict: dict, core_name_to_match: str):
    """
    Return the first entry of a format dictionary that has the given core name.

    Args:
        format_dict (dict): The format dictionary to search; each value is a
            dictionary with a "core_name" item.
        core_name_to_match (str): The core name to look for.

    Returns:
        dict: The first entry (in dictionary order) whose core name matches,
        or None if no entry matches.
    """
    matching_entries = (
        entry
        for entry in format_dict.values()
        if entry["core_name"] == core_name_to_match
    )
    return next(matching_entries, None)