Coverage for aixweather/transformation_functions/auxiliary.py: 98%

41 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2025-01-06 16:01 +0000

1""" 

2includes auxiliary functions for data handling and transformation 

3""" 

4import logging 

5import pandas as pd 

6import numpy as np 

7 

8logger = logging.getLogger(__name__) 

9 

10 

def force_data_variable_convention(
    df: pd.DataFrame, format_desired: dict
) -> pd.DataFrame:
    """
    Restrict a DataFrame to the variables named in ``format_desired``.

    Columns that are not keys of ``format_desired`` are dropped, keys that
    have no column in ``df`` are added as new columns (left empty by the
    reindex), and the resulting column order follows the key order of
    ``format_desired``.

    Args:
        df (pd.DataFrame): The data to be brought into the desired layout.
        format_desired (dict): Format definition whose keys are the wanted
            variable names.

    Returns:
        pd.DataFrame: A new DataFrame with exactly the desired columns, in order.
    """
    wanted_columns = list(format_desired)

    # Drop every column that is not part of the desired format.
    is_wanted = df.columns.isin(wanted_columns)
    filtered = df.loc[:, is_wanted]

    # Reindexing adds the missing columns and enforces the column order.
    return filtered.reindex(columns=wanted_columns)

33 

34 

def rename_columns(df: pd.DataFrame, format_dict: dict) -> pd.DataFrame:
    """
    Rename DataFrame columns to the ``core_name`` given in ``format_dict``.

    Args:
        df (pd.DataFrame): The DataFrame whose columns are to be renamed.
        format_dict (dict): Maps current column names to a dictionary that
            holds the target name under the key ``"core_name"``.

    Returns:
        pd.DataFrame: A DataFrame with the renamed columns.
    """
    old_to_new = {}
    for current_name, spec in format_dict.items():
        old_to_new[current_name] = spec["core_name"]
    return df.rename(columns=old_to_new)

49 

50 

def fill_nan_from_format_dict(df: pd.DataFrame, format_data: dict) -> pd.DataFrame:
    """
    Fill NaN values column by column with the replacement given in ``format_data``.

    The passed DataFrame is modified and also returned.

    Args:
        df (pd.DataFrame): The DataFrame containing the data to be processed.
        format_data (dict): Maps column names to a dictionary that holds the
            NaN replacement value under the key ``"nan"``. A replacement of
            ``None`` leaves the column untouched.

    Returns:
        pd.DataFrame: The same DataFrame with NaN values filled as per the
            format data.

    Raises:
        KeyError: If an entry of ``format_data`` has no ``"nan"`` key, or if a
            key with a non-None replacement is not a column of ``df``.
    """
    nan_key = "nan"
    for key, value in format_data.items():
        nan = value[nan_key]
        if nan is not None:
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the selected column: that call operates
            # on an intermediate object and does not update ``df`` when pandas
            # Copy-on-Write is active.
            df[key] = df[key].fillna(nan)
    return df

69 

70 

def replace_dummy_with_nan(df: pd.DataFrame, format_dict: dict) -> pd.DataFrame:
    """
    Replace dummy values in the DataFrame with NaN based on the given format dictionary.

    Reason: some sources, e.g. the DWD, mark a missing value with a dummy value
    such as 99, which makes it hard to see where values are missing and might
    affect the simulation.

    The passed DataFrame is modified and also returned.

    Args:
        df (pd.DataFrame): The DataFrame to be processed.
        format_dict (dict): Maps column names to a dictionary that may hold the
            dummy value(s) under the key ``"nan"``, either as a single value or
            as a list of values. Entries without a ``"nan"`` key, entries whose
            column is not in ``df``, and ``None`` dummy values are skipped.

    Returns:
        pd.DataFrame: The same DataFrame with the dummy values replaced by NaN.
    """
    for key, value in format_dict.items():
        if "nan" not in value or key not in df.columns:
            continue

        nan_values = value["nan"]
        if not isinstance(nan_values, list):
            nan_values = [nan_values]

        for nan_val in nan_values:
            if nan_val is None:
                # None stands for "no dummy value defined". Passing it to
                # Series.replace as the value to search for is rejected by
                # pandas rather than treated as a no-op, so skip it.
                continue
            df[key] = df[key].replace(nan_val, np.nan)
    return df

94 

95 

def evaluate_transformations(core_format: dict, other_format: dict):
    """
    Check another format against the core format and log required unit transformations.

    For every entry of ``other_format`` whose ``core_name`` is a key of
    ``core_format``, the units are compared and a debug message is logged when
    they differ. Entries with an empty (falsy) ``core_name`` are ignored.

    Args:
        core_format (dict): The core format, with variable names as keys and
            values containing a 'unit' entry.
        other_format (dict): The format to compare against the core format.
            Its values contain 'core_name' and 'unit' entries.

    Raises:
        ValueError: If an entry of other_format names a non-empty core variable
            that is not part of core_format.
    """
    logger.debug("Evaluate format.")
    for import_name, spec in other_format.items():
        core_name = spec["core_name"]

        if core_name not in core_format:
            if core_name:
                raise ValueError(
                    f"The core variable '{core_name}' of import variable"
                    f" {import_name} does not fit the core variable format"
                )
            # Empty core name: no core variable is assigned, nothing to check.
            continue

        # compare units
        import_unit = spec["unit"]
        core_unit = core_format[core_name]["unit"]
        if import_unit != core_unit:
            logger.debug(
                "Unit transformation required for %s from %s to %s.",
                core_name, import_unit, core_unit,
            )

127 

128 

def select_entry_by_core_name(format_dict: dict, core_name_to_match: str):
    """
    Look up the first entry of a format dictionary with the given core name.

    Args:
        format_dict (dict): The dictionary whose values are searched.
        core_name_to_match (str): The core name that the entry's
            ``"core_name"`` value has to equal.

    Returns:
        dict: The first matching dictionary entry, or None if there is no match.
    """
    matches = (
        entry
        for entry in format_dict.values()
        if entry["core_name"] == core_name_to_match
    )
    # The first hit in dictionary order wins; None when nothing matches.
    return next(matches, None)

142 return value