Coverage for aixweather/transformation_functions/auxiliary.py: 87%
54 statements
« prev ^ index » next coverage.py v7.4.4, created at 2025-03-04 14:19 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2025-03-04 14:19 +0000
1"""
2includes auxiliary functions for data handling and transformation
3"""
4import logging
5import pandas as pd
6import numpy as np
8logger = logging.getLogger(__name__)
def force_data_variable_convention(
    df: pd.DataFrame, format_desired: dict
) -> pd.DataFrame:
    """
    Reduce a DataFrame to exactly the desired variables, in the desired order.

    Columns that are not named in ``format_desired`` are dropped. Desired
    variables that are missing in ``df`` are added as empty (NaN) columns.
    The input DataFrame itself is not modified.

    Args:
        df (pd.DataFrame): The DataFrame containing the data to be formatted.
        format_desired (dict): The desired format. Only its keys (the variable
            names) are used; their order defines the resulting column order.

    Returns:
        pd.DataFrame: A DataFrame holding all and only the desired variables,
            ordered like the keys of ``format_desired``.
    """
    ordered_names = list(format_desired)

    # keep only those existing columns that belong to the desired format
    is_desired = df.columns.isin(ordered_names)
    df_filtered = df.loc[:, is_desired]

    # reindex adds the missing variables and enforces the column order
    return df_filtered.reindex(columns=ordered_names)
def rename_columns(df: pd.DataFrame, format_dict: dict) -> pd.DataFrame:
    """
    Rename the columns of a DataFrame to the core names given in a format dictionary.

    Args:
        df (pd.DataFrame): The DataFrame whose columns need to be renamed.
        format_dict (dict): The format definition. Keys are the current column
            names; each value is a dictionary whose ``"core_name"`` entry is the
            new column name.

    Returns:
        pd.DataFrame: A DataFrame with renamed columns. Columns that do not
            appear in ``format_dict`` keep their name.
    """
    old_to_new = {}
    for current_name, spec in format_dict.items():
        old_to_new[current_name] = spec["core_name"]

    renamed = df.rename(columns=old_to_new)
    return renamed
def fill_nan_from_format_dict(df: pd.DataFrame, format_data: dict) -> pd.DataFrame:
    """
    Fill NaN values in a DataFrame based on the provided format data.

    Args:
        df (pd.DataFrame): The DataFrame containing the data to be processed.
            It is modified in place and also returned.
        format_data (dict): The format definition. Keys are column names; each
            value is a dictionary whose ``"nan"`` entry holds the replacement
            for NaN values in that column. A replacement of ``None`` leaves the
            column untouched.

    Returns:
        pd.DataFrame: The DataFrame with NaN values filled as per the format data.

    Raises:
        KeyError: If a key of ``format_data`` is not a column of ``df`` (and its
            replacement is not ``None``), or if an entry has no ``"nan"`` key.
    """
    nan_key = "nan"
    for key, value in format_data.items():
        nan = value[nan_key]
        if nan is not None:
            # Assign the filled column back instead of calling
            # ``df[key].fillna(..., inplace=True)``: the in-place call acts on
            # a temporary selection and does not reliably update ``df``
            # (it is a no-op under pandas Copy-on-Write).
            df[key] = df[key].fillna(nan)
    return df
def replace_dummy_with_nan(df: pd.DataFrame, format_dict: dict) -> pd.DataFrame:
    """
    Replace specific values, or value ranges, in the DataFrame with NaN based on the given format
    dictionary. Reason: sometimes, e.g. the DWD, specifies a missing value with a dummy value
    like e.g. 99, which makes it hard to see where missing values are and might affect the
    simulation.

    Args:
        df (pd.DataFrame): The DataFrame to be processed. It is modified in
            place and also returned.
        format_dict (dict): The format definition. Keys are column names; each
            value is a dictionary whose optional ``"nan"`` entry names the dummy
            value(s) of that column, either as a single item or as a list of items.
            Exact dummy values are given through a float or int.
            Value ranges are given through a dictionary with the operator as key
            and the threshold as value, e. g. {'<': 0}. A range dictionary may
            hold several operators, e. g. {'<': 0, '>': 100}; all of them are applied.
            Supported operators: '<', '<=', '>', '>=', '=='.

    Returns:
        pd.DataFrame: The DataFrame with specified values replaced by NaN.

    Raises:
        ValueError: If a range dictionary uses an unsupported operator.
    """
    # maps each supported operator to the comparison that yields the NaN mask
    comparisons = {
        "<": lambda series, limit: series < limit,
        "<=": lambda series, limit: series <= limit,
        ">": lambda series, limit: series > limit,
        ">=": lambda series, limit: series >= limit,
        "==": lambda series, limit: series == limit,
    }

    for key, value in format_dict.items():
        # entries without dummy definition or without data column are skipped
        if "nan" not in value or key not in df.columns:
            continue

        nan_values = value["nan"]
        if not isinstance(nan_values, list):
            nan_values = [nan_values]

        for nan_val in nan_values:
            # replace specified dummy values with NaN
            if not isinstance(nan_val, dict):
                df[key] = df[key].replace(nan_val, np.nan)
                continue

            # replace specified value range(s) with NaN; every operator of the
            # dictionary is honoured, not only the first one
            for operator, threshold in nan_val.items():
                if operator not in comparisons:
                    raise ValueError(f"Unsupported operator: {operator}")
                mask = comparisons[operator](df[key], threshold)
                df.loc[mask, key] = np.nan

    return df
def evaluate_transformations(core_format: dict, other_format: dict):
    """
    Check another format against the core format and log required unit transformations.

    For every variable of ``other_format`` that maps to a core variable, the
    units are compared and a debug message is logged if they differ. Variables
    with an empty core name are ignored.

    Args:
        core_format (dict): The core format, with core variable names as keys and
            dictionaries containing a ``"unit"`` entry as values.
        other_format (dict): The format to be compared with the core format. Its values
            are dictionaries with ``"core_name"`` and ``"unit"`` entries.

    Raises:
        ValueError: If a non-empty core name in other_format is not part of the core format.
    """
    logger.debug("Evaluate format.")

    for import_name, spec in other_format.items():
        core_name = spec["core_name"]

        if core_name not in core_format:
            # an empty core name marks a variable without core counterpart
            if core_name:
                raise ValueError(
                    f"The core variable '{core_name}' of import variable"
                    f" {import_name} does not fit the core variable format"
                )
            continue

        # compare units
        source_unit = spec["unit"]
        target_unit = core_format[core_name]["unit"]
        if source_unit != target_unit:
            logger.debug(
                "Unit transformation required for %s from %s to %s.",
                core_name, source_unit, target_unit
            )
def select_entry_by_core_name(format_dict: dict, core_name_to_match: str):
    """
    Look up the first entry of a format dictionary that has the given core name.

    Args:
        format_dict (dict): The format dictionary to search. Its values are
            dictionaries with a ``"core_name"`` entry.
        core_name_to_match (str): The core name to match in the dictionary values.

    Returns:
        dict: The first value of ``format_dict`` whose ``"core_name"`` equals
            ``core_name_to_match``, or None if not found.
    """
    matching_entries = (
        entry
        for entry in format_dict.values()
        if entry["core_name"] == core_name_to_match
    )
    # first match in dictionary order wins; None signals "not found"
    return next(matching_entries, None)