Coverage for addmo/s3_model_tuning/scoring/validation_splitting/custom_splitters.py: 21%
43 statements
« prev ^ index » next coverage.py v7.4.4, created at 2025-08-31 13:05 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2025-08-31 13:05 +0000
1from addmo.s3_model_tuning.scoring.validation_splitting.abstract_splitter import (
2 AbstractSplitter,
3)
5"""Custom splitters that work with scikit-learn. Please see the documentation of the
6AbstractSplitter class for more information."""
class TrialCustomSplitter(AbstractSplitter):
    """Demonstration splitter for scikit-learn style cross-validation.

    Two test folds are generated:

    - Fold 1: the first 10 and the last 10 rows of the dataset.
    - Fold 2: the first 100 rows, or every row when the dataset holds fewer
      than 100 rows.

    Only the *test* indices are produced here. The rows not listed in a fold
    are expected to form the matching train set; that part is handled by the
    base class and is not visible in this module.

    This splitter is only for demonstration purposes and should not be used
    in production.
    """

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splitting iterations in the cross-validator.

        All arguments are ignored; they exist for signature compatibility.
        """
        return 2

    def _iter_test_indices(self, X, y=None, groups=None):
        """Generate the integer test indices of each fold.

        Args:
            X: Any sized container of samples; only ``len(X)`` is used.
            y: Ignored.
            groups: Ignored.

        Yields:
            list[int]: Positional test indices, one list per fold.

        Raises:
            ValueError: If ``X`` holds fewer than 20 samples.
        """
        n_samples = len(X)
        if n_samples < 20:
            raise ValueError(
                "The dataset must have at least 20 samples for this custom splitter."
            )

        # Fold 1: both ends of the dataset.
        yield [*range(10), *range(n_samples - 10, n_samples)]

        # Fold 2: the leading rows. The bound is clamped to the dataset size
        # because the guard above only guarantees 20 samples; an unclamped
        # range(100) would yield indices that do not exist in X.
        yield list(range(min(100, n_samples)))

        # Each yielded fold results in one cross-validation score computed on
        # the yielded test indices. Yielding a single fold therefore gives a
        # single score; yielding folds from a loop gives one score per
        # iteration.
class UnivariateSplitter(AbstractSplitter):
    """Split a dataset along one feature dimension at a time.

    One split is created per feature (column) of the dataset. For each
    feature the rows are sorted by that feature's values, and the test set is
    composed of the data points taken from the top, bottom and middle sections
    of that ordering, sized by the configured ratios.
    """

    def __init__(
        self, top_split_ratio=0.1, bottom_split_ratio=0.1, middle_split_ratio=0.1
    ):
        """Store the section ratios after validating them.

        Args:
            top_split_ratio: Fraction of rows taken from the highest values.
                A falsy value (``None`` or ``0``) disables the section.
            bottom_split_ratio: Fraction of rows taken from the lowest values.
                A falsy value disables the section.
            middle_split_ratio: Fraction of rows taken around the centre of
                the sorted order. A falsy value disables the section.

        Raises:
            ValueError: If any ratio is negative or the ratios sum to more
                than 1.
        """
        # Falsy ratios (None / 0) mean "section disabled" and are skipped.
        active_ratios = [
            ratio
            for ratio in (top_split_ratio, bottom_split_ratio, middle_split_ratio)
            if ratio
        ]
        # A negative ratio would turn into a negative row count and silently
        # select the wrong slice, so reject it up front.
        if any(ratio < 0 for ratio in active_ratios):
            raise ValueError("Split ratios must not be negative.")
        if sum(active_ratios) > 1:
            raise ValueError("The sum of all split ratios must not exceed 1.")

        self.top_split_ratio = top_split_ratio
        self.bottom_split_ratio = bottom_split_ratio
        self.middle_split_ratio = middle_split_ratio

    def get_n_splits(self, X, y=None, groups=None):
        """Return the number of splits, i.e. the number of columns of ``X``.

        Raises:
            ValueError: If ``X`` is ``None``.
        """
        if X is None:
            raise ValueError("The 'X' parameter should not be None.")
        return X.shape[1]

    def _iter_test_indices(self, X, y=None, groups=None):
        """Generate the positional test indices for each feature.

        Args:
            X: Two-dimensional data exposing ``.iloc`` and ``.shape`` (the
                code relies on the pandas DataFrame interface).
            y: Ignored.
            groups: Ignored.

        Yields:
            list[int]: Sorted, de-duplicated row positions of the test set,
            one list per feature.
        """
        n_samples, n_features = X.shape

        for feature_index in range(n_features):
            # Re-index the column positionally before sorting so that the
            # sorted index directly holds row positions. This stays correct
            # even when X carries duplicate index labels.
            column = X.iloc[:, feature_index].reset_index(drop=True)
            sorted_positions = column.sort_values(ascending=True).index

            selected = set()  # a set removes overlap between the sections

            if self.top_split_ratio:
                n_top = int(n_samples * self.top_split_ratio)
                # Slice from an explicit start: with n_top == 0 the form
                # `[-n_top:]` would be `[-0:]` and select every row.
                selected.update(sorted_positions[n_samples - n_top:])

            if self.bottom_split_ratio:
                n_bottom = int(n_samples * self.bottom_split_ratio)
                selected.update(sorted_positions[:n_bottom])

            if self.middle_split_ratio:
                n_middle = int(n_samples * self.middle_split_ratio)
                # Centre the middle window within the sorted order.
                start = (n_samples - n_middle) // 2
                selected.update(sorted_positions[start:start + n_middle])

            # Sorted output keeps the yielded fold deterministic.
            yield sorted(int(position) for position in selected)