Coverage for addmo/s3_model_tuning/scoring/validation_splitting/custom_splitters.py: 21%

43 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2025-08-31 13:05 +0000

1from addmo.s3_model_tuning.scoring.validation_splitting.abstract_splitter import ( 

2 AbstractSplitter, 

3) 

4 

5"""Creating custom splitter that work with scikit-learn. Please see the documentation of the  

6AbstractSplitter class for more information.""" 

7 

8 

class TrialCustomSplitter(AbstractSplitter):
    """Custom splitter for scikit-learn cross-validation.

    This splitter creates two folds:
    - The first fold includes the first and last 10 rows of the dataset.
    - The second fold includes the first 100 rows.

    Both folds are used once as test set and once as train set (due to cross-validation).

    This splitter is only for demonstration purposes and should not be used in production.
    """

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splitting iterations in the cross-validator."""
        return 2

    def _iter_test_indices(self, X, y=None, groups=None):
        """Generate integer indices for the test set of each fold.

        Raises:
            ValueError: If the dataset has fewer than 100 samples. The second
                fold always uses rows 0..99, so anything smaller would yield
                out-of-range indices.
        """
        n_samples = len(X)
        # Bug fix: the previous minimum of 20 samples was too small — the
        # second fold below unconditionally yields indices 0..99, so datasets
        # with 20-99 rows produced invalid (out-of-range) test indices.
        if n_samples < 100:
            raise ValueError(
                "The dataset must have at least 100 samples for this custom splitter."
            )

        # Fold 1: the first 10 and the last 10 rows.
        first_fold_indices = list(range(10)) + list(range(n_samples - 10, n_samples))
        yield first_fold_indices

        # Fold 2: the first 100 rows.
        second_fold_indices = list(range(100))
        yield second_fold_indices
        # If you only yield one fold the cross-validation will only produce one
        # score on the yielded test indices. For each yielded fold, e.g. through
        # a for loop, the cross-validation will produce one score on the yielded
        # test indices.

42 

class UnivariateSplitter(AbstractSplitter):
    """
    This class inherits from `AbstractSplitter` and is designed to split datasets along a single
    feature dimension based on predefined ratios. Creates one split for each feature in the
    dataset, where each test set is composed of system_data points from the top, bottom, and middle
    sections of the sorted feature values.
    """

    def __init__(
        self, top_split_ratio=0.1, bottom_split_ratio=0.1, middle_split_ratio=0.1
    ):
        """Store the section ratios after validating their combined size.

        Args:
            top_split_ratio: Fraction of rows with the highest feature values to
                put in each test set. None (or 0) disables this section.
            bottom_split_ratio: Fraction of rows with the lowest feature values
                to put in each test set. None (or 0) disables this section.
            middle_split_ratio: Fraction of rows around the middle of the sorted
                feature values to put in each test set. None (or 0) disables
                this section.

        Raises:
            ValueError: If the sum of the provided (non-None) ratios exceeds 1.
        """
        # Ensure the sum of provided ratios does not exceed 1; filter(None, ...)
        # drops both None and 0 entries before summing.
        total_ratio = sum(
            filter(None, [top_split_ratio, bottom_split_ratio, middle_split_ratio])
        )
        if total_ratio > 1:
            raise ValueError("The sum of all split ratios must not exceed 1.")

        self.top_split_ratio = top_split_ratio
        self.bottom_split_ratio = bottom_split_ratio
        self.middle_split_ratio = middle_split_ratio

    def get_n_splits(self, X, y=None, groups=None):
        """Return the number of splits: one fold per feature column of X."""
        if X is None:
            raise ValueError("The 'X' parameter should not be None.")
        return X.shape[1]

    def _iter_test_indices(self, X, y=None, groups=None):
        """Yield positional test indices, one fold per feature column.

        Assumes X is a pandas DataFrame (uses .iloc / .index). NOTE(review):
        the row index is presumably unique — with duplicate labels,
        `Index.get_loc` may not return a single integer position; confirm
        against callers.
        """
        n_rows = X.shape[0]
        for feature_index in range(X.shape[1]):
            # Sort once for each feature; keep the resulting label order.
            sorted_indices = X.iloc[:, feature_index].sort_values(ascending=True).index

            test_indices = []
            if self.top_split_ratio:
                num_top_tests = int(n_rows * self.top_split_ratio)
                # Bug fix: when the ratio truncates to 0 rows, the slice
                # sorted_indices[-0:] would select ALL rows instead of none.
                if num_top_tests > 0:
                    test_indices.extend(sorted_indices[-num_top_tests:])

            if self.bottom_split_ratio:
                num_bottom_tests = int(n_rows * self.bottom_split_ratio)
                test_indices.extend(sorted_indices[:num_bottom_tests])

            if self.middle_split_ratio:
                total_middle_tests = int(n_rows * self.middle_split_ratio)
                # Center the middle section within the sorted order.
                start_index = (n_rows - total_middle_tests) // 2
                end_index = start_index + total_middle_tests
                test_indices.extend(sorted_indices[start_index:end_index])

            # Ensure unique indices in case the sections overlap; sort so the
            # yielded order is deterministic (a bare set has arbitrary order).
            unique_test_indices = sorted(set(test_indices))

            # Translate index labels into integer positions, as expected by
            # scikit-learn's cross-validation machinery.
            test_indices_positions = [
                X.index.get_loc(idx) for idx in unique_test_indices
            ]
            yield test_indices_positions
96 yield test_indices_positions