Coverage for addmo/s3_model_tuning/scoring/validation_splitting/custom_splitters.py

1from addmo.s3_model_tuning.scoring.validation_splitting.abstract_splitter import (

2 AbstractSplitter,

5"""Creating custom splitter that work with scikit-learn. Please see the documentation of the

6AbstractSplitter class for more information."""

class TrialCustomSplitter(AbstractSplitter):
    """Custom splitter for scikit-learn cross-validation.

    This splitter creates two folds:
    - The first fold includes the first and last 10 rows of the dataset.
    - The second fold includes the first 100 rows, or every row when the dataset
      has fewer than 100 rows.

    Each yielded fold is the test set of one cross-validation iteration. How the
    matching training rows are derived is decided by ``AbstractSplitter``
    (presumably the complement of the test rows -- confirm in that class).

    This splitter is only for demonstration purposes and should not be used in production.
    """

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splitting iterations in the cross-validator.

        All arguments are ignored; this splitter always produces two folds.
        """
        return 2

    def _iter_test_indices(self, X, y=None, groups=None):
        """Generate integer indices for test set for each fold.

        Args:
            X: Dataset to split. Only ``len(X)`` is used.
            y: Ignored.
            groups: Ignored.

        Yields:
            list[int]: Positional row indices forming the test set of one fold.

        Raises:
            ValueError: If ``X`` has fewer than 20 rows.
        """
        n_samples = len(X)
        if n_samples < 20:
            raise ValueError(
                "The dataset must have at least 20 samples for this custom splitter."
            )

        # Fold 1: the first 10 and the last 10 rows.
        yield [*range(10), *range(n_samples - 10, n_samples)]

        # Fold 2: the first 100 rows. The upper bound is clamped to the dataset
        # length so that datasets with 20..99 rows never receive indices that
        # point past the last row.
        yield list(range(min(100, n_samples)))

        # Every yielded fold produces exactly one cross-validation score, computed
        # on the yielded test indices. Yielding a single fold would therefore give
        # a single score; yielding folds from a loop gives one score per iteration.

class UnivariateSplitter(AbstractSplitter):
    """Split a dataset along one feature at a time using predefined ratios.

    This class inherits from `AbstractSplitter`. It creates one split for each
    feature (column) in the dataset. For every feature the rows are sorted by that
    feature's values, and the test set is composed of data points taken from the
    top (largest values), bottom (smallest values), and middle sections of the
    sorted order.

    Args:
        top_split_ratio: Fraction of rows taken from the largest values. A falsy
            value (``None`` or ``0``) disables this section.
        bottom_split_ratio: Fraction of rows taken from the smallest values. A
            falsy value disables this section.
        middle_split_ratio: Fraction of rows taken from the centre of the sorted
            order. A falsy value disables this section.

    Raises:
        ValueError: If the enabled ratios sum to more than 1, or if any enabled
            ratio is negative.
    """

    def __init__(
        self, top_split_ratio=0.1, bottom_split_ratio=0.1, middle_split_ratio=0.1
    ):
        # Falsy ratios (None / 0) mean "section disabled" and are left out of the checks.
        enabled_ratios = [
            ratio
            for ratio in (top_split_ratio, bottom_split_ratio, middle_split_ratio)
            if ratio
        ]

        # Ensure the sum of provided ratios does not exceed 1
        if sum(enabled_ratios) > 1:
            raise ValueError("The sum of all split ratios must not exceed 1.")
        # A negative ratio would turn into a negative row count and silently
        # select the wrong end of the sorted data, so reject it up front.
        if any(ratio < 0 for ratio in enabled_ratios):
            raise ValueError("Split ratios must not be negative.")

        self.top_split_ratio = top_split_ratio
        self.bottom_split_ratio = bottom_split_ratio
        self.middle_split_ratio = middle_split_ratio

    def get_n_splits(self, X, y=None, groups=None):
        """Return the number of folds, which equals the number of columns of ``X``.

        Raises:
            ValueError: If ``X`` is ``None``.
        """
        if X is None:
            raise ValueError("The 'X' parameter should not be None.")
        return X.shape[1]

    def _iter_test_indices(self, X, y=None, groups=None):
        """Yield the positional test indices of one fold per feature.

        Args:
            X: Dataset to split. Must provide ``.shape`` and ``.iloc`` (for
                example a pandas DataFrame).
            y: Ignored.
            groups: Ignored.

        Yields:
            list[int]: Sorted, duplicate-free row positions of the test set for
            the current feature.
        """
        n_samples = X.shape[0]

        for feature_index in range(X.shape[1]):
            # Sort once for each feature. The row labels are dropped first so the
            # resulting index holds row *positions*; this keeps the result correct
            # when the index of X contains duplicate labels and avoids one label
            # lookup per selected row.
            column = X.iloc[:, feature_index].reset_index(drop=True)
            sorted_positions = column.sort_values(ascending=True).index

            # A set removes duplicates in case the sections overlap.
            test_positions = set()

            if self.top_split_ratio:
                num_top_tests = int(n_samples * self.top_split_ratio)
                # Guard against a zero count: a slice starting at -0 would select
                # every row instead of none.
                if num_top_tests > 0:
                    test_positions.update(sorted_positions[-num_top_tests:])

            if self.bottom_split_ratio:
                num_bottom_tests = int(n_samples * self.bottom_split_ratio)
                test_positions.update(sorted_positions[:num_bottom_tests])

            if self.middle_split_ratio:
                total_middle_tests = int(n_samples * self.middle_split_ratio)
                # Centre the middle window within the sorted order.
                start_index = (n_samples - total_middle_tests) // 2
                end_index = start_index + total_middle_tests
                test_positions.update(sorted_positions[start_index:end_index])

            yield sorted(int(position) for position in test_positions)

Coverage for addmo/s3_model_tuning/scoring/validation_splitting/custom_splitters.py: 21%

43 statements