Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7d23f87
added test case `def test_raises_non_fitted_error_when_error_during_f…
direkkakkar319-ops Mar 21, 2026
e1439cb
fixing test case test_style
direkkakkar319-ops Mar 21, 2026
f33a85a
fixing test case test_style
direkkakkar319-ops Mar 21, 2026
7a0e8f5
fixing test case test_style
direkkakkar319-ops Mar 21, 2026
59c8bd1
fix: defer trailing underscore attribute assignment in fit() for impu…
direkkakkar319-ops Mar 22, 2026
fef5a3f
base transformers
direkkakkar319-ops Mar 26, 2026
2d3f734
discretisation
direkkakkar319-ops Mar 26, 2026
f95cb7f
scaling
direkkakkar319-ops Mar 26, 2026
9b2fa4c
creation
direkkakkar319-ops Mar 26, 2026
64c9e74
imputation
direkkakkar319-ops Mar 26, 2026
50075fb
transformation
direkkakkar319-ops Mar 26, 2026
f2e944f
tests
direkkakkar319-ops Mar 26, 2026
ca9241e
creation
direkkakkar319-ops Mar 26, 2026
d813e2d
verified changes for checks
direkkakkar319-ops Mar 26, 2026
519ca1d
verified changes for checks
direkkakkar319-ops Mar 26, 2026
44f75c0
value error
direkkakkar319-ops Mar 26, 2026
7b3c592
ADDED:`test_raises_non_fitted_error_when_error_during_fit`
direkkakkar319-ops Mar 27, 2026
6babea2
added:`test_raises_non_fitted_error_when_error_during_fit`
direkkakkar319-ops Mar 27, 2026
c945dee
addEd:`test_raises_non_fitted_error_when_error_during_fit`
direkkakkar319-ops Mar 27, 2026
b631f63
transformers
direkkakkar319-ops Mar 27, 2026
0c681f5
left
direkkakkar319-ops Mar 27, 2026
6898a3d
Updated the `DecisionTreeFeatures` and `GeoDistanceFeatures`
direkkakkar319-ops Mar 28, 2026
b807ef4
fixed `geo_features.py`
direkkakkar319-ops Mar 28, 2026
3de9968
Improved failure triggers
direkkakkar319-ops Mar 28, 2026
9a5973d
Improved failure triggers
direkkakkar319-ops Mar 28, 2026
aa75e97
fixed
direkkakkar319-ops Mar 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 22 additions & 11 deletions feature_engine/_base_transformers/base_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,26 @@ class BaseNumericalTransformer(
variable transformers, discretisers, math combination.
"""

def _fit_setup(self, X: pd.DataFrame):
    """
    Validate the input dataframe and resolve the numerical variables.

    Checks that ``X`` is a dataframe, finds (or validates) the numerical
    variables to operate on, and verifies that those columns contain no
    NA or infinite values.

    Returns the validated dataframe together with the resolved list of
    numerical variable names.
    """
    # validate that the input is a proper dataframe
    X = check_X(X)

    # resolve which numerical columns to operate on
    found_vars = (
        find_numerical_variables(X)
        if self.variables is None
        else check_numerical_variables(X, self.variables)
    )

    # reject datasets with missing or infinite values in those columns
    _check_contains_na(X, found_vars)
    _check_contains_inf(X, found_vars)

    return X, found_vars

def fit(self, X: pd.DataFrame) -> pd.DataFrame:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can see that the logic that used to be reached via super().fit() is no longer used here; it has been moved into the transformers instead, which makes sense for the requested change.

So I think we should remove the fit method from base numerical altogether. That way we ensure it isn't used as legacy anywhere else in the source code.

"""
Checks that input is a dataframe, finds numerical variables, or alternatively
Expand Down Expand Up @@ -55,18 +75,9 @@ def fit(self, X: pd.DataFrame) -> pd.DataFrame:
The same dataframe entered as parameter
"""

# check input dataframe
X = check_X(X)

# find or check for numerical variables
if self.variables is None:
self.variables_ = find_numerical_variables(X)
else:
self.variables_ = check_numerical_variables(X, self.variables)
X, variables_ = self._fit_setup(X)

# check if dataset contains na or inf
_check_contains_na(X, self.variables_)
_check_contains_inf(X, self.variables_)
self.variables_ = variables_

# save input features
self.feature_names_in_ = X.columns.tolist()
Expand Down
12 changes: 8 additions & 4 deletions feature_engine/discretisation/equal_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,17 +159,21 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""

# check input dataframe
X = super().fit(X)
X, variables_ = self._fit_setup(X)

self.binner_dict_ = {}
binner_dict_ = {}

for var in self.variables_:
for var in variables_:
tmp, bins = pd.qcut(x=X[var], q=self.q, retbins=True, duplicates="drop")

# Prepend/Append infinities to accommodate outliers
bins = list(bins)
bins[0] = float("-inf")
bins[len(bins) - 1] = float("inf")
self.binner_dict_[var] = bins
binner_dict_[var] = bins

self.binner_dict_ = binner_dict_
self.variables_ = variables_
self.feature_names_in_ = X.columns.tolist()
self.n_features_in_ = X.shape[1]
return self
12 changes: 8 additions & 4 deletions feature_engine/discretisation/equal_width.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""

# check input dataframe
X = super().fit(X)
X, variables_ = self._fit_setup(X)

# fit
self.binner_dict_ = {}
binner_dict_ = {}

for var in self.variables_:
for var in variables_:
tmp, bins = pd.cut(
x=X[var],
bins=self.bins,
Expand All @@ -186,6 +186,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
bins = list(bins)
bins[0] = float("-inf")
bins[len(bins) - 1] = float("inf")
self.binner_dict_[var] = bins
binner_dict_[var] = bins

self.binner_dict_ = binner_dict_
self.variables_ = variables_
self.feature_names_in_ = X.columns.tolist()
self.n_features_in_ = X.shape[1]
return self
12 changes: 7 additions & 5 deletions feature_engine/imputation/arbitrary_number.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,17 +149,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
# find or check for numerical variables
# create the imputer dictionary
if self.imputer_dict:
self.variables_ = check_numerical_variables(
variables_ = check_numerical_variables(
X, list(self.imputer_dict.keys())
)
self.imputer_dict_ = self.imputer_dict
imputer_dict_ = self.imputer_dict
else:
if self.variables is None:
self.variables_ = find_numerical_variables(X)
variables_ = find_numerical_variables(X)
else:
self.variables_ = check_numerical_variables(X, self.variables)
self.imputer_dict_ = {var: self.arbitrary_number for var in self.variables_}
variables_ = check_numerical_variables(X, self.variables)
imputer_dict_ = {var: self.arbitrary_number for var in variables_}

self.variables_ = variables_
self.imputer_dict_ = imputer_dict_
self._get_feature_names_in(X)

return self
26 changes: 14 additions & 12 deletions feature_engine/imputation/end_tail.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,35 +177,37 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):

# find or check for numerical variables
if self.variables is None:
self.variables_ = find_numerical_variables(X)
variables_ = find_numerical_variables(X)
else:
self.variables_ = check_numerical_variables(X, self.variables)
variables_ = check_numerical_variables(X, self.variables)

# estimate imputation values
if self.imputation_method == "max":
self.imputer_dict_ = (X[self.variables_].max() * self.fold).to_dict()
imputer_dict_ = (X[variables_].max() * self.fold).to_dict()

elif self.imputation_method == "gaussian":
if self.tail == "right":
self.imputer_dict_ = (
X[self.variables_].mean() + self.fold * X[self.variables_].std()
imputer_dict_ = (
X[variables_].mean() + self.fold * X[variables_].std()
).to_dict()
elif self.tail == "left":
self.imputer_dict_ = (
X[self.variables_].mean() - self.fold * X[self.variables_].std()
imputer_dict_ = (
X[variables_].mean() - self.fold * X[variables_].std()
).to_dict()

elif self.imputation_method == "iqr":
IQR = X[self.variables_].quantile(0.75) - X[self.variables_].quantile(0.25)
IQR = X[variables_].quantile(0.75) - X[variables_].quantile(0.25)
if self.tail == "right":
self.imputer_dict_ = (
X[self.variables_].quantile(0.75) + (IQR * self.fold)
imputer_dict_ = (
X[variables_].quantile(0.75) + (IQR * self.fold)
).to_dict()
elif self.tail == "left":
self.imputer_dict_ = (
X[self.variables_].quantile(0.25) - (IQR * self.fold)
imputer_dict_ = (
X[variables_].quantile(0.25) - (IQR * self.fold)
).to_dict()

self.variables_ = variables_
self.imputer_dict_ = imputer_dict_
self._get_feature_names_in(X)

return self
10 changes: 6 additions & 4 deletions feature_engine/imputation/mean_median.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,17 +127,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):

# find or check for numerical variables
if self.variables is None:
self.variables_ = find_numerical_variables(X)
variables_ = find_numerical_variables(X)
else:
self.variables_ = check_numerical_variables(X, self.variables)
variables_ = check_numerical_variables(X, self.variables)

# find imputation parameters: mean or median
if self.imputation_method == "mean":
self.imputer_dict_ = X[self.variables_].mean().to_dict()
imputer_dict_ = X[variables_].mean().to_dict()

elif self.imputation_method == "median":
self.imputer_dict_ = X[self.variables_].median().to_dict()
imputer_dict_ = X[variables_].median().to_dict()

self.variables_ = variables_
self.imputer_dict_ = imputer_dict_
self._get_feature_names_in(X)

return self
28 changes: 28 additions & 0 deletions tests/test_imputation/test_check_estimator_imputers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from sklearn import clone
from sklearn.exceptions import NotFittedError

import pandas as pd
import pytest
import sklearn
Expand Down Expand Up @@ -69,3 +72,28 @@ def test_transformers_in_pipeline_with_set_output_pandas(transformer):
Xtp = pipe.fit_transform(X, y)

pd.testing.assert_frame_equal(Xtt, Xtp)


@pytest.mark.parametrize("estimator", _estimators)
def test_raises_non_fitted_error_when_error_during_fit(estimator):
    """
    A failed fit() must leave the estimator unfitted: transform() should
    raise NotFittedError afterwards, not operate on half-set state.
    """
    estimator = clone(estimator)
    name = estimator.__class__.__name__

    # Build an input that is guaranteed to make fit() fail for this imputer.
    numerical_imputers = (
        "MeanMedianImputer",
        "EndTailImputer",
        "ArbitraryNumberImputer",
    )
    if name in numerical_imputers:
        # numerical imputers reject a purely categorical dataframe
        X = pd.DataFrame({"cat1": ["a", "b", "c", "a", "b"]})
    elif name == "CategoricalImputer":
        # with ignore_format=False this imputer rejects numerical columns
        estimator.set_params(ignore_format=False)
        X = pd.DataFrame({"num1": [1.0, 2.0, 3.0, 4.0, 5.0]})
    else:
        # an empty dataframe breaks fit() for the remaining imputers
        X = pd.DataFrame()

    with pytest.raises((ValueError, TypeError)):
        estimator.fit(X)

    # the estimator must still be considered unfitted after the failure
    with pytest.raises(NotFittedError):
        estimator.transform(X)