Why is target encoder encoding some values as NaN?
I am using a target encoder from category_encoders to encode a feature; here is the code I'm using:
from category_encoders import TargetEncoder
def encode_large_features(features, X_train, X_test, y_train):
    """Target-encode each column in ``features`` on the train and test frames.

    For every feature a fresh ``TargetEncoder`` is fitted on the training
    column together with ``y_train``. The encoded values are written to a new
    ``<feature>_encoded`` column and the raw column is dropped. Both frames
    are modified in place and are also returned.

    Args:
        features: Iterable of column names present in both frames.
        X_train: Training DataFrame.
        X_test: Test DataFrame.
        y_train: Target values used to fit each encoder.
            NOTE(review): presumably row-aligned with ``X_train`` (same
            index); a mismatch is a common source of NaN encodings -- verify
            against the caller.

    Returns:
        The tuple ``(X_train, X_test)``.
    """
    print('target encoding features ...')
    for feature in features:
        # The first positional parameter of TargetEncoder is ``verbose``, so
        # the column to encode must be passed explicitly through ``cols``.
        target_encoder = TargetEncoder(cols=[feature])
        target_encoder.fit(X_train[[feature]], y_train)

        encoded_name = feature + '_encoded'
        X_train[encoded_name] = target_encoder.transform(X_train[[feature]])[feature]
        X_train.drop([feature], axis=1, inplace=True)
        X_test[encoded_name] = target_encoder.transform(X_test[[feature]])[feature]
        X_test.drop([feature], axis=1, inplace=True)
    return X_train, X_test
The target encoder encodes some values as NaN and I don't know why. Here is an example:
Solution 1:[1]
I faced the same issue and raised an issue on the repo.
I found a workaround by building a custom K-fold target encoder, which is better than the library version: a K-fold target encoder is less susceptible to data leakage and less likely to overfit.
Unlike the category_encoders library, this will not return NaN in the training dataset.
In the example below, chid is a categorical column and KFoldTargetEncoder is applied to it.
Libraries required:
import numpy as np
from sklearn import base
from sklearn.model_selection import KFold
from tqdm import tqdm
Training Dataset:
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    """Out-of-fold target (mean) encoder for one categorical training column.

    Each row is encoded with the mean of the target computed on the *other*
    folds only, so a row's own target value never contributes to its
    encoding. Categories that do not occur in the other folds fall back to
    the overall target mean.

    Args:
        colnames: Name of the single categorical column to encode (a string).
        targetName: Name of the target column inside the same DataFrame.
        n_fold: Number of K-fold splits.
        verbosity: When true, print the correlation between the encoded
            feature and the target.
        discardOriginal_col: When true, a column is dropped from the returned
            frame (see the note in ``transform``).
    """

    def __init__(self, colnames, targetName, n_fold=5, verbosity=True,
                 discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        """Do nothing and return ``self``; all work happens in ``transform``."""
        return self

    def transform(self, X):
        """Add the ``<colnames>_Kfold_Target_Enc`` column to ``X``.

        ``X`` is modified in place (the new column is added to it) and is
        also returned.

        Raises:
            TypeError: If ``colnames`` or ``targetName`` is not a string.
            ValueError: If either column is missing from ``X``.
        """
        # Explicit checks instead of ``assert`` so they survive ``python -O``.
        if not isinstance(self.targetName, str):
            raise TypeError('targetName must be a string')
        if not isinstance(self.colnames, str):
            raise TypeError('colnames must be a string')
        if self.colnames not in X.columns:
            raise ValueError('column {!r} not found in X'.format(self.colnames))
        if self.targetName not in X.columns:
            raise ValueError('column {!r} not found in X'.format(self.targetName))

        mean_of_target = X[self.targetName].mean()
        # Unshuffled splits are already deterministic; scikit-learn rejects a
        # random_state combined with shuffle=False, so none is passed.
        kf = KFold(n_splits=self.n_fold, shuffle=False)
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'

        # Collect the encoding positionally so a non-unique index on X cannot
        # cause rows to be written to the wrong place.
        encoded_feature = np.full(len(X), np.nan, dtype=float)
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            fold_means = X_tr.groupby(self.colnames)[self.targetName].mean()
            encoded_feature[val_ind] = (
                X_val[self.colnames].map(fold_means).to_numpy(dtype=float)
            )

        # Categories absent from the other folds have no fold mean; use the
        # overall target mean for them.
        encoded_feature = np.where(
            np.isnan(encoded_feature), mean_of_target, encoded_feature
        )
        X[col_mean_name] = encoded_feature

        if self.verbosity:
            print('Correlation between the new feature, {} and, {} is {}.'.format(
                col_mean_name,
                self.targetName,
                np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            # NOTE(review): despite the flag's name this drops the *target*
            # column, not ``colnames``. Kept as-is because the companion test
            # encoder needs ``colnames`` to remain in the training frame.
            X = X.drop(self.targetName, axis=1)
        return X
Fit_Transform on Training Data:
# Encode the categorical column 'chid' against the 'target' column using 5 folds.
targetc_chid = KFoldTargetEncoderTrain('chid','target',n_fold=5)
# fit() is a no-op on this encoder, so the work is done by transform(), which
# returns the frame with the 'chid_Kfold_Target_Enc' column added.
train_df = targetc_chid.fit_transform(train_df)
Test Dataset:
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """Apply an encoding learned on the training frame to unseen data.

    Every category is encoded with the mean of the already-encoded training
    column for that category. Categories that never appear in the training
    frame are encoded with the overall mean of the encoded training column
    instead of being left as NaN.

    Args:
        train: Training DataFrame that already contains ``encodedName``.
        colNames: Name of the categorical column to encode.
        encodedName: Name of the encoded column, read from ``train`` and
            written to the transformed frame.
    """

    def __init__(self, train, colNames, encodedName):
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        """Do nothing and return ``self``; all work happens in ``transform``."""
        return self

    def transform(self, X):
        """Add the ``encodedName`` column to ``X`` in place and return ``X``."""
        # Per-category mean of the encoded training values; a Series keyed by
        # category can be handed straight to ``Series.map``.
        category_means = self.train.groupby(self.colNames)[self.encodedName].mean()
        # Fallback for categories that were never seen in the training frame.
        overall_mean = self.train[self.encodedName].mean()

        encoded = X[self.colNames].map(category_means).astype(float)
        X[self.encodedName] = encoded.fillna(overall_mean)
        return X
Fit on Test Data:
# The encoded training frame supplies the per-'chid' means; the third argument
# is the encoded column name produced by the training encoder.
test_targetc_chid = KFoldTargetEncoderTest(train_df,'chid','chid_Kfold_Target_Enc')
# Adds 'chid_Kfold_Target_Enc' to the validation frame.
valid_df = test_targetc_chid.fit_transform(valid_df)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Shaurya Uppal |

