Get support and ranking attributes for RFE using Pipeline in Python 3

The code I have so far is below and it works perfectly. However, for each number of features tested I would like to print the RFE attributes "rfe.support_[i]" (True or False, indicating whether column i was selected), "rfe.ranking_[i]" (the column's ranking), and the names of the selected features themselves.

In other words, I would like to print exactly which columns each RFE run considered, rather than leaving them abstract.

# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Split a dataframe into features and target
def get_dataset(df, target):
    """Return (X, y): X is `df` without the target column, y a flat 1-D array of target values."""
    feature_frame = df.drop(columns = target)
    target_values = df[[target]].values.flatten()
    return feature_frame, target_values

# Build the candidate pipelines to evaluate
def get_models(list_num_cols, list_cat_cols):
    """Return a dict mapping model names to preprocessing + RFE + regressor pipelines.

    list_num_cols / list_cat_cols: column names routed to the numeric and
    categorical preprocessing branches respectively.
    """
    numeric_branch = Pipeline(steps = [('num_imputer', SimpleImputer(strategy = 'median'))])
    categorical_branch = Pipeline(steps = [('cat_imputer', SimpleImputer(strategy = 'most_frequent')),
                                           ('one-hot-encoder', OneHotEncoder())])
    preprocessor = ColumnTransformer(transformers = [('num', numeric_branch, list_num_cols),
                                                     ('cat', categorical_branch, list_cat_cols)])
    models = {}
    # One pipeline per candidate number of selected features
    for n_features in range(2, 4):
        selector = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = n_features)
        regressor = DecisionTreeRegressor()
        models['DecisionTreeRegressor_' + str(n_features)] = Pipeline(steps = [('preprocessor', preprocessor),
                                                                               ('s_dtr', selector),
                                                                               ('m_dtr', regressor)])
    return models

# Evaluate a given model with repeated k-fold cross-validation
def evaluate_model(model, X, y):
    """Return the per-fold negative-MAE scores for `model` on (X, y)."""
    splitter = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 7)
    fold_scores = cross_val_score(model, X, y, scoring = 'neg_mean_absolute_error',
                                  cv = splitter, n_jobs = -1, error_score = 'raise')
    return fold_scores


# Define the dataset (`my_df` / 'my_target' are assumed to exist in the caller's scope)
X, y = get_dataset(my_df, 'my_target')   # It begins here
# Get the models to evaluate: dtype decides the preprocessing branch each column takes
models = get_models(X.select_dtypes(include = 'number').columns.tolist(), 
                    X.select_dtypes(include = 'object').columns.tolist())
# Evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)   # per-fold neg-MAE scores
    results.append(scores)
    names.append(name)
    # Report mean and standard deviation of the cross-validation scores
    print('%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

The following is returning errors:

models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_[0] # Returns: AttributeError: 'RFE' object has no attribute 'support_'
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_[0] # Returns: AttributeError: 'RFE' object has no attribute 'ranking_'


Solution 1:[1]

The point is that you haven't explicitly fitted the 'DecisionTreeRegressor_2' pipeline.

Indeed, although cross_val_score takes care of fitting the estimator internally, it does not return the fitted estimator instance the way the .fit() method does. Therefore you are not able to access the RFE instance attributes.

Here's a toy example from your setting:

from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression

# Minimal reproduction: same pipeline structure, synthetic data
X, y = make_regression()
models = {}
for n_features in range(2, 4):
    selector = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = n_features)
    regressor = DecisionTreeRegressor()
    # RFE step first, then the final regressor fitted on the selected features
    models['DecisionTreeRegressor_' + str(n_features)] = Pipeline(
        [('s_dtr', selector),
         ('m_dtr', regressor)])

models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_   # does not work: RFE is not fitted yet, so raises AttributeError

You might see, instead, that after fitting your model, you'll be able to access the support_ and ranking_ attributes:

models['DecisionTreeRegressor_2'].fit(X,y)   # fitting populates the RFE attributes
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_   # this works: boolean mask of selected features
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_   # this works: per-feature ranking (1 = selected)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1