How to properly use SMOTE for data balancing

Is it required to apply SMOTE only after splitting the data into train and test sets? I applied SMOTE after train_test_split for churn prediction, but I haven't seen any significant improvement between the pre-SMOTE and post-SMOTE results. Below is my entire code using SMOTE. I'm not sure where the issue is, and I would like to know whether I used SMOTE properly.

Below is the code

import pandas as pd
import numpy as np
from datetime import timedelta,datetime,date
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from numpy import percentile

# Load the Telco churn dataset and take a first look at dtypes and missing values.
tel_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
tel_data.info()
tel_data.isnull().sum()

# Encode the target column as 0/1.
num = {"No":0,"Yes":1}
tel_data = tel_data.replace({"Churn":num})

# TotalCharges is read as object dtype, so convert it to numeric.
# errors="coerce" turns entries that cannot be parsed into NaN instead of
# raising ValueError, which also makes those rows visible to isnull() below.
tel_data['TotalCharges'] = pd.to_numeric(tel_data['TotalCharges'], errors="coerce")
tel_data.head(2)

# Class balance of the target.
tel_data['Churn'].value_counts()
plt.figure(figsize=(6,5))
# Pass the column as the keyword argument `x`; recent seaborn releases do not
# interpret a positional Series as the x variable.
sns.countplot(x=tel_data['Churn'])
plt.show()

# The coerced TotalCharges entries now show up as nulls.
tel_data.isnull().sum()
# Delete the rows that contain null values.
tel_data = tel_data.dropna(axis=0)

# One-hot encode the categorical columns listed below; drop_first=True
# drops the first level of each encoded column.
categorical_columns = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]
tel_data = pd.get_dummies(tel_data, columns=categorical_columns, drop_first=True)

# Separate the target from the predictors. 'customerID' is left out of the
# predictors because it does not serve any purpose as a feature.
y = tel_data['Churn']
X = tel_data.drop(columns=['customerID', 'Churn'])

# Feature selection: chi-squared test of every feature against the target.
from sklearn.feature_selection import chi2

chi_scores = chi2(X, y)
chi_statistics, chi_p_values = chi_scores
print('chi_values:', chi_statistics, '\n')
print('p_values:', chi_p_values)

# Index the p-values by feature name, largest first, and plot them.
p_values = pd.Series(chi_p_values, index=X.columns).sort_values(ascending=False)

plt.figure(figsize=(12, 8))
p_values.plot.bar()
plt.show()

# Remove the features chosen for exclusion (presumably those with high
# p-values in the plot above -- confirm) together with 'customerID'.
columns_to_drop = [
    'PhoneService_Yes',
    'gender_Male',
    'MultipleLines_No phone service',
    'MultipleLines_Yes',
    'customerID',
]
tel_data.drop(columns=columns_to_drop, inplace=True)
tel_data.head(2)

# Rebuild predictors and target from the reduced frame
# ('customerID' has already been removed above).
y = tel_data['Churn']
X = tel_data.drop(columns=['Churn'])

# import sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

from sklearn.metrics import accuracy_score

# Split into train and test data. stratify=y keeps the Churn class ratio the
# same in both splits; 30% of the rows are held out for testing.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)

# Baseline model trained on the original (non-resampled) training split.
model_xgb_1 = xgb.XGBClassifier(n_estimators=100,
                                learning_rate=0.3,
                                max_depth=5,
                                random_state=42 )
xgbmod = model_xgb_1.fit(X_train,y_train)

# checking accuracy of training data
print('Accuracy of XGB classifier on training set: {:.2f}'
       .format(xgbmod.score(X_train, y_train)))

# Evaluate on the held-out test split. The fitted model is `xgbmod`; the
# previous code called an undefined name here and raised NameError.
y_xgb_pred = xgbmod.predict(X_test)
print(classification_report(y_test, y_xgb_pred))

from imblearn.over_sampling import SMOTE

# Resample the training split only; X_test / y_test are never passed to SMOTE.
smote_preprocess = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote_preprocess.fit_resample(X_train, y_train)

# Same hyperparameters as the model trained without resampling.
smote_xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.3,
    'max_depth': 5,
    'random_state': 42,
}
model_xgb_smote = xgb.XGBClassifier(**smote_xgb_params)
xgbmod_smote = model_xgb_smote.fit(X_train_resampled, y_train_resampled)

# Accuracy on the (resampled) training data.
resampled_train_accuracy = xgbmod_smote.score(X_train_resampled, y_train_resampled)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(resampled_train_accuracy))

# Evaluate on the untouched test split.
y_xgb_pred_smote = xgbmod_smote.predict(X_test)
print(classification_report(y_test, y_xgb_pred_smote))


Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source