'Linear Regression Practice

I'm trying to run a basic linear regression, However I've encountered en error like this when I run my code.

Traceback (most recent call last):
  File "/Users/brian_kang/Documents/Webtools/Project_Alliance_Data/Code/Python/Predictive_modelling/predictive_modelling.py", line 113, in <module>
    linear_regression(train_x, train_y, test_x, test_y)
NameError: name 'train_x' is not defined

The dataset is the one I create myself as a mock data having no missing value.

# This is a predictive modelling for animal weight

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMClassifier
from xgboost import XGBRegressor

os.getcwd()

############################ Exploratory Data Analysis ##################################

def histo_plot(dataframe):
    # Histogram for Kill Weight
    plot = sns.displot(dataframe['Kill_Weight'])
    return plot

def box_plot(dataframe):
    # Boxplot Kill Weight over Animal Code
    # Animal Code = 100: "Cattle", 200: "Calves", 300: "Sheep", 400: "Lambs", 500: "Deer")
    var1 = 'Animal_Code'
    var2 = 'Kill_Weight'
    data = pd.concat([dataframe[var2], dataframe[var1]], axis=1)
    f, ax = plt.subplots(figsize=(16,8))
    fig = sns.boxplot(x=var1, y=var2, data = data)
    plot = fig.axis(ymin = 0, ymax = 800)
    return plot

def check_missing(dataframe):
    # Check missing values
    df_na = (dataframe.isnull().sum() / len(dataframe)) * 100
    # Sorting by percentages by top 20
    df_na = df_na.sort_values(ascending = False)[:20]
    df_na = pd.DataFrame({"Missing %" : df_na})
    return df_na

def corr_heatmap(dataframe):
    # correlation matrix
    corrmat = dataframe.corr()
    f, ax = plt.subplots(figsize=(12,9))
    sns.heatmap(corrmat, vmax=0.8, square=True)

    k = 10
    cols = corrmat.nlargest(k, 'Kill_Weight')['Kill_Weight'].index
    cm = np.corrcoef(dataframe[cols].values.T)
    sns.set(font_scale=1.25)
    hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size':10}, yticklabels=cols.values, xticklabels=cols.values)
    plt.show()
    return None

def label_encode(dataframe):
    # Encoding
    label_encoder = LabelEncoder()
    dataframe['Region'] = label_encoder.fit_transform(dataframe['Region'])
    dataframe['Season'] = label_encoder.fit_transform(dataframe['Season'])
    dataframe['Landtype'] = label_encoder.fit_transform(dataframe['Landtype'])
    dataframe['Region'].value_counts().sort_values(ascending=False)
    # df['Date'] = pd.to_datetime(df['Date'])
    return dataframe

def split_data(dataframe):
    # Split Train Test
    train, test = train_test_split(dataframe, test_size=0.3, random_state=42)
    train_x = train.drop(['Kill_Weight'], 1)
    train_y = train['Kill_Weight']
    test_x = test.drop(['Kill_Weight'], 1)
    test_y = test['Kill_Weight']
    return train_x, train_y, test_x, test_y

def linear_regression(train_X, train_Y, test_X, test_Y):
    # Linear regression
    lr = LinearRegression(n_jobs = -1)
    lr.fit(train_X, train_Y)
    prediction = lr.predict(test_X)
    print('fit...')
    # print(MAE(train_y, lr.predict(test_x)))
    print_Metrics(test_Y, prediction)
    return prediction

def print_Metrics(test, pred):
    print("MAE:", metrics.mean_absolute_error(test, pred))
    print('MSE:', metrics.mean_squared_error(test, pred))
    print('RMSE:', np.sqrt(metrics.mean_squared_error(test, pred)))
    return None


# See PyCharm help at https://www.jetbrains.com/help/pycharm/
if __name__ == '__main__':

    # Read the dataset
    df = pd.read_csv('../../../Data/SampleData_Predictive_Modelling_scaleup.csv')
    df_sub = df[
        ["Date", "FarmID", "Animal_Code", "Birth_Weight", "Kill_Weight", "Ages_month", "Region", "Landtype", "Season",
         "Temp_max", "Temp_min", "Temp_avg", "Rainfall_mm", "Sunshine_Hour"]]

    histo_plot(df_sub)
    box_plot(df_sub)
    check_missing(df_sub)
    corr_heatmap(df_sub)

    linear_regression(train_x, train_y, test_x, test_y)
    label_encode(df_sub)
    split_data(df_sub) 


Solution 1:[1]

I do not have the reputation for commenting, sorry :(

You seem to have everything in place, it's your main function that's need a bit of fixing.

You call the function:

linear_regression(train_x, train_y, test_x, test_y)

before the function that actually gives you those inputs:

split_data(df_sub)

I recommend fixing it like so:

(train_x, train_y, test_x, test_y) = split_data(df_sub)

then:

linear_regression(train_x, train_y, test_x, test_y)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Tr??ng Qu?c Quân