'Linear Regression Practice
I'm trying to run a basic linear regression, However I've encountered en error like this when I run my code.
Traceback (most recent call last):
File "/Users/brian_kang/Documents/Webtools/Project_Alliance_Data/Code/Python/Predictive_modelling/predictive_modelling.py", line 113, in <module>
linear_regression(train_x, train_y, test_x, test_y)
NameError: name 'train_x' is not defined
The dataset is the one I create myself as a mock data having no missing value.
# This is a predictive modelling for animal weight
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMClassifier
from xgboost import XGBRegressor
os.getcwd()
############################ Exploratory Data Analysis ##################################
def histo_plot(dataframe):
# Histogram for Kill Weight
plot = sns.displot(dataframe['Kill_Weight'])
return plot
def box_plot(dataframe):
# Boxplot Kill Weight over Animal Code
# Animal Code = 100: "Cattle", 200: "Calves", 300: "Sheep", 400: "Lambs", 500: "Deer")
var1 = 'Animal_Code'
var2 = 'Kill_Weight'
data = pd.concat([dataframe[var2], dataframe[var1]], axis=1)
f, ax = plt.subplots(figsize=(16,8))
fig = sns.boxplot(x=var1, y=var2, data = data)
plot = fig.axis(ymin = 0, ymax = 800)
return plot
def check_missing(dataframe):
# Check missing values
df_na = (dataframe.isnull().sum() / len(dataframe)) * 100
# Sorting by percentages by top 20
df_na = df_na.sort_values(ascending = False)[:20]
df_na = pd.DataFrame({"Missing %" : df_na})
return df_na
def corr_heatmap(dataframe):
# correlation matrix
corrmat = dataframe.corr()
f, ax = plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.8, square=True)
k = 10
cols = corrmat.nlargest(k, 'Kill_Weight')['Kill_Weight'].index
cm = np.corrcoef(dataframe[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size':10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
return None
def label_encode(dataframe):
# Encoding
label_encoder = LabelEncoder()
dataframe['Region'] = label_encoder.fit_transform(dataframe['Region'])
dataframe['Season'] = label_encoder.fit_transform(dataframe['Season'])
dataframe['Landtype'] = label_encoder.fit_transform(dataframe['Landtype'])
dataframe['Region'].value_counts().sort_values(ascending=False)
# df['Date'] = pd.to_datetime(df['Date'])
return dataframe
def split_data(dataframe):
# Split Train Test
train, test = train_test_split(dataframe, test_size=0.3, random_state=42)
train_x = train.drop(['Kill_Weight'], 1)
train_y = train['Kill_Weight']
test_x = test.drop(['Kill_Weight'], 1)
test_y = test['Kill_Weight']
return train_x, train_y, test_x, test_y
def linear_regression(train_X, train_Y, test_X, test_Y):
# Linear regression
lr = LinearRegression(n_jobs = -1)
lr.fit(train_X, train_Y)
prediction = lr.predict(test_X)
print('fit...')
# print(MAE(train_y, lr.predict(test_x)))
print_Metrics(test_Y, prediction)
return prediction
def print_Metrics(test, pred):
print("MAE:", metrics.mean_absolute_error(test, pred))
print('MSE:', metrics.mean_squared_error(test, pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(test, pred)))
return None
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
if __name__ == '__main__':
# Read the dataset
df = pd.read_csv('../../../Data/SampleData_Predictive_Modelling_scaleup.csv')
df_sub = df[
["Date", "FarmID", "Animal_Code", "Birth_Weight", "Kill_Weight", "Ages_month", "Region", "Landtype", "Season",
"Temp_max", "Temp_min", "Temp_avg", "Rainfall_mm", "Sunshine_Hour"]]
histo_plot(df_sub)
box_plot(df_sub)
check_missing(df_sub)
corr_heatmap(df_sub)
linear_regression(train_x, train_y, test_x, test_y)
label_encode(df_sub)
split_data(df_sub)
Solution 1:[1]
I do not have the reputation for commenting, sorry :(
You seem to have everything in place, it's your main function that's need a bit of fixing.
You call the function:
linear_regression(train_x, train_y, test_x, test_y)
before the function that actually gives you those inputs:
split_data(df_sub)
I recommend fixing it like so:
(train_x, train_y, test_x, test_y) = split_data(df_sub)
then:
linear_regression(train_x, train_y, test_x, test_y)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Tr??ng Qu?c Quân |
