How do I optimise a process once the ML model has been tested and fitted?
I've spot-checked/grid-searched several regression models in Python to predict a single continuous output from 5 controlled input variables, and it does well (95% accuracy). I've selected the best-performing algorithm from the spot check and grid search/hyperparameter optimisation, so I now have a fully working model that can make predictions.
How can I use the model to calculate which set of controlled variables will give me a desired output?
In optimisation terms: over which design region should I operate my 5 controlled input variables (X1, X2, X3, X4, X5) so that I am likely to get a desired output (Y)?
In other words, I want to fix a target value of Y and use the model to calculate which set of X1, X2, X3, X4, X5 I should use.
Any snippets?
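To show the kind of thing I imagine (not something I've got working), here is a rough sketch that treats the problem as numerically minimising the squared difference between the model's prediction and the target Y, using scipy.optimize.minimize. The `find_inputs_for_target` helper is a made-up name of mine, and the bounds, starting point and target value are placeholders:

```
# Rough, untested sketch (my own guess, not working code): treat "find the X's
# for a target Y" as minimising the squared difference between the model's
# prediction and the target. `find_inputs_for_target` is a made-up helper name;
# the bounds, start point and target below are placeholder values.
import numpy as np
from scipy.optimize import minimize

def find_inputs_for_target(model, y_target, x0, bounds):
    # objective: squared error between predicted Y and the desired Y
    def objective(x):
        y_pred = model.predict(x.reshape(1, -1))[0]
        return (y_pred - y_target) ** 2
    # bounded local optimisation over the 5 controlled variables
    result = minimize(objective, x0=np.asarray(x0, dtype=float), bounds=bounds, method='L-BFGS-B')
    best_x = result.x
    return best_x, model.predict(best_x.reshape(1, -1))[0]

# placeholder usage: 5 inputs, each restricted to a plausible operating range
# bounds = [(50, 70), (1, 2), (0, 0.1), (1, 2), (5, 10)]
# x_best, y_at_x = find_inputs_for_target(fitted_model, y_target=42.0,
#                                          x0=[58, 1.2, 0.05, 1.3, 7.0], bounds=bounds)
```

I suspect a gradient-based optimiser like this only makes sense for smooth models such as HuberRegressor; for the tree ensembles the prediction surface is piecewise constant, so maybe a grid or random search over the design space is the better route (see the second sketch after my code below).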
Here is my code:

```
# imports used throughout the script
import warnings
from numpy import mean, std, absolute
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression, LassoLars, Ridge, ElasticNet, HuberRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.multioutput import RegressorChain
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# load the dataset, returns X and y elements
def load_dataset():
    # ... read my data into X (n_samples x 5 inputs) and Y (continuous output) here ...
    return X, Y
# create a dict of standard models to evaluate {name: object}
def get_models(models=dict()):
    # linear models
    models['Linear Regression'] = LinearRegression()
    models['Least Angle'] = LassoLars()
    models['Ridge'] = Ridge()
    models['Elastic Net'] = ElasticNet()
    models['HuberRegressor'] = HuberRegressor()
    models['Support Vector Machine'] = SVR()
    # non-linear models
    n_neighbors = range(1, 20)
    for k in n_neighbors:
        models['K-neighbours' + str(k)] = KNeighborsRegressor(n_neighbors=k)
    models['Decision Tree'] = DecisionTreeRegressor()
    models['Extra Tree'] = ExtraTreeRegressor()
    # ensemble models
    models['Bagging'] = BaggingRegressor(n_estimators=100)
    models['Random Forest'] = RandomForestRegressor(n_estimators=100)
    # wrapped models
    tree = range(1, 1000, 500)
    for i in tree:
        models['Wrapped Extra Tree' + str(i)] = RegressorChain(ExtraTreesRegressor(n_estimators=i))
        models['Wrapped Random Forest' + str(i)] = RegressorChain(RandomForestRegressor(n_estimators=i))
        models['Wrapped Bagging' + str(i)] = RegressorChain(BaggingRegressor(n_estimators=i))
    print('Defined %d models' % len(models))
    return models
# create a feature preparation pipeline for a model
def make_pipeline(model):
    steps = list()
    # standardization
    steps.append(('standardize', StandardScaler()))
    # normalization
    steps.append(('normalize', MinMaxScaler()))
    # the model
    steps.append(('model', model))
    # create pipeline
    pipeline = Pipeline(steps=steps)
    return pipeline
# evaluate a single model with cross-validation
def evaluate_model(X, Y, model, folds, metric):
    # create the pipeline
    pipeline = make_pipeline(model)
    # evaluate model
    scores = cross_val_score(pipeline, X, Y, scoring=metric, cv=folds, n_jobs=-1)
    return scores

# evaluate a model, trapping any exception so one failure does not stop the run
def robust_evaluate_model(X, Y, model, folds, metric):
    scores = None
    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            scores = evaluate_model(X, Y, model, folds, metric)
    except Exception:
        scores = None
    return scores
# evaluate a dict of models {name: object}, returns {name: scores}
def evaluate_models(X, Y, models, folds=5, metric='neg_mean_squared_error'):
    results = dict()
    for name, model in models.items():
        # evaluate the model
        scores = robust_evaluate_model(X, Y, model, folds, metric)
        # show progress
        if scores is not None:
            # store a result
            results[name] = scores
            n_scores = absolute(scores)
            mean_score, std_score = mean(n_scores), std(n_scores)
            print('>%s: %.3f (+/-%.3f)' % (name, mean_score, std_score))
        else:
            print('>%s: error' % name)
    return results
# print and plot the top n results
def summarize_results(results, maximize=True, top_n=10):
    # check for no results
    if len(results) == 0:
        print('no results')
        return
    # determine how many results to summarize
    n = min(top_n, len(results))
    # create a list of (name, mean(scores)) tuples
    mean_scores = [(k, mean(v)) for k, v in results.items()]
    # sort tuples by mean score
    mean_scores = sorted(mean_scores, key=lambda x: x[1])
    # reverse for descending order (e.g. for accuracy-style metrics)
    if maximize:
        mean_scores = list(reversed(mean_scores))
    # retrieve the top n for summarization
    names = [x[0] for x in mean_scores[:n]]
    scores = [results[x[0]] for x in mean_scores[:n]]
    # print the top n
    print()
    for i in range(n):
        name = names[i]
        mean_score, std_score = mean(results[name]), std(results[name])
        print('Rank=%d, Name=%s, Score=%.3f (+/- %.3f)' % (i + 1, name, mean_score, std_score))
    # boxplot for the top n
    pyplot.figure(figsize=(20, 10))
    pyplot.boxplot(scores, labels=names)
    _, labels = pyplot.xticks()
    pyplot.setp(labels, rotation=70)
    pyplot.ylabel('Negative mean squared error')
    pyplot.title('Machine Learning Model Selection')
    pyplot.savefig('spotcheck.png', bbox_inches='tight')
# load dataset
X, Y = load_dataset()
# get model list
models = get_models()
# evaluate models
results = evaluate_models(X, Y, models, metric='neg_mean_squared_error')
# summarize results
summarize_results(results)

# final model: refit the best algorithm on the whole dataset, reusing the
# same preprocessing pipeline that was used during evaluation
Model_5 = make_pipeline(HuberRegressor())
Model_5.fit(X, Y)

# make a single prediction with known controlled variables
row = [58.3447131, 1.25, 0.05, 1.3, 7.2]
yhat = Model_5.predict([row])
# summarize the prediction
print('Predicted: %s' % yhat[0])
```
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
