Model works perfectly but GridSearch causes error
While working on a project I have come across a weird error: fitting my model works perfectly, but when I apply grid search it gives me an error. The code below creates all the necessary objects and uses them in the pipeline.
```python
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import time
from numpy.fft import fft


class DataPreprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Merge the labels into the features on the (year, weekofyear) keys.
        self.X_m = X.merge(y, on=['year', 'weekofyear'])
        return self

    def transform(self, X):
        # Convert week_start_date to unix timestamps and use them as the index.
        dt = pd.to_datetime(self.X_m["week_start_date"], format="%Y-%m-%d")
        unix = []
        for i in dt:
            unix.append(time.mktime(i.timetuple()))
        X_t = (self.X_m).reset_index().assign(date=unix).set_index(['date'])
        return X_t


class FourierComponents(BaseEstimator, TransformerMixin):
    """Create features based on sin(2*pi*f*t) and cos(2*pi*f*t)."""

    def __init__(self, n=10):
        self.n = n

    def fit(self, X, y=None):
        # Pick the dominant frequencies of the (mean-centred) label series.
        self.labels = X['total_cases']
        self.Y_t = fft(self.labels - (self.labels).mean())
        self.Y_t = self.Y_t[:len(self.labels)//2]
        self.ind_max = np.abs(self.Y_t).argsort()
        self.t_span = len(self.labels)
        self.f = np.linspace(0, len(self.Y_t), len(self.Y_t)) / self.t_span
        self.f_ind = self.f[self.ind_max]
        self.ind = pd.RangeIndex(start=1, stop=(len(X.index.get_level_values('date')) + 1)).values.reshape(-1, 1)
        return self

    def transform(self, X):
        # One cosine and one sine column for each of the n strongest frequencies.
        Xt = np.zeros((X.shape[0], 2*len(self.f_ind[-self.n:])))
        for i, f in enumerate(self.f_ind[-self.n:]):
            Xt[:, 2*i] = np.cos(2*np.pi*f*self.ind).reshape(-1)
            Xt[:, 2*i + 1] = np.sin(2*np.pi*f*self.ind).reshape(-1)
        return Xt


Unixdata = DataPreprocess()
fourier = FourierComponents()

model = Pipeline([
    ('indices', Unixdata),
    ('fourier', fourier),
    ('scalar', StandardScaler()),
    ('regressor', Ridge())
])

param_grid = {'fourier__n': list(range(3, 5)),
              'regressor__alpha': np.logspace(1, 4, 20)}

grid_search = GridSearchCV(model, param_grid, cv=5, verbose=1,
                           scoring='neg_mean_absolute_error')
grid_search.fit(sj_train_features, sj_train_labels)
```
Fitting the grid search here gives me this error:
```
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-167-cfce20172a59> in <module>
----> 1 grid_search.fit(sj_train_features, sj_train_labels)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
558 else:
559 fit_time = time.time() - start_time
--> 560 test_scores = _score(estimator, X_test, y_test, scorer)
561 score_time = time.time() - start_time - fit_time
562 if return_train_score:
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
605 scores = scorer(estimator, X_test)
606 else:
--> 607 scores = scorer(estimator, X_test, y_test)
608
609 error_msg = ("scoring must return a number, got %s (%s) "
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
85 for name, scorer in self._scorers.items():
86 if isinstance(scorer, _BaseScorer):
---> 87 score = scorer._score(cached_call, estimator,
88 *args, **kwargs)
89 else:
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
210 **self._kwargs)
211 else:
--> 212 return self._sign * self._score_func(y_true, y_pred,
213 **self._kwargs)
214
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
176 0.85...
177 """
--> 178 y_type, y_true, y_pred, multioutput = _check_reg_targets(
179 y_true, y_pred, multioutput)
180 check_consistent_length(y_true, y_pred, sample_weight)
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
82
83 """
---> 84 check_consistent_length(y_true, y_pred)
85 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
86 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
254 uniques = np.unique(lengths)
255 if len(uniques) > 1:
--> 256 raise ValueError("Found input variables with inconsistent numbers of"
257 " samples: %r" % [int(l) for l in lengths])
258
ValueError: Found input variables with inconsistent numbers of samples: [188, 748]
```
but

```python
model.fit(sj_train_features, sj_train_labels)
```

fits perfectly. Now I am wondering: why, and where is the mistake in the code? Can anyone point me in the right direction?
A small example (hopefully representative):
```python
sj_train_features = pd.DataFrame({
    'year': [1990] * 10,
    'weekofyear': np.arange(18, 28),
    'week_start_date': pd.date_range('1990-04-30', periods=10, freq='w'),
    'ndvi_ne': np.random.random(10),
    'station_precip': np.random.random(10) * 10,
}).set_index(['year', 'weekofyear'])

sj_train_labels = pd.Series(np.random.random(10) * 20,
                            index=sj_train_features.index, name='total_cases')
```
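Running the grid search on this toy data reproduces the failure (the fold sizes differ, but it is the same length mismatch):

```python
# Each fold's fit succeeds, but the first scoring call fails with
# e.g. "ValueError: Found input variables with inconsistent numbers
# of samples: [2, 8]" (8 training rows vs 2 test rows per fold).
grid_search.fit(sj_train_features, sj_train_labels)
```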
Solution 1
A little more than halfway through your traceback you can see a snippet from `_fit_and_score`, which indicates that the fitting succeeded but that scoring is what fails. Indeed, when I call `model.predict`, I always get back an array the same length as the training set, no matter how many rows I pass in. When that is compared against the true labels, the scorer correctly complains that the number of predictions does not match the number of labels: with cv=5, one fold has 748 training rows and 188 test rows, and the scorer receives 748 predictions for only 188 labels, which is exactly the `[188, 748]` pair in the error message.
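You can see this directly with the toy example from the question:

```python
# Predict for only 3 rows; we still get 10 predictions back, because
# DataPreprocess.transform ignores the X it is given and reuses the
# merged training frame stored during fit (self.X_m).
model.fit(sj_train_features, sj_train_labels)
preds = model.predict(sj_train_features.iloc[:3])
print(preds.shape)  # (10,) rather than (3,)
```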
I don't entirely understand what your `FourierComponents` is supposed to do, but I think its `transform` method needs to pay attention to the time index of the `X` it receives; one possible adjustment is sketched below.
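For illustration, here is a minimal sketch (my reading of the intent, not a verified fix) of how both transformers could be made fold-aware: `DataPreprocess.transform` works on the `X` it receives, and `FourierComponents` takes its labels from the `y` that `Pipeline` forwards to each step's `fit` instead of a merged `total_cases` column. The trailing-underscore attributes and the way test-fold time values are extrapolated from the training origin are my assumptions.

```python
import time
import numpy as np
import pandas as pd
from numpy.fft import fft
from sklearn.base import BaseEstimator, TransformerMixin


class DataPreprocess(BaseEstimator, TransformerMixin):
    """Index rows by unix timestamp, transforming whatever X is passed in."""

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        # Work on the incoming X, so each CV fold keeps its own row count.
        dt = pd.to_datetime(X["week_start_date"], format="%Y-%m-%d")
        unix = [time.mktime(t.timetuple()) for t in dt]
        return X.reset_index().assign(date=unix).set_index("date")


class FourierComponents(BaseEstimator, TransformerMixin):
    """Sin/cos features at the n strongest frequencies of the training labels."""

    def __init__(self, n=10):
        self.n = n

    def fit(self, X, y=None):
        # Take the labels from the y that Pipeline passes to fit.
        y = np.asarray(y)
        Y_t = fft(y - y.mean())[: len(y) // 2]
        f = np.linspace(0, len(Y_t), len(Y_t)) / len(y)
        self.f_ind_ = f[np.abs(Y_t).argsort()]
        # Remember origin and spacing of the training time axis, so that
        # held-out rows get time values on the same scale.
        dates = X.index.get_level_values("date").to_numpy()
        self.t0_ = dates[0]
        self.step_ = np.median(np.diff(dates))  # ~one week, in seconds
        return self

    def transform(self, X):
        # Time variable derived from X's own date index, so the output
        # always has exactly len(X) rows -- also for the test fold.
        dates = X.index.get_level_values("date").to_numpy()
        t = ((dates - self.t0_) / self.step_ + 1).reshape(-1, 1)
        freqs = self.f_ind_[-self.n:]
        Xt = np.zeros((X.shape[0], 2 * len(freqs)))
        for i, f in enumerate(freqs):
            Xt[:, 2 * i] = np.cos(2 * np.pi * f * t).ravel()
            Xt[:, 2 * i + 1] = np.sin(2 * np.pi * f * t).ravel()
        return Xt
```

With these versions, `transform(X_test)` returns one row per test sample, so the scorer's length check passes; whether sin/cos features extrapolated to unseen weeks make a sensible model is a separate question.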
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source
---|---
Solution 1 | Ben Reiniger