LSTM time series prediction is not accurate

I'm using an LSTM to forecast the node_load1 metric from Prometheus. The training data has one value every 20 s, and the LSTM model predicts the node_load1 value for the next hour. The training data has shape [120000, 1]. A small portion of the training set is shown below:

2022-04-20 03:30:00,  0.79000
2022-04-20 03:30:20,  0.71000
2022-04-20 03:30:40,  0.51000
2022-04-20 03:31:00,  0.44000
2022-04-20 03:31:20,  0.37000
2022-04-20 03:31:40,  0.42000
2022-04-20 03:32:00,  0.66000
2022-04-20 03:32:20,  0.69000
2022-04-20 03:32:40,  0.84000
2022-04-20 03:33:00,  0.87000
2022-04-20 03:33:20,  0.69000
2022-04-20 03:33:40,  0.50000
2022-04-20 03:34:00,  0.35000
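For reference, here is roughly how such a series can be loaded and down-sampled before training (a simplified sketch; the file name and column names are illustrative, as the real values come from the Prometheus API):

import pandas as pd

# Illustrative only: read the 20 s samples shown above from a CSV with
# columns "ds" (timestamp) and "y" (node_load1 value).
df = pd.read_csv("node_load1.csv", names=["ds", "y"], parse_dates=["ds"])

# Down-sample to 120 s buckets by averaging, mirroring what sample_data() does below.
df_120s = df.resample("120s", label="right", on="ds").mean().reset_index()
print(df_120s.head())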

LSTM training code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# assumed imports: the original snippet omits them; tf.keras is one common choice
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras import optimizers
# Metric is the metric wrapper from prometheus-api-client (assumed from the API used)
from prometheus_api_client import Metric


class MetricPredictor:
    """LSTM-based predictor for a single Prometheus metric."""

    model_name = "lstm"
    model_description = "Forecasted value from LSTM model"
    model = None
    predicted_df = None
    metric = None

    def __init__(self, metric, rolling_data_window_size="10d", number_of_feature=375,
                 validation_ratio=0.2, parameter_tuning=False, sample_freq='120s'):
        """Initialize the Metric object."""
        self.metric = Metric(metric, rolling_data_window_size)
        self.number_of_features = number_of_feature
        self.scalar = MinMaxScaler(feature_range=(0, 1))
        self.parameter_tuning = parameter_tuning
        self.validation_ratio = validation_ratio
        self.sample_freq = sample_freq

    def sample_data(self, data):
        """Resample the raw series to self.sample_freq, averaging within each bucket."""
        data_sam = data.resample(self.sample_freq, label='right', on='ds').mean()
        data_sam = data_sam.reset_index()
        return data_sam

    def visualize_loss(self, history, title):
        loss = history.history["loss"]
        val_loss = history.history["val_loss"]
        epochs = range(len(loss))
        plt.figure()
        plt.plot(epochs, loss, "b", label="Training loss")
        plt.plot(epochs, val_loss, "r", label="Validation loss")
        plt.title(title)
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.legend()
        plt.savefig('./train_val_loss.png')

    def prepare_data(self, data):
        """Prepare the data for LSTM: sliding windows of number_of_features values plus targets."""
        train_x = np.array(data[:, 1])[np.newaxis, :].T

        # each extra column is the series shifted one more step into the future
        for i in range(self.number_of_features):
            train_x = np.concatenate((train_x, np.roll(data[:, 1], -i)[np.newaxis, :].T), axis=1)

        train_x = train_x[:train_x.shape[0] - self.number_of_features, :self.number_of_features]
        train_yt = np.roll(data[:, 1], -self.number_of_features + 1)
        # keep one target per input window so x and y have the same number of samples
        train_y = train_yt[:train_yt.shape[0] - self.number_of_features]
        # reshape to (samples, timesteps=1, features) as expected by the LSTM layer
        train_x = train_x.reshape(train_x.shape[0], 1, train_x.shape[1])
        return train_x, train_y

    def get_model(self, lstm_cell_count, dense_cell_count):
        """Build the model."""
        model = Sequential()
        model.add(LSTM(64, return_sequences=True, input_shape=(1, self.number_of_features)))
        model.add(Dropout(0.2))
        model.add(LSTM(lstm_cell_count))
        model.add(Dropout(0.2))
        model.add(Dense(dense_cell_count))
        model.add(Dense(1))
        return model

    def train(self, metric_data=None, prediction_duration=15, prediction_freq="15s", isLongPeriod=True):
        """Train the model and forecast the next prediction_duration values."""
        if metric_data:
            # because the rolling_data_window_size is set, this df should not bloat
            self.metric += Metric(metric_data)

        # resample the data
        if isLongPeriod:
            data_sampled = self.sample_data(self.metric.metric_values)
            self.metric.metric_values = data_sampled

        self.metric.metric_values = self.metric.metric_values.bfill()

        # normalising
        metric_values_np = self.metric.metric_values.values
        scaled_np_arr = self.scalar.fit_transform(metric_values_np[:, 1].reshape(-1, 1))
        metric_values_np[:, 1] = scaled_np_arr.flatten()

        self.lstm_cell_count = 32
        self.dense_cell_count = 64
        model = self.get_model(self.lstm_cell_count, self.dense_cell_count)
        data_x, data_y = self.prepare_data(metric_values_np)

        adam = optimizers.Adam(learning_rate=0.0001)
        model.compile(loss='mean_squared_error', optimizer=adam)
        history = model.fit(np.asarray(data_x).astype(np.float32), np.asarray(data_y).astype(np.float32),
                            epochs=50, batch_size=512, validation_split=self.validation_ratio,
                            verbose=1, shuffle=False)
        self.visualize_loss(history, "Training and Validation Loss")

        # seed the forecast with the last window of (scaled) observations
        data_test = np.asarray(metric_values_np[-self.number_of_features:, 1]).astype(np.float32)
        print("number_of_features", self.number_of_features)
        forecast_values = []

        # iterative multi-step forecast: feed each prediction back into the input window
        for i in range(prediction_duration):
            curr_pred_value = model.predict(data_test.reshape(1, 1, self.number_of_features)).flatten()[0]
            scaled_final_value = self.scalar.inverse_transform(curr_pred_value.reshape(1, -1)).flatten()[0]
            forecast_values.append(scaled_final_value)
            data_test = np.roll(data_test, -1)
            data_test[-1] = curr_pred_value

        dataframe_cols = {"yhat": np.array(forecast_values)}

        upper_bound = np.array(
            [
                forecast_values[i] + (np.std(forecast_values[:i]) * 4)
                for i in range(len(forecast_values))
            ]
        )
        upper_bound[0] = np.mean(
            forecast_values[0]
        )  # to account for no std of a single value
        lower_bound = np.array(
            [
                forecast_values[i] - (np.std(forecast_values[:i]) * 4)
                for i in range(len(forecast_values))
            ]
        )
        lower_bound[0] = np.mean(
            forecast_values[0]
        )  # to account for no std of a single value
        dataframe_cols["yhat_upper"] = upper_bound
        dataframe_cols["yhat_lower"] = lower_bound

        data = self.metric.metric_values
        maximum_time = max(data["ds"])
        print("maximum_time", maximum_time)
        print("len forecast_values", len(forecast_values))

        # forecast timestamps continue from the last observation at the (re)sampled frequency
        forecast_freq = '120s' if isLongPeriod else '20s'
        dataframe_cols["timestamp"] = pd.date_range(
            maximum_time, periods=len(forecast_values), freq=forecast_freq
        )

        forecast = pd.DataFrame(data=dataframe_cols)
        forecast = forecast.set_index("timestamp")
        print(forecast)

        self.predicted_df = forecast
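
For context, the predictor is driven roughly like this (a simplified sketch; the Prometheus URL and query details are illustrative, and the metric payload comes from prometheus-api-client):

from prometheus_api_client import PrometheusConnect
from prometheus_api_client.utils import parse_datetime

prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)

# Pull the last 30 days of node_load1 samples (illustrative query).
metric_data = prom.get_metric_range_data(
    metric_name="node_load1",
    start_time=parse_datetime("30d"),
    end_time=parse_datetime("now"),
)

# One hour ahead at 120 s steps = 30 forecast points.
predictor = MetricPredictor(metric_data[0], rolling_data_window_size="30d")
predictor.train(prediction_duration=30, isLongPeriod=True)
print(predictor.predicted_df)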

The training data for the past 30 days is shown in the figure below:

train data

and the LSTM's loss on the training and validation sets is:

train loss and validation loss

Comparison of the test-set predictions with the real data (predicted vs. true values over 1 hour):

The LSTM model performs well on the training and validation sets, but poorly on the test set. Could you please advise where the model can be improved?



Solution 1:[1]

There are a couple of syntactic issues with the code: application arguments should be separate from the body, and pub without a struct doesn't make sense either.

Unfortunately the documentation of their Rust interface is quite lacking (seems to be mostly "have a look at some examples then find out the rest through trial-and-error"). So I was unable to look up enough information to suggest a reasonably correct version.

Here are a couple of more pointers:

  • it's not clear what the input to this function is. You're referencing a msg object with a sender member there, but the only equivalent I could identify was the &[AccountInfo] argument which identifies the invoking account.
  • Alternatively, Solana programs receive a byte array of instruction data which apparently can have any content encoded within them.

I would suggest starting with their Hello World example, playing around with it a bit, and then continuing with your own app once you're more familiar with Rust syntax and Solana best practices.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 milgner