tensorflow: NLP automatic text generator always prints the same word

I apologize in advance, but I have just started exploring the world of NLP text generation. After training a neural network on a text, I am trying to generate new text from that model and an initial sentence. No matter which starting sentence (seed text) I begin with, every automatically generated word is 'and'. I do not understand why this happens or how to fix it. Again, I am very new to this, so I would appreciate any kind of help.

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 
import matplotlib.pyplot as plt

data=['Hello, Chicago. If there is anyone out there who still doubts that America is a place where all things are possible,',
'who still wonders if the dream of our founders is alive in our time, who still questions the power of our democracy, tonight is your answer.',
'It’s the answer told by lines that stretched around schools and churches in numbers this nation has never seen,',
'by people who waited three hours and four hours, many for the first time in their lives, ',
'because they believed that this time must be different, that their voices could be that difference.',
'It’s the answer spoken by young and old, rich and poor, Democrat and Republican, black, white,',
'Hispanic, Asian, Native American, gay, straight, disabled and not disabled.',
'Americans who sent a message to the world that we have never been just a collection of individuals',
'or a collection of red states and blue states.',
'We are, and always will be, the United States of America.']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# pad sequences 
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

xs, labels = input_sequences[:, :-1], input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
nr_epochs=50
model = Sequential()
model.add(Embedding(total_words, nr_epochs, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(learning_rate=0.01)  # 'lr' is deprecated in recent Keras versions
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(xs, ys, epochs=nr_epochs, verbose=1)
seed_text = "If there is anyone out there who still doubts"
next_words = 100
  
for _ in range(next_words):
    token_list1 = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list1], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted.all():
            output_word = word
            break
    seed_text += " " + output_word
print(seed_text) # it prints the starting sentence plus 'and' 100 times
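
The likely culprit is the line "if index == predicted.all():". model.predict returns a (1, total_words) array of softmax probabilities, and predicted.all() collapses that array to the single boolean True, because every probability is nonzero. Since True == 1 in Python, the loop always picks the word whose tokenizer index is 1, and Keras assigns index 1 to the most frequent word in the corpus, here evidently 'and'. A minimal sketch of a corrected generation loop, selecting the highest-probability index with np.argmax instead:

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict returns softmax probabilities of shape (1, total_words);
    # argmax picks the index of the most probable next word
    predicted_index = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
    # index_word is the Tokenizer's built-in reverse lookup (index -> word)
    output_word = tokenizer.index_word.get(predicted_index, "")
    seed_text += " " + output_word
print(seed_text)

With this change, each generated word follows the model's actual prediction instead of always resolving to the word at index 1.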


Solution 1:[1]

You can use unittest.mock.patch to patch the google.oauth2.service_account module and the googleapiclient.discovery.build function.

E.g.

example.py:

from google.oauth2 import service_account
from googleapiclient.discovery import build

def get_data_from_google_sheets(spreadsheet_id, google_creds):
    scopes = ['https://www.googleapis.com/auth/sqlservice.admin','https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
    range_name = 'A1:AA1000'
    credentials = service_account.Credentials.from_service_account_info(google_creds, scopes = scopes)
    service = build('sheets', 'v4', credentials = credentials)
    sheet = service.spreadsheets()
    result_from_sheet = sheet.values().get(spreadsheetId = spreadsheet_id,range = range_name).execute()
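    # Skip the header row; return only the data rows.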
    rows = result_from_sheet['values'][1:]
    return rows

test_example.py:

from unittest import TestCase
import unittest
from unittest.mock import Mock, patch
from example import get_data_from_google_sheets


class TestExample(TestCase):
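    # Patch the names where they are looked up: example imported build and
    # service_account into its own namespace, so the targets are
    # 'example.build' and 'example.service_account.Credentials', not the
    # googleapiclient/google.oauth2 originals.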
    @patch('example.build')
    @patch('example.service_account.Credentials')
    def test_get_data(self, mock_service_account_credentials, mock_build):
        sheet_id = "123"
        creds = {"b": "d"}
        mock_service_account_credentials.from_service_account_info.return_value = '123'
        mock_build.return_value.spreadsheets.return_value.values.return_value.get.return_value.execute.return_value = {
            'values': []}
        result = get_data_from_google_sheets(sheet_id, creds)
        self.assertEqual(len(result), 0)


if __name__ == '__main__':
    unittest.main()
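
Because build is patched, no real network call is made: the chained return_value assignment mirrors the attribute/call chain service.spreadsheets().values().get(...).execute() and makes execute() return {'values': []}, so the function under test yields zero rows.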

Test result:

.
----------------------------------------------------------------------
Ran 1 test in 0.004s

OK
Name                                         Stmts   Miss  Cover   Missing
--------------------------------------------------------------------------
src/stackoverflow/70791609/example.py           11      0   100%
src/stackoverflow/70791609/test_example.py      16      0   100%
--------------------------------------------------------------------------
TOTAL                                           27      0   100%

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 slideshowp2