Multinomial Naive Bayes works with one example but not the other (scikit-learn)

I'm just doing a super simple BBC news classifier to learn. I found this one code example online and it worked (it predicts if a town is in the countryside or by the city):

# Toy training set: one dict of word counts per town, labelled by Y below.
data = [
    {'house': 100, 'street': 50, 'shop': 25, 'car': 100, 'tree': 20},
    {'house': 5, 'street': 5, 'shop': 0, 'car': 10, 'tree': 500, 'river': 1},
]

# fit_transform() learns the feature columns (one per distinct key) from the
# training dicts and returns the dense count matrix.
dv = DictVectorizer(sparse=False)
X = dv.fit_transform(data)
Y = np.array([1, 0])

mnb = MultinomialNB()
mnb.fit(X, Y)

# Keep the test samples under their own name; the chained assignment
# `test_data = data = ...` used previously also overwrote the training list.
test_data = [
    {'house': 80, 'street': 20, 'shop': 15, 'car': 70, 'tree': 10, 'river': 1},
    {'house': 10, 'street': 5, 'shop': 1, 'car': 8, 'tree': 300, 'river': 0},
]

# Use transform(), not fit_transform(): the vectorizer must reuse the columns
# learned from the training data. Re-fitting on the test dicts only happened to
# work here because they contain exactly the same six keys as the training set.
mnb.predict(dv.transform(test_data))

So I took this and applied it to the BBC news dataset I found, using this code:

# Load the two CSV files into DataFrames; the code below reads their 'Text'
# and 'Category' columns.
# NOTE(review): the "test" frame comes from a file named bbc_train_jack.csv —
# confirm it really is the held-out split.
df_train = pd.read_csv("BBC-News-Train.csv")
df_test = pd.read_csv("bbc_train_jack.csv")

def clean_text(text):
    """Return *text* reduced to lower-case ASCII letters and single spaces.

    Every character that is not a-z or A-Z is turned into a space, runs of
    whitespace are collapsed to one space (leading/trailing whitespace is
    dropped), and the result is lower-cased.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.split()
    return ' '.join(words).lower()

# Add a normalised 'clean_text' column, derived from 'Text', to each frame.
for frame in (df_train, df_test):
    frame['clean_text'] = frame['Text'].apply(clean_text)

# Set built from the English list returned by stopwords.words(), used for
# membership tests in remove_stopwords().
stop_words = set(stopwords.words('english'))

# Tokenise a cleaned string and drop its stop words.
def remove_stopwords(text):
    """Split *text* on whitespace and return the tokens not in ``stop_words``.

    Note that the result is a list of words, not a re-joined string.
    """
    return [token for token in text.split() if token not in stop_words]

# Replace each cleaned string with its list of non-stop-word tokens.
df_train['clean_text'] = df_train['clean_text'].apply(remove_stopwords)
df_test['clean_text'] = df_test['clean_text'].apply(remove_stopwords)

def get_id(x):
    """Map a category name to its integer id.

    'business' -> 0, 'entertainment' -> 1, 'tech' -> 2, 'politics' -> 3,
    'sport' -> 4. Any other value falls back to 0, the same id as 'business'.
    """
    # Position in this tuple is the id assigned to the label.
    labels = ('business', 'entertainment', 'tech', 'politics', 'sport')
    category_id = 0
    for index, label in enumerate(labels):
        if x == label:
            category_id = index
    return category_id

# Encode the 'Category' value of every row as an integer id via get_id().
for frame in (df_train, df_test):
    frame['category_id'] = frame['Category'].apply(get_id)

def makeWordCount(listOfWords):
    """Return a dict of the (at most) five most frequent words and their counts.

    Entries are ordered from most to least common, as produced by
    ``Counter.most_common``.
    """
    top_five = Counter(listOfWords).most_common(5)
    return dict(top_five)

# Summarise each token list as a dict of its most common words and counts.
for frame in (df_train, df_test):
    frame['word_count'] = frame['clean_text'].apply(makeWordCount)

def getLen(d):
    """Return the number of items in *d*."""
    size = len(d)
    return size

# Record how many entries each row's word_count dict has.
df_train['len_word_count'] = df_train['word_count'].apply(getLen)
df_test['len_word_count'] = df_test['word_count'].apply(getLen)

print(df_test)

# One word -> count dict per training article.
data = df_train['word_count'].to_list()

print(data[:10])
print("")

# fit_transform() learns the feature columns from the training dicts (2121
# columns according to the reported error) and returns the dense matrix.
dv = DictVectorizer(sparse=False)
X = dv.fit_transform(data)
y = np.array(df_train['category_id'].to_list())

clf = MultinomialNB()
clf.fit(X, y)

# NOTE(review): the chained assignment also rebinds `data`, discarding the
# training list built above.
test_data = data = [{'eviction': 6, 'said': 4, 'show': 13, 'chart': 9, 'radio': 7}]

# NOTE(review): fit_transform() re-fits the vectorizer on test_data alone, so
# the resulting matrix has only the 5 columns for the keys above instead of
# the columns learned during training; this matches the reported
# "X has 5 features, but MultinomialNB is expecting 2121" error.
# dv.transform(test_data) would reuse the fitted training columns.
clf.predict(dv.fit_transform(test_data))

But I'm getting this error:

ValueError: X has 5 features, but MultinomialNB is expecting 2121 features as input.

Why is this happening? The test set is just the original data file split into two separate files.



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source