Sklearn: only size-1 arrays can be converted to Python scalars

When I tried to use the word vectors built from Chinese text as features for sklearn, an error occurred. The shapes of x_train and word_vector are (747,) and (1, 100) respectively, and the latter's dtype is float64.

My guess was that some of the data might have a different type, but I traversed all of the data and everything looked fine (a minimal reproduction with fake data is included after the code below).

Here is the code:


import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
import SZ_function as sz
import gensim
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

def remove_stop_words(text):
    stop_words = sz.get_step_words('notebook/HIT.txt')
    text = text.split()
    word_list = ''
    for word in text:
        if word not in stop_words:
            word_list += word
            word_list += ' '
    return  word_list

def pre_process(path):
    data = pd.read_excel(path)
    data['text'] =  data['text'].apply(sz.remove_number_en)
    data['text'] = data['text'].apply(sz.cut_words)
    data['text'] = data['text'].apply(remove_stop_words)
    data = data.replace(to_replace='', value='None')
    data = data.replace(to_replace='None', value=np.nan).dropna()
    return data

def create_corpus(data):
    text = data['text']
    return [sentences.split() for sentences in text]

def word_vec(corpus):
    model = gensim.models.word2vec.Word2Vec(corpus)
    return model

def get_sent_vec(sent,model,size):
    vec = np.zeros(size).reshape((1,size))
    count = 0
    for word in sent[1:]:
        try:
            vec += model.wv[word].reshape((1,size))
            count += 1
        except:
            continue
    if count != 0:
        vec /= count
    return  vec


if __name__ == '__main__':
    data = pre_process('datasets_demo.xlsx')
    corpus = create_corpus(data)
    model = word_vec(corpus)
    data['text'] = data['text'].apply(get_sent_vec, model=model, size=100)
    x_train, y_train, x_test, y_test = train_test_split(data['text'], data['label'])
    estimator = MultinomialNB()
    estimator.fit(x_train,y_train)

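If it helps, here is a minimal sketch (made-up data, not my real dataset) that seems to reproduce the same error for me: after the apply call, data['text'] is a Series of shape (747,) whose elements are (1, 100) arrays, and sklearn apparently cannot convert such an object Series into a single 2-D float array.

import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

# five fake "sentences", each already turned into a (1, 100) averaged vector
fake_vectors = pd.Series([np.random.rand(1, 100) for _ in range(5)])
labels = np.array([0, 1, 0, 1, 0])

print(fake_vectors.shape)          # (5,)      -- like my (747,) x_train
print(fake_vectors.iloc[0].shape)  # (1, 100)  -- like my word_vector

# this line raises the same error chain for me
MultinomialNB().fit(fake_vectors, labels)
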
Here is the full traceback:

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\12996\AppData\Local\Temp\jieba.cache
Loading model cost 0.628 seconds.
Prefix dict has been built successfully.
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-8366eff678ac>", line 1, in <module>
    runfile('C:/Users/12996/Desktop/Tensorflow_/datasets_demo.py', wdir='C:/Users/12996/Desktop/Tensorflow_')
  File "E:\pycharm\PyCharm 2022.1\plugins\python\helpers\pydev\_pydev_bundle\pydev_umd.py", line 198, in runfile
    pydev_imports.execfile(filename, global_vars, local_vars)  # execute the script
  File "E:\pycharm\PyCharm 2022.1\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/12996/Desktop/Tensorflow_/datasets_demo.py", line 66, in <module>
    estimator.fit(x_train,y_train)
  File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\naive_bayes.py", line 663, in fit
    X, y = self._check_X_y(X, y)
  File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\naive_bayes.py", line 523, in _check_X_y
    return self._validate_data(X, y, accept_sparse="csr", reset=reset)
  File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\utils\validation.py", line 976, in check_X_y
    estimator=estimator,
  File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\utils\validation.py", line 746, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\pandas\core\series.py", line 857, in __array__
    return np.asarray(self._values, dtype)
ValueError: setting an array element with a sequence.
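
One direction I am considering (not sure it is the right fix, so please correct me): stack the per-sentence vectors into a single (n_samples, 100) matrix before splitting, and use a classifier that accepts negative feature values, since as far as I know MultinomialNB requires non-negative features while averaged word2vec components can be negative. I also noticed that train_test_split returns X_train, X_test, y_train, y_test in that order, so my unpacking above may be wrong as well. A sketch of this idea (reusing data, model and get_sent_vec from the script above):

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB  # handles continuous (possibly negative) features

# build one 2-D float matrix instead of a Series of (1, 100) arrays
vectors = data['text'].apply(get_sent_vec, model=model, size=100)
X = np.vstack(vectors.values)   # shape (747, 100), dtype float64
y = data['label'].values

# train_test_split returns X_train, X_test, y_train, y_test in this order
X_train, X_test, y_train, y_test = train_test_split(X, y)

estimator = GaussianNB()
estimator.fit(X_train, y_train)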

