(Machine Learning) Unable to convert array of bytes/strings into decimal numbers with dtype='numeric'

I'm trying to do some machine learning to predict the outcome of UFC matches. I have a lot of data, and the error appears on line 48. I believe the problem is that the first 3 columns are of data type string, so they cannot be converted to numeric. I have tried label encoding, but that just prints more errors. Is one-hot encoding the way forward, and if so, how and where do I put it into my program? Any help is hugely appreciated.

# Script from the question: trains a k-nearest-neighbours classifier on
# tab-separated text data and then predicts on a separate hold-out file.
#
# First attempt: load scikit-learn's bundled iris data set. Every name assigned
# here (data, features, targets) is reassigned further down.
from sklearn import preprocessing
from sklearn.datasets import load_iris

data = load_iris()
features = data.data
targets = data.target

import pandas as pd
import numpy as np

# Second attempt: read the data set from CSV and convert it to a NumPy array.
data = pd.read_csv("completedata.csv")
data = np.array(data)

# The next two lines are wrapped in quotes, so they are bare string expressions
# and are never executed: no LabelEncoder is created or applied.
"encoder = preprocessing.LabelEncoder()"

"features[:, 0] = encoder.fit_transform(features[:, 0])"#LabelEncoding attempt


# Columns 0-40 are taken as features and column 41 as the target.
# Both names are rebound to empty lists below, before they are used.
features = data[:, 0:41]
targets = data[:, 41]


import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Third attempt (the one that actually feeds the model): read the training
# data from a tab-separated text file.
fh = open("completedatatrain.txt", "r")
lines = fh.readlines()
fh.close()

features = []
targets = []

# Split every line on tabs; fields 0-40 become the feature row and field 41
# the target. All values are still strings at this point.
for line in lines:
    line = line.strip()
    line = line.split("\t")

    features.append(line[0:41])
    targets.append(line[41])

# Built from lists of str, so these arrays hold strings, not numbers.
features = np.array(features)
targets = np.array(targets)

# Hold out 20% of the rows for testing.
featuresTrain, featuresTest, targetsTrain, targetsTest = train_test_split(features, targets, test_size=0.2)

knn = KNeighborsClassifier(n_neighbors=5)
# The asker reports that the "Unable to convert array of bytes/strings" error
# is raised by this call.
knn.fit(featuresTrain, targetsTrain) #The error prints at here!

predictions = knn.predict(featuresTest)
print(predictions)

accuracy = accuracy_score(targetsTest, predictions)
print(accuracy)





# Hold-out prediction: every tab-separated line of the file is used whole as
# one feature row (no target field is split off).
fh = open("testingexample1.txt", "r")
lines = fh.readlines()
fh.close()

features = []

for line in lines:
    line = line.strip()
    line = line.split("\t")

    features.append(line)

holdBack = np.array(features)

predictions = knn.predict(holdBack)


Solution 1:[1]

You have to label-encode the fighters, weight_class, and method columns, since these columns contain strings.

Try it and let me know if it works

from sklearn import preprocessing
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import numpy as np
import pandas as pd

# Load the training data from the tab-separated text file.
# Fields 0-40 of every line form one feature row and field 41 is the target.
# All values are kept as strings here; `features` and `targets` are plain
# Python lists that get converted to arrays in the next step.
features = []
targets = []

# `with` guarantees the file is closed even if a malformed line raises.
with open("completedatatrain.txt", "r") as fh:
    for line in fh:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file); they would otherwise raise IndexError on field 41.
            continue
        fields = line.split("\t")

        features.append(fields[0:41])
        targets.append(fields[41])

Here is the added code

# Indices of the feature columns that hold strings instead of numbers.
# NOTE(review): the question states that the first three columns are strings;
# adjust this list if the file layout differs.
categorical = [0, 1, 2]

# Convert to arrays *before* encoding: `features` is a list of lists at this
# point, and a list does not support the `[:, col]` column indexing used below.
# dtype=object lets a column be overwritten in place with integer codes.
features = np.array(features, dtype=object)
targets = np.array(targets)

# Label-encode each categorical column with its own encoder and keep the fitted
# encoders, so the hold-out data can be mapped to the same integer codes.
# NOTE: scikit-learn documents LabelEncoder for target labels; OrdinalEncoder /
# OneHotEncoder are the feature-oriented equivalents. Integer codes impose an
# arbitrary ordering that the KNN distance will pick up, so one-hot encoding
# may suit these columns better.
encoders = {}
for col in categorical:
    encoder = preprocessing.LabelEncoder()
    features[:, col] = encoder.fit_transform(features[:, col])
    encoders[col] = encoder

# Everything was read from a text file, so the remaining columns are still
# strings. Convert the whole matrix to floats explicitly; a ValueError here
# means some non-numeric column is missing from `categorical`.
features = features.astype(float)

featuresTrain, featuresTest, targetsTrain, targetsTest = train_test_split(features, targets, test_size=0.2)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(featuresTrain, targetsTrain)

predictions = knn.predict(featuresTest)
print(predictions)

accuracy = accuracy_score(targetsTest, predictions)
print(accuracy)


# Hold-out prediction: every non-blank, tab-separated line of the file is used
# whole as one feature row (no target field is split off).
with open("testingexample1.txt", "r") as fh:
    rows = [line.strip().split("\t") for line in fh if line.strip()]

holdBack = np.array(rows, dtype=object)
for col, encoder in encoders.items():
    # Use transform(), not fit_transform(), so the hold-out rows get the same
    # codes as the training data. A value never seen during training raises
    # ValueError.
    holdBack[:, col] = encoder.transform(holdBack[:, col])
holdBack = holdBack.astype(float)

predictions = knn.predict(holdBack)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Kyriakos