sklearn pipeline is not working
I am new to the sklearn Pipeline and have been studying it from the sklearn documentation. I used it for sentiment analysis on movie review data. The data contains two columns: the first is the class and the second is the text.
import pandas as pd

input_file_df = pd.read_csv("movie-pang.csv")
x_train = input_file_df["text"]   # used the complete data as training data
y_train = input_file_df["class"]
I used only one feature, a sentiment score for each sentence, and wrote a custom transformer for it:
import numpy as np
from nltk.corpus import sentiwordnet as swn
from sklearn.base import BaseEstimator, TransformerMixin

class GetWorldLevelSentiment(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def get_word_level_sentiment(self, word_list):
        # multiply the dominant SentiWordNet score of each word into one review score
        sentiment_score = 1
        for word in word_list:
            word_sentiment = swn.senti_synsets(word)
            if len(word_sentiment) > 0:
                word_sentiment = word_sentiment[0]
            else:
                continue
            if word_sentiment.pos_score() > word_sentiment.neg_score():
                word_sentiment_score = word_sentiment.pos_score()
            elif word_sentiment.pos_score() < word_sentiment.neg_score():
                word_sentiment_score = word_sentiment.neg_score() * (-1)
            else:
                word_sentiment_score = word_sentiment.pos_score()
            print word, " ", word_sentiment_score
            if word_sentiment_score != 0:
                sentiment_score = sentiment_score * word_sentiment_score
        return sentiment_score

    def transform(self, review_list, y=None):
        sentiment_score_list = list()
        for review in review_list:
            sentiment_score_list.append(self.get_word_level_sentiment(review.split()))
        return np.asarray(sentiment_score_list)

    def fit(self, x, y=None):
        return self
The pipeline I used is:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ("word_level_sentiment", GetWorldLevelSentiment()),
    ("clf", MultinomialNB())])
and then I called fit on the pipeline:
pipeline.fit(x_train, y_train)
But this gives me the following error:
This MultinomialNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.
Can someone please guide me on what I am doing wrong here? It would be a great help.
Solution 1:[1]
This worked for me:
import pandas
from nltk.corpus import sentiwordnet as swn
from sklearn.base import BaseEstimator, TransformerMixin

class GetWorldLevelSentiment(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def get_word_level_sentiment(self, word_list):
        sentiment_score = 1
        for word in word_list:
            word_sentiment = swn.senti_synsets(word)
            if len(word_sentiment) > 0:
                word_sentiment = word_sentiment[0]
            else:
                continue
            if word_sentiment.pos_score() > word_sentiment.neg_score():
                word_sentiment_score = word_sentiment.pos_score()
            elif word_sentiment.pos_score() < word_sentiment.neg_score():
                word_sentiment_score = word_sentiment.neg_score() * (-1)
            else:
                word_sentiment_score = word_sentiment.pos_score()
            print word, " ", word_sentiment_score
            if word_sentiment_score != 0:
                sentiment_score = sentiment_score * word_sentiment_score
        return sentiment_score

    def transform(self, review_list, y=None):
        sentiment_score_list = list()
        for review in review_list:
            sentiment_score_list.append(self.get_word_level_sentiment(review.split()))
        # return a 2-D structure (one column per feature) instead of a 1-D array
        return pandas.DataFrame(sentiment_score_list)

    def fit(self, x, y=None):
        return self
Solution 2:[2]
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pandas as pd

# Text transformer: selects a single text column from the input DataFrame
class TextTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer for text features
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        return self

    def transform(self, X):
        return X[self.key]

# Numeric transformer: selects a single numeric column from the input DataFrame
class NumberTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer for numeric features
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

def fit_predict(model, X_train, X_test, y_train):
    # vectorizer for the text feature (word n-grams with document-frequency limits)
    vec_tdidf = CountVectorizer(ngram_range=(2, 3), max_df=0.93, min_df=0.05)

    # pipeline for the cleaned text column
    text = Pipeline([
        ('transformer', TextTransformer(key='clear_text')),
        ('vectorizer', vec_tdidf)
    ])

    # pipelines for the numeric columns
    word_numeric = Pipeline([
        ('transformer', NumberTransformer(key='word_count'))
    ])
    posting_day = Pipeline([
        ('transformer', NumberTransformer(key='posting_day'))
    ])
    posting_month = Pipeline([
        ('transformer', NumberTransformer(key='posting_month'))
    ])
    post_theme = Pipeline([
        ('transformer', NumberTransformer(key='theme'))
    ])

    # combine all features into a single feature matrix
    features = FeatureUnion([('Text_Feature', text),
                             ('Num1_Feature', word_numeric),
                             ('Num3_Feature', posting_day),
                             ('Num4_Feature', posting_month),
                             ('Num6_Feature', post_theme)
                             ])

    # classifier
    clf = model

    # chain the feature extraction and the classifier
    pipe = Pipeline([('features', features),
                     ('clf', clf)
                     ])

    # train the model
    pipe_fit = pipe.fit(X_train, y_train)

    # predict on the test data
    preds = pipe_fit.predict(X_test)
    return preds, pipe_fit
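A minimal usage sketch for the helper above, assuming a DataFrame df that contains the columns referenced by the transformers (clear_text, word_count, posting_day, posting_month, theme) plus a hypothetical label column as the target:

# hypothetical data: df must provide the columns the transformers select
X = df[['clear_text', 'word_count', 'posting_day', 'posting_month', 'theme']]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# any sklearn classifier can be passed in as the model argument
preds, fitted_pipe = fit_predict(RandomForestClassifier(n_estimators=100), X_train, X_test, y_train)
print(metrics.accuracy_score(y_test, preds))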
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Saurabh Jain |
| Solution 2 | ????????? ??? |
