SVM with RTextTools n-grams: why do unigram, bigram, and trigram give similar accuracy?
I want to compare SVM classification accuracy using n-grams (unigram, bigram, and trigram). However, the accuracy I calculate is the same for all three n-gram settings. How can I fix my code so that each n-gram setting is evaluated correctly?
Read the dataset
df.data <- data.frame(
df_id = c(1:15),
df_content = c("Sleep difficulties are very common",
"Getting high quality sleep is key for good health",
"Sleeping well is just as important as exercise and eating wisely",
"two most common sleep issues - insomnia and obstructive sleep apnea",
"One in three people have problems sleeping at some point in their lives",
"Talk to your health care team to learn about alternatives to sleep medications, including remote insomnia treatment options",
"Effective treatments are available for both Insomnia and Sleep Apnea",
"Avoid using electronic devices in the bedroom",
"Avoid alcohol and caffeine before bedtime",
"Identify stressors & continue to manage stress",
"Always shower before you go to bed. It will make a lot of difference.",
"If you are eating meat and other kinds of meals, it is best to eat at least three to four hours before you go to bed so that the digestion is over",
"Just light one little lamp somewhere in the room where you sleep and you will see that these things will completely disappear",
"Sleep is a state where you are on the edge between the world of sounds and the world of silence, but you can only move into the world of silence when you are aware.",
"Improving sleep quality does not mean sleeping like a stone."),
stringsAsFactors = FALSE)
df.data
Get sentiment scores based on the AFINN lexicon
library("syuzhet")
df.data.sentiment <- get_sentiment(df.data$df_content, method="afinn")
df.data.sentiment
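For reference, method = "afinn" scores each sentence by summing the AFINN ratings (integers from -5 to +5) of any words found in the lexicon, so the result is one signed number per document. A standalone illustration (the example sentences are mine, not from the dataset):

library(syuzhet)
# Words absent from the AFINN lexicon contribute 0 to the sum
get_sentiment(c("good sleep", "terrible insomnia"), method = "afinn")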
# Collapse a numeric sentiment score to three levels:
# -1 (negative), 0 (neutral), 1 (positive); equivalent to base R's sign()
convertto3score <- function(scores) {
  ifelse(scores < 0, -1, ifelse(scores > 0, 1, 0))
}
df.data.sentiment <- convertto3score(df.data.sentiment)
df.data.sentiment
df.data$sentiment_score <- df.data.sentiment
df.data
Convert sentiment_score to a factor
df.data$sentiment_score <- factor(df.data$sentiment_score, levels = c(-1, 0, 1))
df.data$sentiment_score
Shuffle the rows of the data frame
set.seed(1234)
df.data <- df.data[sample(nrow(df.data)),]
Create the n-gram matrix functions, adapted from create_matrix() in the RTextTools package (see the note after these functions)
create_matrix_unigram <- function(textColumns, language = "english", minDocFreq = 1,
                                  maxDocFreq = Inf, minWordLength = 3, maxWordLength = Inf,
                                  ngramLength = 1, originalMatrix = NULL, removeNumbers = FALSE,
                                  removePunctuation = TRUE, removeSparseTerms = 0,
                                  removeStopwords = TRUE, stemWords = FALSE,
                                  stripWhitespace = TRUE, toLower = TRUE,
                                  weighting = weightTfIdf) {
  stem_words <- function(x) {
    split <- strsplit(x, " ")
    return(wordStem(unlist(split), language = language))
  }
  tokenize_ngrams <- function(x, n = ngramLength) {
    return(rownames(as.data.frame(unclass(textcnt(x, method = "string", n = n)))))
  }
  control <- list(bounds = list(local = c(minDocFreq, maxDocFreq)), language = language,
                  tolower = toLower, removeNumbers = removeNumbers,
                  removePunctuation = removePunctuation, stopwords = removeStopwords,
                  stripWhitespace = stripWhitespace,
                  wordLengths = c(minWordLength, maxWordLength), weighting = weighting)
  if (ngramLength > 1) {
    control <- append(control, list(tokenize = tokenize_ngrams), after = 7)
  } else {
    control <- append(control, list(tokenize = scan_tokenizer), after = 4)
  }
  if (stemWords == TRUE) control <- append(control, list(stemming = stem_words), after = 7)
  trainingColumn <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
  trainingColumn <- sapply(as.vector(trainingColumn, mode = "character"),
                           iconv, to = "UTF8", sub = "byte")
  corpus <- Corpus(VectorSource(trainingColumn), readerControl = list(language = language))
  matrix <- DocumentTermMatrix(corpus, control = control)
  if (removeSparseTerms > 0) matrix <- removeSparseTerms(matrix, removeSparseTerms)
  if (!is.null(originalMatrix)) {
    terms <- colnames(originalMatrix[, which(!colnames(originalMatrix) %in% colnames(matrix))])
    weight <- 0
    if (attr(originalMatrix, "weighting")[2] == "tf-idf") weight <- 0.000000001
    amat <- matrix(weight, nrow = nrow(matrix), ncol = length(terms))
    colnames(amat) <- terms
    rownames(amat) <- rownames(matrix)
    fixed <- as.DocumentTermMatrix(
      cbind(matrix[, which(colnames(matrix) %in% colnames(originalMatrix))], amat),
      weighting = weighting)
    matrix <- fixed
  }
  matrix <- matrix[, sort(colnames(matrix))]
  gc()
  return(matrix)
}
create_matrix_bigram <- function(textColumns, language = "english", minDocFreq = 1,
                                 maxDocFreq = Inf, minWordLength = 3, maxWordLength = Inf,
                                 ngramLength = 2, originalMatrix = NULL, removeNumbers = FALSE,
                                 removePunctuation = TRUE, removeSparseTerms = 0,
                                 removeStopwords = TRUE, stemWords = FALSE,
                                 stripWhitespace = TRUE, toLower = TRUE,
                                 weighting = weightTfIdf) {
  stem_words <- function(x) {
    split <- strsplit(x, " ")
    return(wordStem(unlist(split), language = language))
  }
  tokenize_ngrams <- function(x, n = ngramLength) {
    return(rownames(as.data.frame(unclass(textcnt(x, method = "string", n = n)))))
  }
  control <- list(bounds = list(local = c(minDocFreq, maxDocFreq)), language = language,
                  tolower = toLower, removeNumbers = removeNumbers,
                  removePunctuation = removePunctuation, stopwords = removeStopwords,
                  stripWhitespace = stripWhitespace,
                  wordLengths = c(minWordLength, maxWordLength), weighting = weighting)
  if (ngramLength > 1) {
    control <- append(control, list(tokenize = tokenize_ngrams), after = 7)
  } else {
    control <- append(control, list(tokenize = scan_tokenizer), after = 4)
  }
  if (stemWords == TRUE) control <- append(control, list(stemming = stem_words), after = 7)
  trainingColumn <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
  trainingColumn <- sapply(as.vector(trainingColumn, mode = "character"),
                           iconv, to = "UTF8", sub = "byte")
  corpus <- Corpus(VectorSource(trainingColumn), readerControl = list(language = language))
  matrix <- DocumentTermMatrix(corpus, control = control)
  if (removeSparseTerms > 0) matrix <- removeSparseTerms(matrix, removeSparseTerms)
  if (!is.null(originalMatrix)) {
    terms <- colnames(originalMatrix[, which(!colnames(originalMatrix) %in% colnames(matrix))])
    weight <- 0
    if (attr(originalMatrix, "weighting")[2] == "tf-idf") weight <- 0.000000001
    amat <- matrix(weight, nrow = nrow(matrix), ncol = length(terms))
    colnames(amat) <- terms
    rownames(amat) <- rownames(matrix)
    fixed <- as.DocumentTermMatrix(
      cbind(matrix[, which(colnames(matrix) %in% colnames(originalMatrix))], amat),
      weighting = weighting)
    matrix <- fixed
  }
  matrix <- matrix[, sort(colnames(matrix))]
  gc()
  return(matrix)
}
create_matrix_trigram <- function(textColumns, language = "english", minDocFreq = 1,
                                  maxDocFreq = Inf, minWordLength = 3, maxWordLength = Inf,
                                  ngramLength = 3, originalMatrix = NULL, removeNumbers = FALSE,
                                  removePunctuation = TRUE, removeSparseTerms = 0,
                                  removeStopwords = TRUE, stemWords = FALSE,
                                  stripWhitespace = TRUE, toLower = TRUE,
                                  weighting = weightTfIdf) {
  stem_words <- function(x) {
    split <- strsplit(x, " ")
    return(wordStem(unlist(split), language = language))
  }
  tokenize_ngrams <- function(x, n = ngramLength) {
    return(rownames(as.data.frame(unclass(textcnt(x, method = "string", n = n)))))
  }
  control <- list(bounds = list(local = c(minDocFreq, maxDocFreq)), language = language,
                  tolower = toLower, removeNumbers = removeNumbers,
                  removePunctuation = removePunctuation, stopwords = removeStopwords,
                  stripWhitespace = stripWhitespace,
                  wordLengths = c(minWordLength, maxWordLength), weighting = weighting)
  if (ngramLength > 1) {
    control <- append(control, list(tokenize = tokenize_ngrams), after = 7)
  } else {
    control <- append(control, list(tokenize = scan_tokenizer), after = 4)
  }
  if (stemWords == TRUE) control <- append(control, list(stemming = stem_words), after = 7)
  trainingColumn <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
  trainingColumn <- sapply(as.vector(trainingColumn, mode = "character"),
                           iconv, to = "UTF8", sub = "byte")
  corpus <- Corpus(VectorSource(trainingColumn), readerControl = list(language = language))
  matrix <- DocumentTermMatrix(corpus, control = control)
  if (removeSparseTerms > 0) matrix <- removeSparseTerms(matrix, removeSparseTerms)
  if (!is.null(originalMatrix)) {
    terms <- colnames(originalMatrix[, which(!colnames(originalMatrix) %in% colnames(matrix))])
    weight <- 0
    if (attr(originalMatrix, "weighting")[2] == "tf-idf") weight <- 0.000000001
    amat <- matrix(weight, nrow = nrow(matrix), ncol = length(terms))
    colnames(amat) <- terms
    rownames(amat) <- rownames(matrix)
    fixed <- as.DocumentTermMatrix(
      cbind(matrix[, which(colnames(matrix) %in% colnames(originalMatrix))], amat),
      weighting = weighting)
    matrix <- fixed
  }
  matrix <- matrix[, sort(colnames(matrix))]
  gc()
  return(matrix)
}
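Note that the three functions above are identical apart from the ngramLength default, so a single function taking ngramLength as an argument would do the same job. More importantly for the question: since tm 0.7, Corpus(VectorSource(...)) returns a SimpleCorpus, and DocumentTermMatrix() does not honour a custom tokenize control for SimpleCorpus objects. If that is the tm version in use, the bigram and trigram calls silently fall back to unigram tokenization, which would yield three identical feature matrices and hence identical accuracies. A minimal sketch of the usual workaround, using VCorpus and the n-gram tokenizer recipe from the tm FAQ (the names here are mine, not part of the original code):

library(tm)
library(NLP)

# Two-word tokenizer; change 2L to 3L for trigrams
BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2L), paste, collapse = " "), use.names = FALSE)

# VCorpus, not Corpus: a SimpleCorpus (the default for a VectorSource in
# tm >= 0.7) ignores the custom tokenize control entirely
corpus <- VCorpus(VectorSource(df.data$df_content))
dtm <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
head(Terms(dtm))  # should now show two-word terms such as "sleep apnea"

If the terms printed by head(Terms(dtm)) contain spaces, the tokenizer is taking effect and the n-gram settings will actually produce different matrices.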
Build the bag-of-words model matrix
library("RTextTools")
library("tm")
matrix <- create_matrix_unigram(df.data[, 2], language = "english", weighting = weightTfIdf, ngramLength = 1)
# matrix <- create_matrix_bigram(df.data[, 2], language = "english", weighting = weightTfIdf, ngramLength = 2)
# matrix <- create_matrix_trigram(df.data[, 2], language = "english", weighting = weightTfIdf, ngramLength = 3)
Cross-validation on the first 7 (shuffled) rows
containercv <- create_container(matrix, df.data[, 3], trainSize = 1:7, virgin = FALSE)
cvres <- cross_validate(containercv, nfold = 10, algorithm = "SVM", seed = 1234)
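A side note on this step: the container holds only 7 documents, so nfold = 10 cannot give every fold a document, and the per-fold estimates are degenerate. A less fragile variant (an adjustment of mine, keeping the original seed):

# 5 folds over 10 of the 15 shuffled rows: each fold averages two
# documents, still tiny but no longer more folds than documents
containercv <- create_container(matrix, df.data[, 3], trainSize = 1:10, virgin = FALSE)
cvres <- cross_validate(containercv, nfold = 5, algorithm = "SVM", seed = 1234)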
Fit the final model
trainids <- seq(1, floor(nrow(df.data) * 0.7))
# test rows start where the training rows end, so the two sets do not overlap
testids <- seq(floor(nrow(df.data) * 0.7) + 1, nrow(df.data))
containerfinal <- create_container(matrix, df.data[, 3], trainSize = trainids, virgin = FALSE)
models <- train_models(containerfinal, algorithms = "SVM")
Evaluate on the held-out test set
texts <- df.data[, 2][testids]
trueclass <- df.data[, 3][testids]
testmatrix <- create_matrix_unigram(texts, language = "english", weighting = weightTfIdf, ngramLength = 1, originalMatrix = matrix)
# testmatrix <- create_matrix_bigram(texts, language = "english", weighting = weightTfIdf, ngramLength = 2, originalMatrix = matrix)
# testmatrix <- create_matrix_trigram(texts, language = "english", weighting = weightTfIdf, ngramLength = 3, originalMatrix = matrix)
results <- predict(models[[1]], testmatrix)
table(trueclass, results)
# accuracy
sum(trueclass == results) / length(results)
# unigram accuracy --> [1] 0.5454545
# bigram accuracy --> [1] 0.5454545
# trigram accuracy --> [1] 0.5454545
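A quick way to confirm where the problem lies is to compare the three document-term matrices directly: if the custom tokenizer is being ignored, they contain exactly the same columns, and the SVM is trained on identical features in all three runs. A diagnostic sketch reusing the functions defined above:

m1 <- create_matrix_unigram(df.data[, 2], ngramLength = 1)
m2 <- create_matrix_bigram(df.data[, 2], ngramLength = 2)
m3 <- create_matrix_trigram(df.data[, 2], ngramLength = 3)

dim(m1); dim(m2); dim(m3)        # identical dimensions => identical features
identical(Terms(m1), Terms(m2))  # TRUE confirms the tokenizer is ignored
any(grepl(" ", Terms(m2)))       # genuine bigram terms must contain a space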
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow