SVM with RTextTools n-grams: why do unigram, bigram, and trigram give similar accuracy?
I want to compare SVM classification accuracy using n-grams (unigram, bigram, and trigram). However, the accuracy I calculate is the same for all three n-gram settings. How can I fix my code so that each n-gram setting is evaluated correctly?
Read the dataset
df.data <- data.frame(
df_id = c(1:15),
df_content = c("Sleep difficulties are very common",
"Getting high quality sleep is key for good health",
"Sleeping well is just as important as exercise and eating wisely",
"two most common sleep issues - insomnia and obstructive sleep apnea",
"One in three people have problems sleeping at some point in their lives",
"Talk to your health care team to learn about alternatives to sleep medications, including remote insomnia treatment options",
"Effective treatments are available for both Insomnia and Sleep Apnea",
"Avoid using electronic devices in the bedroom",
"Avoid alcohol and caffeine before bedtime",
"Identify stressors & continue to manage stress",
"Always shower before you go to bed. It will make a lot of difference.",
"If you are eating meat and other kinds of meals, it is best to eat at least three to four hours before you go to bed so that the digestion is over",
"Just light one little lamp somewhere in the room where you sleep and you will see that these things will completely disappear",
"Sleep is a state where you are on the edge between the world of sounds and the world of silence, but you can only move into the world of silence when you are aware.",
"Improving sleep quality does not mean sleeping like a stone."),
stringsAsFactors = FALSE)
df.data
Get sentiment scores based on the AFINN lexicon
library("syuzhet")
df.data.sentiment <- get_sentiment(df.data$df_content, method="afinn")
df.data.sentiment
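For reference, method = "afinn" scores each sentence by summing the AFINN ratings (integers from -5 to +5) of any words found in the lexicon, so the result is one signed number per document. A standalone illustration (the example sentences are mine, not from the dataset):

library(syuzhet)
# Words absent from the AFINN lexicon contribute 0 to the sum
get_sentiment(c("good sleep", "terrible insomnia"), method = "afinn")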
# Collapse a numeric sentiment score to three levels:
# -1 (negative), 0 (neutral), 1 (positive); equivalent to base R's sign()
convertto3score <- function(scores) {
  ifelse(scores < 0, -1, ifelse(scores > 0, 1, 0))
}
df.data.sentiment <- convertto3score(df.data.sentiment)
df.data.sentiment
df.data$sentiment_score <- df.data.sentiment
df.data
Convert sentiment_score to a factor
df.data$sentiment_score <- factor(df.data$sentiment_score, levels = c(-1, 0, 1))
df.data$sentiment_score
Shuffle the rows of the data frame
set.seed(1234)
df.data <- df.data[sample(nrow(df.data)),]
Create the n-gram matrix functions, adapted from create_matrix() in the RTextTools package (see the note after these functions)
create_matrix_unigram <- function(textColumns, language = "english", minDocFreq = 1,
                                  maxDocFreq = Inf, minWordLength = 3, maxWordLength = Inf,
                                  ngramLength = 1, originalMatrix = NULL, removeNumbers = FALSE,
                                  removePunctuation = TRUE, removeSparseTerms = 0,
                                  removeStopwords = TRUE, stemWords = FALSE,
                                  stripWhitespace = TRUE, toLower = TRUE,
                                  weighting = weightTfIdf) {
  stem_words <- function(x) {
    split <- strsplit(x, " ")
    return(wordStem(unlist(split), language = language))
  }
  tokenize_ngrams <- function(x, n = ngramLength) {
    return(rownames(as.data.frame(unclass(textcnt(x, method = "string", n = n)))))
  }
  control <- list(bounds = list(local = c(minDocFreq, maxDocFreq)), language = language,
                  tolower = toLower, removeNumbers = removeNumbers,
                  removePunctuation = removePunctuation, stopwords = removeStopwords,
                  stripWhitespace = stripWhitespace,
                  wordLengths = c(minWordLength, maxWordLength), weighting = weighting)
  if (ngramLength > 1) {
    control <- append(control, list(tokenize = tokenize_ngrams), after = 7)
  } else {
    control <- append(control, list(tokenize = scan_tokenizer), after = 4)
  }
  if (stemWords == TRUE) control <- append(control, list(stemming = stem_words), after = 7)
  trainingColumn <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
  trainingColumn <- sapply(as.vector(trainingColumn, mode = "character"),
                           iconv, to = "UTF8", sub = "byte")
  corpus <- Corpus(VectorSource(trainingColumn), readerControl = list(language = language))
  matrix <- DocumentTermMatrix(corpus, control = control)
  if (removeSparseTerms > 0) matrix <- removeSparseTerms(matrix, removeSparseTerms)
  if (!is.null(originalMatrix)) {
    terms <- colnames(originalMatrix[, which(!colnames(originalMatrix) %in% colnames(matrix))])
    weight <- 0
    if (attr(originalMatrix, "weighting")[2] == "tf-idf") weight <- 0.000000001
    amat <- matrix(weight, nrow = nrow(matrix), ncol = length(terms))
    colnames(amat) <- terms
    rownames(amat) <- rownames(matrix)
    fixed <- as.DocumentTermMatrix(
      cbind(matrix[, which(colnames(matrix) %in% colnames(originalMatrix))], amat),
      weighting = weighting)
    matrix <- fixed
  }
  matrix <- matrix[, sort(colnames(matrix))]
  gc()
  return(matrix)
}
create_matrix_bigram <- function(textColumns, language = "english", minDocFreq = 1,
                                 maxDocFreq = Inf, minWordLength = 3, maxWordLength = Inf,
                                 ngramLength = 2, originalMatrix = NULL, removeNumbers = FALSE,
                                 removePunctuation = TRUE, removeSparseTerms = 0,
                                 removeStopwords = TRUE, stemWords = FALSE,
                                 stripWhitespace = TRUE, toLower = TRUE,
                                 weighting = weightTfIdf) {
  stem_words <- function(x) {
    split <- strsplit(x, " ")
    return(wordStem(unlist(split), language = language))
  }
  tokenize_ngrams <- function(x, n = ngramLength) {
    return(rownames(as.data.frame(unclass(textcnt(x, method = "string", n = n)))))
  }
  control <- list(bounds = list(local = c(minDocFreq, maxDocFreq)), language = language,
                  tolower = toLower, removeNumbers = removeNumbers,
                  removePunctuation = removePunctuation, stopwords = removeStopwords,
                  stripWhitespace = stripWhitespace,
                  wordLengths = c(minWordLength, maxWordLength), weighting = weighting)
  if (ngramLength > 1) {
    control <- append(control, list(tokenize = tokenize_ngrams), after = 7)
  } else {
    control <- append(control, list(tokenize = scan_tokenizer), after = 4)
  }
  if (stemWords == TRUE) control <- append(control, list(stemming = stem_words), after = 7)
  trainingColumn <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
  trainingColumn <- sapply(as.vector(trainingColumn, mode = "character"),
                           iconv, to = "UTF8", sub = "byte")
  corpus <- Corpus(VectorSource(trainingColumn), readerControl = list(language = language))
  matrix <- DocumentTermMatrix(corpus, control = control)
  if (removeSparseTerms > 0) matrix <- removeSparseTerms(matrix, removeSparseTerms)
  if (!is.null(originalMatrix)) {
    terms <- colnames(originalMatrix[, which(!colnames(originalMatrix) %in% colnames(matrix))])
    weight <- 0
    if (attr(originalMatrix, "weighting")[2] == "tf-idf") weight <- 0.000000001
    amat <- matrix(weight, nrow = nrow(matrix), ncol = length(terms))
    colnames(amat) <- terms
    rownames(amat) <- rownames(matrix)
    fixed <- as.DocumentTermMatrix(
      cbind(matrix[, which(colnames(matrix) %in% colnames(originalMatrix))], amat),
      weighting = weighting)
    matrix <- fixed
  }
  matrix <- matrix[, sort(colnames(matrix))]
  gc()
  return(matrix)
}
create_matrix_trigram <- function(textColumns, language = "english", minDocFreq = 1,
                                  maxDocFreq = Inf, minWordLength = 3, maxWordLength = Inf,
                                  ngramLength = 3, originalMatrix = NULL, removeNumbers = FALSE,
                                  removePunctuation = TRUE, removeSparseTerms = 0,
                                  removeStopwords = TRUE, stemWords = FALSE,
                                  stripWhitespace = TRUE, toLower = TRUE,
                                  weighting = weightTfIdf) {
  stem_words <- function(x) {
    split <- strsplit(x, " ")
    return(wordStem(unlist(split), language = language))
  }
  tokenize_ngrams <- function(x, n = ngramLength) {
    return(rownames(as.data.frame(unclass(textcnt(x, method = "string", n = n)))))
  }
  control <- list(bounds = list(local = c(minDocFreq, maxDocFreq)), language = language,
                  tolower = toLower, removeNumbers = removeNumbers,
                  removePunctuation = removePunctuation, stopwords = removeStopwords,
                  stripWhitespace = stripWhitespace,
                  wordLengths = c(minWordLength, maxWordLength), weighting = weighting)
  if (ngramLength > 1) {
    control <- append(control, list(tokenize = tokenize_ngrams), after = 7)
  } else {
    control <- append(control, list(tokenize = scan_tokenizer), after = 4)
  }
  if (stemWords == TRUE) control <- append(control, list(stemming = stem_words), after = 7)
  trainingColumn <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
  trainingColumn <- sapply(as.vector(trainingColumn, mode = "character"),
                           iconv, to = "UTF8", sub = "byte")
  corpus <- Corpus(VectorSource(trainingColumn), readerControl = list(language = language))
  matrix <- DocumentTermMatrix(corpus, control = control)
  if (removeSparseTerms > 0) matrix <- removeSparseTerms(matrix, removeSparseTerms)
  if (!is.null(originalMatrix)) {
    terms <- colnames(originalMatrix[, which(!colnames(originalMatrix) %in% colnames(matrix))])
    weight <- 0
    if (attr(originalMatrix, "weighting")[2] == "tf-idf") weight <- 0.000000001
    amat <- matrix(weight, nrow = nrow(matrix), ncol = length(terms))
    colnames(amat) <- terms
    rownames(amat) <- rownames(matrix)
    fixed <- as.DocumentTermMatrix(
      cbind(matrix[, which(colnames(matrix) %in% colnames(originalMatrix))], amat),
      weighting = weighting)
    matrix <- fixed
  }
  matrix <- matrix[, sort(colnames(matrix))]
  gc()
  return(matrix)
}
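Note that the three functions above are identical apart from the ngramLength default, so a single function taking ngramLength as an argument would do the same job. More importantly for the question: since tm 0.7, Corpus(VectorSource(...)) returns a SimpleCorpus, and DocumentTermMatrix() does not honour a custom tokenize control for SimpleCorpus objects. If that is the tm version in use, the bigram and trigram calls silently fall back to unigram tokenization, which would yield three identical feature matrices and hence identical accuracies. A minimal sketch of the usual workaround, using VCorpus and the n-gram tokenizer recipe from the tm FAQ (the names here are mine, not part of the original code):

library(tm)
library(NLP)

# Two-word tokenizer; change 2L to 3L for trigrams
BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2L), paste, collapse = " "), use.names = FALSE)

# VCorpus, not Corpus: a SimpleCorpus (the default for a VectorSource in
# tm >= 0.7) ignores the custom tokenize control entirely
corpus <- VCorpus(VectorSource(df.data$df_content))
dtm <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
head(Terms(dtm))  # should now show two-word terms such as "sleep apnea"

If the terms printed by head(Terms(dtm)) contain spaces, the tokenizer is taking effect and the n-gram settings will actually produce different matrices.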
Build the bag-of-words model matrix
library("RTextTools")
library("tm")
matrix <- create_matrix_unigram(df.data[, 2], language = "english", weighting = weightTfIdf, ngramLength = 1)
# matrix <- create_matrix_bigram(df.data[, 2], language = "english", weighting = weightTfIdf, ngramLength = 2)
# matrix <- create_matrix_trigram(df.data[, 2], language = "english", weighting = weightTfIdf, ngramLength = 3)
Cross-validation on the first 7 (shuffled) rows
containercv <- create_container(matrix, df.data[, 3], trainSize = 1:7, virgin = FALSE)
cvres <- cross_validate(containercv, nfold = 10, algorithm = "SVM", seed = 1234)
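A side note on this step: the container holds only 7 documents, so nfold = 10 cannot give every fold a document, and the per-fold estimates are degenerate. A less fragile variant (an adjustment of mine, keeping the original seed):

# 5 folds over 10 of the 15 shuffled rows: each fold averages two
# documents, still tiny but no longer more folds than documents
containercv <- create_container(matrix, df.data[, 3], trainSize = 1:10, virgin = FALSE)
cvres <- cross_validate(containercv, nfold = 5, algorithm = "SVM", seed = 1234)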
Fit the final model
trainids <- seq(1, floor(nrow(df.data) * 0.7))
# test rows start where the training rows end, so the two sets do not overlap
testids <- seq(floor(nrow(df.data) * 0.7) + 1, nrow(df.data))
containerfinal <- create_container(matrix, df.data[, 3], trainSize = trainids, virgin = FALSE)
models <- train_models(containerfinal, algorithms = "SVM")
Evaluate on the held-out test set
texts <- df.data[, 2][testids]
trueclass <- df.data[, 3][testids]
testmatrix <- create_matrix_unigram(texts, language = "english", weighting = weightTfIdf, ngramLength = 1, originalMatrix = matrix)
# testmatrix <- create_matrix_bigram(texts, language = "english", weighting = weightTfIdf, ngramLength = 2, originalMatrix = matrix)
# testmatrix <- create_matrix_trigram(texts, language = "english", weighting = weightTfIdf, ngramLength = 3, originalMatrix = matrix)
results <- predict(models[[1]], testmatrix)
table(trueclass, results)
# accuracy
sum(trueclass == results) / length(results)
# unigram accuracy --> [1] 0.5454545
# bigram accuracy --> [1] 0.5454545
# trigram accuracy --> [1] 0.5454545
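A quick way to confirm where the problem lies is to compare the three document-term matrices directly: if the custom tokenizer is being ignored, they contain exactly the same columns, and the SVM is trained on identical features in all three runs. A diagnostic sketch reusing the functions defined above:

m1 <- create_matrix_unigram(df.data[, 2], ngramLength = 1)
m2 <- create_matrix_bigram(df.data[, 2], ngramLength = 2)
m3 <- create_matrix_trigram(df.data[, 2], ngramLength = 3)

dim(m1); dim(m2); dim(m3)        # identical dimensions => identical features
identical(Terms(m1), Terms(m2))  # TRUE confirms the tokenizer is ignored
any(grepl(" ", Terms(m2)))       # genuine bigram terms must contain a space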
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow