Problem in text categorization preprocessing

I am using a dataset that includes text features (e.g. DESCRIPTION is one textual feature in my dataset). My questions:

(1) When I check the text (the DESCRIPTION variable) after preprocessing, it looks like it was not processed, because I can still see words in capital letters and stopwords like "An", "the", "of", etc.

(2) If I have two or more text features besides DESCRIPTION, do I need to perform the preprocessing separately for each text feature?

(NOTE: Posting on behalf of another person whose post is blocked by SO.)

library(quanteda)
library(dplyr)
library(e1071)
library(tm)        # VCorpus, tm_map, DocumentTermMatrix
library(SnowballC) # stemDocument
library(caret)     # createDataPartition, confusionMatrix

d <- read.csv("SONAR_RULES.csv", stringsAsFactors = TRUE)

glimpse(d)

Rows: 1,819
Columns: 14
$ PLUGIN_RULE_KEY             <fct> InsufficientBranchCoverage, InsufficientLineCo~
$ PLUGIN_CONFIG_KEY           <fct> , , , , , , , , , , S1120, , , , StringEqualit~
$ PLUGIN_NAME                 <fct> common-java, common-java, common-java, common-~
$ DESCRIPTION                 <fct> "An issue is created on a file as soon as the ~
$ SEVERITY                    <fct> MAJOR, MAJOR, MAJOR, MAJOR, MAJOR, MAJOR, MINO~
$ NAME                        <fct> "Branches should have sufficient coverage by t~
$ DEF_REMEDIATION_FUNCTION    <fct> LINEAR, LINEAR, LINEAR, LINEAR_OFFSET, LINEAR,~
$ REMEDIATION_GAP_MULT        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
$ DEF_REMEDIATION_BASE_EFFORT <fct> , , , 10min, , , 5min, 5min, 15min, 15min, 1mi~
$ GAP_DESCRIPTION             <fct> "number of uncovered conditions", "number of l~
$ SYSTEM_TAGS                 <fct> "bad-practice", "bad-practice", "convention", ~
$ IS_TEMPLATE                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ DESCRIPTION_FORMAT          <fct> HTML, HTML, HTML, HTML, HTML, HTML, HTML, HTML~
$ TYPE                        <fct> CODE_SMELL, CODE_SMELL, CODE_SMELL, CODE_SMELL~

# drop the remediation columns (names as shown by glimpse() above)
d$DEF_REMEDIATION_FUNCTION <- NULL
d$REMEDIATION_GAP_MULT <- NULL
d$DEF_REMEDIATION_BASE_EFFORT <- NULL
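Since stringsAsFactors = TRUE reads DESCRIPTION in as a factor (as glimpse() shows), one defensive step before building a corpus is to coerce the text column to plain character. This is an added precaution, not something the original run included:

# coerce the text column to character before corpus construction
d$DESCRIPTION <- as.character(d$DESCRIPTION)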

set.seed(32984)
indexes <- createDataPartition(d$TYPE, times = 1,
                               p = 0.7, list = FALSE)
train <- d[indexes,]
test <- d[-indexes,]
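createDataPartition() samples within each level of TYPE, so both splits should keep roughly the original class proportions. A quick sanity check:

# class proportions should be similar in train and test
prop.table(table(train$TYPE))
prop.table(table(test$TYPE))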


review_corpus <- VCorpus( VectorSource(train$DESCRIPTION))

review_corpus <- review_corpus %>%
  tm_map(content_transformer(tolower)) %>%       # lowercase
  tm_map(removeNumbers) %>%                      # remove numeric characters
  tm_map(removeWords, stopwords("english")) %>%  # remove stopwords (and, the, am, ...)
  tm_map(removePunctuation) %>%                  # remove punctuation marks
  tm_map(stemDocument) %>%                       # stem words (e.g. walking -> walk)
  tm_map(stripWhitespace)                        # collapse repeated whitespace
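One likely explanation for (1): tm_map() transforms review_corpus, not train$DESCRIPTION, so checking the original column will still show capitals and stopwords. To see the processed text, inspect the corpus itself, for example:

# show the first three processed documents (not the raw DESCRIPTION column)
lapply(review_corpus[1:3], as.character)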

train_dtm <- DocumentTermMatrix(review_corpus)


# keep only terms that occur at least 30 times in the training corpus
freq <- findFreqTerms(train_dtm, 30)
length(freq)

train_dtm <- train_dtm[, freq]
train_dtm$ncol
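To sanity-check the frequency cutoff, the surviving terms can be ranked by total count. This sketch assumes the slam package, which tm depends on:

# ten most frequent terms after filtering
head(sort(slam::col_sums(train_dtm), decreasing = TRUE), 10)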


# encode term counts as presence/absence factors (Bernoulli features)
bernoulli_conv <- function(x) {
  factor(ifelse(x > 0, 1, 0), levels = c(0, 1), labels = c("Absent", "Present"))
}
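A toy vector shows what the converter produces:

# counts above zero map to "Present", zeros to "Absent"
bernoulli_conv(c(0, 2, 1, 0))
# [1] Absent  Present Present Absent
# Levels: Absent Present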

# convert the document-term matrix to presence/absence features
train_x <- apply(train_dtm, 2, bernoulli_conv)

# create the target variable
train_label <- train$TYPE

model_bayes <- naiveBayes(train_x, train_label, laplace = 1)
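The fitted e1071 model exposes the class priors and per-term conditional tables, which is a quick way to confirm training looked sane; the term name used here ("branch") is hypothetical:

# class priors and the conditional distribution for one (hypothetical) term
model_bayes$apriori
model_bayes$tables[["branch"]]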


tokenize_text <- function(text){
  # Create corpus
  data_corpus <- VCorpus(VectorSource(text))
 
  # Cleaning (same steps, in the same order, as the training corpus)
  data_corpus_clean <- data_corpus %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(removePunctuation) %>%
    tm_map(stemDocument) %>%
    tm_map(stripWhitespace)
 
  # Document-term matrix, restricted to the terms kept from training
  data_dtm <- DocumentTermMatrix(data_corpus_clean,
                                 control = list(dictionary = freq))
 
  # Bernoulli Converter
  data_text <- apply(data_dtm, 2, bernoulli_conv)
 
  return(data_text)
}
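On question (2): each additional text feature needs the same cleaning, and tokenize_text() can be reused per column. A hedged sketch using GAP_DESCRIPTION from the dataset, with the caveat that the dictionary inside tokenize_text() (freq) was built from DESCRIPTION, so a real run would build a separate dictionary per feature:

# a second text feature, cleaned and encoded with the same pipeline
gap_x <- tokenize_text(test$GAP_DESCRIPTION)

# per-feature matrices can then be column-bound into one feature set
# (duplicate term names across features would need disambiguating)
test_x_all <- cbind(tokenize_text(test$DESCRIPTION), gap_x)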



test_x <- tokenize_text(test$DESCRIPTION)
test_label <- test$TYPE

pred_test <- predict(model_bayes, test_x)


# TYPE is multi-class (CODE_SMELL, ...), so no `positive =` level applies;
# "Recommended" is not a level of TYPE
confusionMatrix(pred_test, test_label)

