Problem in text categorization preprocessing
I am using a dataset that includes text features (e.g. DESCRIPTION is one textual feature in my dataset). My questions:
(1) When I check the text (the DESCRIPTION variable) after preprocessing, it looks like it was not processed: I can still see words in capital letters and stopwords like "An", "the", "of", etc.
(2) If I have two or more text features besides DESCRIPTION, do I need to perform the preprocessing separately for each text feature?
(NOTE: Posting on behalf of another person whose post is blocked by SO)
library(tm)        # VCorpus, tm_map, DocumentTermMatrix, findFreqTerms
library(SnowballC) # required by stemDocument
library(caret)     # createDataPartition, confusionMatrix
library(dplyr)
library(e1071)     # naiveBayes
d <- read.csv("SONAR_RULES.csv", stringsAsFactors = TRUE)
glimpse(d)
Rows: 1,819
Columns: 14
$ PLUGIN_RULE_KEY <fct> InsufficientBranchCoverage, InsufficientLineCo~
$ PLUGIN_CONFIG_KEY <fct> , , , , , , , , , , S1120, , , , StringEqualit~
$ PLUGIN_NAME <fct> common-java, common-java, common-java, common-~
$ DESCRIPTION <fct> "An issue is created on a file as soon as the ~
$ SEVERITY <fct> MAJOR, MAJOR, MAJOR, MAJOR, MAJOR, MAJOR, MINO~
$ NAME <fct> "Branches should have sufficient coverage by t~
$ DEF_REMEDIATION_FUNCTION <fct> LINEAR, LINEAR, LINEAR, LINEAR_OFFSET, LINEAR,~
$ REMEDIATION_GAP_MULT <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
$ DEF_REMEDIATION_BASE_EFFORT <fct> , , , 10min, , , 5min, 5min, 15min, 15min, 1mi~
$ GAP_DESCRIPTION <fct> "number of uncovered conditions", "number of l~
$ SYSTEM_TAGS <fct> "bad-practice", "bad-practice", "convention", ~
$ IS_TEMPLATE <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ DESCRIPTION_FORMAT <fct> HTML, HTML, HTML, HTML, HTML, HTML, HTML, HTML~
$ TYPE <fct> CODE_SMELL, CODE_SMELL, CODE_SMELL, CODE_SMELL~
# drop unused remediation columns (names as shown by glimpse() above)
d$DEF_REMEDIATION_FUNCTION <- NULL
d$REMEDIATION_GAP_MULT <- NULL
d$DEF_REMEDIATION_BASE_EFFORT <- NULL
set.seed(32984)
indexes <- createDataPartition(d$TYPE, times = 1, p = 0.7, list = FALSE)
train <- d[indexes, ]
test  <- d[-indexes, ]
review_corpus <- VCorpus(VectorSource(train$DESCRIPTION))
review_corpus <- review_corpus %>%
  tm_map(content_transformer(tolower)) %>%      # lowercase
  tm_map(removeNumbers) %>%                     # remove digits
  tm_map(removeWords, stopwords("english")) %>% # remove stopwords (and, the, am)
  tm_map(removePunctuation) %>%                 # remove punctuation marks
  tm_map(stemDocument) %>%                      # stem words (e.g. walking -> walk)
  tm_map(stripWhitespace)                       # collapse repeated whitespace
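Regarding question (1): tm_map only transforms the corpus object; the original train$DESCRIPTION column in the data frame is never modified, which is why it still shows capital letters and stopwords. A quick check on the corpus itself (assuming the pipeline above has run) makes the difference visible:

as.character(train$DESCRIPTION[1])  # raw data frame column: unchanged by tm_map
as.character(review_corpus[[1]])    # corpus document: lowercased, stemmed, stopwords removed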
train_dtm <- DocumentTermMatrix(review_corpus)
freq <- findFreqTerms(train_dtm, 30)  # terms occurring at least 30 times
length(freq)
train_dtm <- train_dtm[, freq]        # keep only those frequent terms
train_dtm$ncol
bernoulli_conv <- function(x) {
  # presence/absence encoding for Bernoulli naive Bayes
  factor(ifelse(x > 0, 1, 0), levels = c(0, 1), labels = c("Absent", "Present"))
}
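To see what the converter does, here is a quick check on a toy count vector (illustrative values only):

bernoulli_conv(c(0, 2, 5))
# [1] Absent  Present Present
# Levels: Absent Present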
# convert the document-term matrix
train_x <- apply(train_dtm, 2, bernoulli_conv)
# create the target variable
train_label <- train$TYPE
model_bayes <- naiveBayes(train_x, train_label, laplace = 1)
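As a quick sanity check (a minimal sketch, not part of the original workflow), in-sample accuracy can be estimated before touching the test set:

pred_train <- predict(model_bayes, train_x)
mean(pred_train == train_label)  # rough training-set accuracy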
tokenize_text <- function(text) {
  # Create corpus
  data_corpus <- VCorpus(VectorSource(text))
  # Cleaning: same steps, in the same order, as for the training corpus
  data_corpus_clean <- data_corpus %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(removePunctuation) %>%
    tm_map(stemDocument) %>%
    tm_map(stripWhitespace)
  # Document-term matrix using only the terms kept from the training data
  data_dtm <- DocumentTermMatrix(data_corpus_clean,
                                 control = list(dictionary = freq))
  # Bernoulli converter
  apply(data_dtm, 2, bernoulli_conv)
}
test_x <- tokenize_text(test$DESCRIPTION)
test_label <- test$TYPE
pred_test <- predict(model_bayes, test_x)
# TYPE has more than two classes, so no positive class is specified
confusionMatrix(pred_test, test_label)
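Regarding question (2): yes, each text feature is a separate corpus with its own vocabulary, so the same pipeline has to be run once per column and the resulting matrices column-bound into one feature matrix. A minimal sketch, assuming NAME is the second text feature (the helper preprocess_column and the desc_/name_ prefixes are hypothetical names introduced here; the 30-occurrence threshold may need lowering for short fields like NAME):

preprocess_column <- function(text, prefix, dict = NULL, min_freq = 30) {
  corp <- VCorpus(VectorSource(text)) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(removePunctuation) %>%
    tm_map(stemDocument) %>%
    tm_map(stripWhitespace)
  if (is.null(dict)) {                          # training: learn the dictionary
    dtm  <- DocumentTermMatrix(corp)
    dict <- findFreqTerms(dtm, min_freq)
    dtm  <- dtm[, dict]
  } else {                                      # test: reuse the training dictionary
    dtm <- DocumentTermMatrix(corp, control = list(dictionary = dict))
  }
  x <- apply(dtm, 2, bernoulli_conv)
  colnames(x) <- paste0(prefix, colnames(x))    # avoid term collisions across features
  list(x = x, dict = dict)
}

desc <- preprocess_column(train$DESCRIPTION, "desc_")
name <- preprocess_column(train$NAME, "name_")
model_all <- naiveBayes(cbind(desc$x, name$x), train_label, laplace = 1)
test_x_all <- cbind(preprocess_column(test$DESCRIPTION, "desc_", desc$dict)$x,
                    preprocess_column(test$NAME, "name_", name$dict)$x)
pred_all <- predict(model_all, test_x_all)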
Source: Stack Overflow, licensed under CC BY-SA 3.0.