library(tm)
## Loading required package: NLP
library(stringr)
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
# Read the first spam message, collapse its lines into a single string,
# seed the corpus with it, and tag the document as "spam"
spam_dir <- "C:/Users/xb645tb/Downloads/spam"
spam <- list.files(spam_dir)
tmp <- readLines(stringr::str_c(spam_dir, '/', spam[1]))
tmp <- str_c(tmp, collapse = "")
txt_corpus <- VCorpus(VectorSource(tmp))
meta(txt_corpus[[1]], "classification") <- "spam"
# Append the remaining spam messages, tagging each as "spam"
n <- 1
for (i in 2:length(spam)) {
  tmp <- readLines(stringr::str_c(spam_dir, '/', spam[i]))
  tmp <- str_c(tmp, collapse = "")
  tmp_corpus <- VCorpus(VectorSource(tmp))
  txt_corpus <- c(txt_corpus, tmp_corpus)
  n <- n + 1
  meta(txt_corpus[[n]], "classification") <- "spam"
}
## Warning in readLines(stringr::str_c(spam_dir, "/", spam[i])):
## incomplete final line found on 'C:/Users/xb645tb/Downloads/spam/
## 0143.260a940290dcb61f9327b224a368d4af'
# Do the same for the ham messages, tagging each as "ham"
ham_dir <- "C:/Users/xb645tb/Downloads/easy_ham"
ham <- list.files(ham_dir)
for (i in 1:length(ham)) {
  tmp <- readLines(stringr::str_c(ham_dir, '/', ham[i]))
  tmp <- str_c(tmp, collapse = "")
  n <- n + 1
  tmp_corpus <- VCorpus(VectorSource(tmp))
  txt_corpus <- c(txt_corpus, tmp_corpus)
  meta(txt_corpus[[n]], "classification") <- "ham"
}
# Shuffle the documents so spam and ham are interleaved
txt_corpus <- sample(txt_corpus)
txt_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3051
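As a sanity check, the spam/ham balance can be read back from the document-level metadata attached above (a sketch reusing the classification tags set earlier):
# Count how many documents carry each classification tag
table(unlist(meta(txt_corpus, type = "local", tag = "classification")))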
# Standard cleaning: drop punctuation and numbers, collapse whitespace,
# lower-case, and remove English stop words
temp <- tm_map(txt_corpus, content_transformer(removePunctuation))
temp <- tm_map(temp, content_transformer(removeNumbers))
temp <- tm_map(temp, content_transformer(stripWhitespace))
temp <- tm_map(temp, content_transformer(tolower))
temp <- tm_map(temp, removeWords, words = stopwords("en"))
# Build the term-document matrix; keep only terms appearing in roughly 10+ documents
dtm <- TermDocumentMatrix(temp)
dtm <- removeSparseTerms(dtm, 1 - (10 / length(temp)))
dtm
## <<TermDocumentMatrix (terms: 5534, documents: 3051)>>
## Non-/sparse entries: 333479/16550755
## Sparsity : 98%
## Maximal term length: 98
## Weighting : term frequency (tf)
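To eyeball what survived the sparsity filter, tm's findFreqTerms can list the most common terms (a sketch; the cutoff of 200 occurrences is an arbitrary choice):
# Terms occurring at least 200 times across the corpus
findFreqTerms(dtm, lowfreq = 200)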
# Pull the per-document labels back out of the metadata, in the same
# (shuffled) order as the corpus
class <- as.vector(unlist(meta(temp, type = "local", tag = "classification")))
meta_data <- data.frame(type = unlist(class))
head(class, 10)
## [1] "ham" "ham" "ham" "ham" "ham" "ham" "spam" "ham" "ham" "ham"
# All 3051 documents are used as training data (no held-out set)
container <- create_container(dtm, labels = class,
                              trainSize = 1:3051,
                              virgin = FALSE)
slotNames(container)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
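Since trainSize spans all 3051 documents and no testSize is given, the tree below is scored on the same documents it is trained on. A sketch of an out-of-sample alternative (the 2500/551 split and the *_split names are assumptions introduced here; the corpus was already shuffled above, so a contiguous split is reasonable):
# Hold out the last 551 documents for testing
container_split <- create_container(dtm, labels = class,
                                    trainSize = 1:2500,
                                    testSize = 2501:3051,
                                    virgin = FALSE)
tree_split <- train_model(container_split, "TREE")
treeOut_split <- classify_model(container_split, tree_split)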
# Train a classification tree on the container and generate its predictions
tree <- train_model(container, "TREE")
treeOut <- classify_model(container, tree)
head(treeOut)
## TREE_LABEL TREE_PROB
## 1 ham 1
## 2 ham 1
## 3 ham 1
## 4 ham 1
## 5 ham 1
## 6 ham 1
head(class)
## [1] "ham" "ham" "ham" "ham" "ham" "ham"
According to these results, the first predictions shown agree with the true labels, each made with probability greater than 0.5 (here, probability 1).
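A quick overall check, assuming (as the comparison above does) that the rows of treeOut are in the same order as class:
# Confusion table and in-sample accuracy of the tree predictions
table(predicted = treeOut$TREE_LABEL, actual = class)
mean(as.character(treeOut$TREE_LABEL) == class)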