library(tm)
## Loading required package: NLP
library(stringr)
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
# Read the first spam message, collapse its lines into a single string,
# seed the corpus with it, and tag the document as "spam"
spam_dir <- "C:/Users/xb645tb/Downloads/spam"
spam <- list.files(spam_dir)
tmp <- readLines(stringr::str_c(spam_dir, '/', spam[1]))
tmp <- str_c(tmp, collapse = "")
txt_corpus <- VCorpus(VectorSource(tmp))
meta(txt_corpus[[1]], "classification") <- "spam"
# Append the remaining spam messages, tagging each as "spam"
n <- 1
for (i in 2:length(spam)) {
  tmp <- readLines(stringr::str_c(spam_dir, '/', spam[i]))
  tmp <- str_c(tmp, collapse = "")
  tmp_corpus <- VCorpus(VectorSource(tmp))
  txt_corpus <- c(txt_corpus, tmp_corpus)
  n <- n + 1
  meta(txt_corpus[[n]], "classification") <- "spam"
}
## Warning in readLines(stringr::str_c(spam_dir, "/", spam[i])):
## incomplete final line found on 'C:/Users/xb645tb/Downloads/spam/
## 0143.260a940290dcb61f9327b224a368d4af'
# Do the same for the ham messages, tagging each as "ham"
ham_dir <- "C:/Users/xb645tb/Downloads/easy_ham"
ham <- list.files(ham_dir)
for (i in 1:length(ham)) {
  tmp <- readLines(stringr::str_c(ham_dir, '/', ham[i]))
  tmp <- str_c(tmp, collapse = "")
  n <- n + 1
  tmp_corpus <- VCorpus(VectorSource(tmp))
  txt_corpus <- c(txt_corpus, tmp_corpus)
  meta(txt_corpus[[n]], "classification") <- "ham"
}
# Shuffle the documents so spam and ham are interleaved
txt_corpus <- sample(txt_corpus)
txt_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3051
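As a sanity check, the spam/ham balance can be read back from the document-level metadata attached above (a sketch reusing the classification tags set earlier):
# Count how many documents carry each classification tag
table(unlist(meta(txt_corpus, type = "local", tag = "classification")))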
# Standard cleaning: drop punctuation and numbers, collapse whitespace,
# lower-case, and remove English stop words
temp <- tm_map(txt_corpus, content_transformer(removePunctuation))
temp <- tm_map(temp, content_transformer(removeNumbers))
temp <- tm_map(temp, content_transformer(stripWhitespace))
temp <- tm_map(temp, content_transformer(tolower))
temp <- tm_map(temp, removeWords, words = stopwords("en"))
# Build the term-document matrix; keep only terms appearing in roughly 10+ documents
dtm <- TermDocumentMatrix(temp)
dtm <- removeSparseTerms(dtm, 1 - (10 / length(temp)))
dtm
## <<TermDocumentMatrix (terms: 5534, documents: 3051)>>
## Non-/sparse entries: 333479/16550755
## Sparsity : 98%
## Maximal term length: 98
## Weighting : term frequency (tf)
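To eyeball what survived the sparsity filter, tm's findFreqTerms can list the most common terms (a sketch; the cutoff of 200 occurrences is an arbitrary choice):
# Terms occurring at least 200 times across the corpus
findFreqTerms(dtm, lowfreq = 200)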
# Pull the per-document labels back out of the metadata, in the same
# (shuffled) order as the corpus
class <- as.vector(unlist(meta(temp, type = "local", tag = "classification")))
meta_data <- data.frame(type = unlist(class))
head(class, 10)
## [1] "ham" "ham" "ham" "ham" "ham" "ham" "spam" "ham" "ham" "ham"
# All 3051 documents are used as training data (no held-out set)
container <- create_container(dtm, labels = class,
                              trainSize = 1:3051,
                              virgin = FALSE)
slotNames(container)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
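Since trainSize spans all 3051 documents and no testSize is given, the tree below is scored on the same documents it is trained on. A sketch of an out-of-sample alternative (the 2500/551 split and the *_split names are assumptions introduced here; the corpus was already shuffled above, so a contiguous split is reasonable):
# Hold out the last 551 documents for testing
container_split <- create_container(dtm, labels = class,
                                    trainSize = 1:2500,
                                    testSize = 2501:3051,
                                    virgin = FALSE)
tree_split <- train_model(container_split, "TREE")
treeOut_split <- classify_model(container_split, tree_split)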
# Train a classification tree on the container and generate its predictions
tree <- train_model(container, "TREE")
treeOut <- classify_model(container, tree)
head(treeOut)
## TREE_LABEL TREE_PROB
## 1 ham 1
## 2 ham 1
## 3 ham 1
## 4 ham 1
## 5 ham 1
## 6 ham 1
head(class)
## [1] "ham" "ham" "ham" "ham" "ham" "ham"
According to these results, the first predictions shown agree with the true labels, each made with probability greater than 0.5 (here, probability 1).
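A quick overall check, assuming (as the comparison above does) that the rows of treeOut are in the same order as class:
# Confusion table and in-sample accuracy of the tree predictions
table(predicted = treeOut$TREE_LABEL, actual = class)
mean(as.character(treeOut$TREE_LABEL) == class)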