I struggled with this assignment a lot. First of all, whenever I tried to use tm, it kept giving me an error message, so I had to search online for a solution. I found out that I needed to update R so that RStudio could install tm correctly.
library(tm)
## Loading required package: NLP
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Build a shuffled, labelled corpus of 250 spam and 250 ham messages ----

# Directories holding the raw ham and spam message files
ham <- "/Users/vivian/Desktop/SPS/607/Assignment11/easy_ham_2/"
spam <- "/Users/vivian/Desktop/SPS/607/Assignment11/spam_2/"

# Read every file in each directory into a tm corpus.
# NOTE(review): the language tag was "lat" (Latin), which looks copied from
# the tm examples; the cleaning step uses English stop words, so "en" is
# used here. The tag is metadata only -- confirm against the data.
spamcorpdat <- Corpus(DirSource(spam), readerControl = list(language = "en"))
hamcorpdat <- Corpus(DirSource(ham), readerControl = list(language = "en"))

# Keep at most the first 500 spam / 2500 ham documents. The upper bound is
# clamped to the corpus length: indexing past the end (e.g. 1:2500 on a
# shorter corpus) would pad the corpus with empty NULL documents that could
# later be sampled and labelled as real messages.
spamcorpdat <- spamcorpdat[seq_len(min(500, length(spamcorpdat)))]
hamcorpdat <- hamcorpdat[seq_len(min(2500, length(hamcorpdat)))]

# Draw 250 documents from each class so the two classes are balanced.
# No seed is set in this section, so the sample differs between runs.
spamcorpdat <- sample(spamcorpdat, 250)
hamcorpdat <- sample(hamcorpdat, 250)

# Attach the class label as document-level metadata
meta(spamcorpdat, tag = "type") <- "spam"
meta(hamcorpdat, tag = "type") <- "ham"

# Merge both classes into one corpus, then shuffle the document order so
# that a later positional train/test split contains both classes
spam_corpus <- c(spamcorpdat, hamcorpdat, recursive = TRUE)
spamcorpusR <- sample(spam_corpus)
spamcorpusR
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 1
## Content: documents: 500
# Clean the corpus and build the document-term matrix ----
# Author's note: I tried to use "meta" of tm at the beginning but that did
# not work, and I struggled with this part as well.
#
# tolower() is a plain base function, not a tm transformation, so it is
# wrapped in content_transformer(). That keeps every document a proper
# text document through the whole pipeline, which makes the former trailing
# tm_map(PlainTextDocument) repair step unnecessary and preserves the
# per-document metadata that step discarded.
# Stop words are removed before punctuation so that contractions in the
# stop-word list can still match.
corpus_clean <- spamcorpusR %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)

# Term-frequency matrix: one row per document, one column per term
spamtm <- DocumentTermMatrix(corpus_clean)
spamtm
## <<DocumentTermMatrix (documents: 500, terms: 23074)>>
## Non-/sparse entries: 75795/11461205
## Sparsity : 99%
## Maximal term length: 239
## Weighting : term frequency (tf)

# Pull the "type" labels back out of the document-level metadata, in the
# same order as the rows of the matrix
spamtype <- unlist(meta(corpus_clean, "type")[, 1])
head(spamtype, 5)
## [1] "spam" "ham" "ham" "spam" "ham"
# Prepare the RTextTools container ----
# The first 80% of the (already shuffled) documents are used for training
# and the remainder for testing. Deriving the split from N instead of
# hard-coding 1:400 / 401:N keeps the ranges valid when the corpus size
# changes; with N = 500 this is exactly 1:400 and 401:500.
N <- length(spamtype)
stopifnot(N >= 2)
n_train <- floor(0.8 * N)

container <- create_container(
  spamtm,
  labels = spamtype,
  trainSize = seq_len(n_train),
  testSize = seq(n_train + 1, N),
  virgin = FALSE # labels are supplied for the test documents as well
)
slotNames(container)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
# Train and classify ----
# Fit a support vector machine on the training rows of the container, then
# label the test rows with the fitted model.
svm <- train_model(container, algorithm = "SVM")
svm_out <- classify_model(container, model = svm)

# Show the first six predictions with their estimated probabilities
head(svm_out, n = 6L)
## SVM_LABEL SVM_PROB
## 1 ham 0.9875188
## 2 spam 0.9834110
## 3 ham 0.9875188
## 4 spam 0.8868934
## 5 spam 0.7726815
## 6 ham 0.9875188