Week10 Assignment

I struggled with this assignment a lot. First of all, whenever I tried to use tm, it kept giving me an error message, so I had to search online for a solution. I found out that I needed to update R so that RStudio could install tm correctly.

library(tm)

## Loading required package: NLP

library(RTextTools)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Directories holding the raw messages, one file per document.
ham <- "/Users/vivian/Desktop/SPS/607/Assignment11/easy_ham_2/"
spam <- "/Users/vivian/Desktop/SPS/607/Assignment11/spam_2/"

# Fix the RNG seed so the documents drawn by sample() below, and everything
# derived from them, are the same on every run.
set.seed(607)

# Read each directory into a corpus. VCorpus() is requested explicitly so the
# object class does not depend on which tm version is installed.
# The language tag is "en" to agree with the English list that stopwords()
# returns by default in the cleaning step.
# NOTE(review): assumes the messages are English - confirm.
spamcorpdat <- VCorpus(DirSource(spam), readerControl = list(language = "en"))
hamcorpdat <- VCorpus(DirSource(ham), readerControl = list(language = "en"))

# Keep at most the first 500 spam / 2500 ham documents. Unlike 1:n,
# head(seq_along(x), n) never indexes past the end of a shorter corpus.
spamcorpdat <- spamcorpdat[head(seq_along(spamcorpdat), 500)]
hamcorpdat <- hamcorpdat[head(seq_along(hamcorpdat), 2500)]

# Draw a random sample of 250 documents from each class.
spamcorpdat <- sample(spamcorpdat, 250)
hamcorpdat <- sample(hamcorpdat, 250)

# Tag every document with its class label.
meta(spamcorpdat, tag = "type") <- "spam"
meta(hamcorpdat, tag = "type") <- "ham"

# Merge both classes into one corpus, then shuffle the document order so the
# positional train/test split further down contains both classes.
spam_corpus <- c(spamcorpdat, hamcorpdat, recursive = TRUE)
spamcorpusR <- sample(spam_corpus)
spamcorpusR

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 1
## Content:  documents: 500

# Cleaning methods. I tried to use tm's "meta" at the beginning, but that did not work. I also struggled with this part.

# Normalise the text before building the document-term matrix: lower-case,
# drop digits, drop stopwords, drop punctuation, collapse whitespace.
# tolower() is a base function, not a tm transformation, so it is wrapped in
# content_transformer(); every element then stays a text document and no
# trailing PlainTextDocument conversion is needed.
corpus_clean <- spamcorpusR %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)

# Build the document-term matrix from the cleaned corpus; the surrounding
# parentheses make the assignment print its value.
(spamtm <- DocumentTermMatrix(corpus_clean))

## <<DocumentTermMatrix (documents: 500, terms: 23074)>>
## Non-/sparse entries: 75795/11461205
## Sparsity           : 99%
## Maximal term length: 239
## Weighting          : term frequency (tf)

# Pull the "type" tag of every document out of the corpus metadata and
# flatten its first column into a plain vector of labels.
spamtype <- unlist(
  meta(x = corpus_clean, tag = "type")[, 1]
)

# Peek at the first five labels.
head(spamtype, n = 5)

## [1] "spam" "ham"  "ham"  "spam" "ham"

# Prepare the container
# The documents are split by position: the first 80% train the model and the
# remainder is held out for testing (rows 1-400 and 401-500 when N is 500).
N <- length(spamtype)
train_n <- floor(0.8 * N)

# Both partitions must be non-empty; otherwise (train_n + 1):N would count
# backwards and select the wrong rows.
stopifnot(train_n >= 1, train_n < N)

container <- create_container(
  spamtm,
  labels = spamtype,
  trainSize = seq_len(train_n),
  testSize = (train_n + 1):N,
  virgin = FALSE
)

# List the slots of the container object.
slotNames(container)

## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"

# Fit an SVM on the training part of the container.
svm <- train_model(container, algorithm = "SVM")

# Classify the container's test documents with the fitted model.
svm_out <- classify_model(container, model = svm)

# Show the first rows of the classification result.
head(svm_out)

##   SVM_LABEL  SVM_PROB
## 1       ham 0.9875188
## 2      spam 0.9834110
## 3       ham 0.9875188
## 4      spam 0.8868934
## 5      spam 0.7726815
## 6       ham 0.9875188

Week10 Assignment

Vivian Kong

11/4/2016