Corpus Link: https://spamassassin.apache.org/publiccorpus/
The corpus consists of ham and spam emails. A sample of 300 emails from each class is labeled and used to train a classifier, which then predicts labels for the remaining emails.
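If the archives are not already unpacked locally, they can be fetched first. A minimal sketch, assuming the two .tar.bz2 archive names are served directly under the corpus URL above:
# Download and unpack the two SpamAssassin archives used below
base_url <- "https://spamassassin.apache.org/publiccorpus/"
for (f in c("20030228_easy_ham.tar.bz2", "20050311_spam_2.tar.bz2")) {
  download.file(paste0(base_url, f), f, mode = "wb")
  untar(f, exdir = "Data")  # untar() auto-detects the bzip2 compression
}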
# Ham: read each message into one string and append it to the corpus
library(tm)
library(stringr)
library(magrittr)  # for %>%

ham_path <- "C:/Users/paperspace/Google Drive/CUNY/Courses/CUNY-repository/607/Week 10 - Text mining/Data/20030228_easy_ham.tar/20030228_easy_ham/easy_ham/"
filenames_ham <- list.files(ham_path)
# VCorpus keeps per-document metadata, which the labeling step below relies on;
# note the corpus starts with one empty placeholder document
full_ham_corpus <- VCorpus(VectorSource(""))
for (i in seq_along(filenames_ham)) {
  temp_ham <- readLines(str_c(ham_path, filenames_ham[i])) %>% str_c(collapse = "")
  full_ham_corpus <- c(full_ham_corpus, VCorpus(VectorSource(temp_ham)))
}
# Spam: same pattern for the spam_2 folder
spam_path <- "C:/Users/paperspace/Google Drive/CUNY/Courses/CUNY-repository/607/Week 10 - Text mining/Data/20050311_spam_2.tar/20050311_spam_2/spam_2/"
filenames_spam <- list.files(spam_path)
full_spam_corpus <- VCorpus(VectorSource(""))  # again begins with an empty placeholder document
for (i in seq_along(filenames_spam)) {
  temp_spam <- readLines(str_c(spam_path, filenames_spam[i])) %>% str_c(collapse = "")
  full_spam_corpus <- c(full_spam_corpus, VCorpus(VectorSource(temp_spam)))
}
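As an aside, the two loops above grow the corpus one document at a time. A faster, equivalent sketch reads every file into a character vector first and builds the corpus in one call; read_folder is a hypothetical helper, shown here but not used below, because dropping the placeholder documents would shift the hard-coded indices in the labeling step:
# Hypothetical alternative: vectorized read, single corpus construction
read_folder <- function(path) {
  files <- list.files(path, full.names = TRUE)
  texts <- vapply(files, function(f) str_c(readLines(f), collapse = ""), character(1))
  VCorpus(VectorSource(texts))  # no empty placeholder document
}
# full_ham_corpus  <- read_folder(ham_path)
# full_spam_corpus <- read_folder(spam_path)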
Combine the ham and spam corpora into a single corpus.
total_corpus <- c(full_ham_corpus, full_spam_corpus)
total_corpus1 <- total_corpus  # working copy for the cleaning steps below
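A quick size check keeps the index arithmetic below honest; remember that each sub-corpus starts with one empty placeholder document:
length(full_ham_corpus)   # 1 placeholder + number of ham messages
length(full_spam_corpus)  # 1 placeholder + number of spam messages
length(total_corpus)      # sum of the two above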
Perform the text-cleaning transformations on the working copy of the corpus.
library(SnowballC)  # stemmer used by stemDocument

total_corpus1 <- tm_map(total_corpus1, removeNumbers)
total_corpus1 <- tm_map(total_corpus1, content_transformer(tolower))  # lower-case first so stopword matching works
total_corpus1 <- tm_map(total_corpus1, removeWords, words = stopwords("en"))
# Replace punctuation with spaces so glued tokens ("foo.bar") split cleanly
total_corpus1 <- tm_map(total_corpus1, content_transformer(
  function(x) str_replace_all(x, "[[:punct:]]", " ")))
total_corpus1 <- tm_map(total_corpus1, removePunctuation)
total_corpus1 <- tm_map(total_corpus1, stripWhitespace)
total_corpus1 <- tm_map(total_corpus1, stemDocument)
# content_transformer keeps each document a PlainTextDocument, so a separate
# tm_map(..., PlainTextDocument) repair step is not needed
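To spot-check the cleaning, compare a message before and after the transformations; document 2 is the first real ham message:
substr(as.character(total_corpus[[2]]), 1, 200)   # raw text
substr(as.character(total_corpus1[[2]]), 1, 200)  # cleaned text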
# Label the first 300 documents as ham (index 1 is the empty placeholder;
# it is dropped in the rearrangement below, leaving 299 labeled ham messages)
for (i in 1:300) {
  meta(total_corpus1[[i]], "category") <- "ham"
}
# The spam block starts at 2502 with its own empty placeholder, so labeling
# 2502:2801 likewise yields 299 usable labeled spam messages
for (i in 2502:2801) {
  meta(total_corpus1[[i]], "category") <- "spam"
}
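Verify that the labels landed where intended; unlabeled documents return NULL and are dropped by unlist:
# Expect 300 of each class here (each count still includes one placeholder)
table(unlist(lapply(total_corpus1, meta, "category")))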
# Put the labeled documents first (299 ham + 299 spam = the training set),
# followed by the unlabeled remainder; the placeholders (1 and 2502) are dropped
rearrange_total_corpus <- c(total_corpus1[c(2:300)], total_corpus1[c(2503:2801)],
                            total_corpus1[c(301:2501)], total_corpus1[c(2802:length(total_corpus1))])
Create a document-term matrix and drop sparse terms at the 80% threshold, i.e., keep only terms that appear in at least 20% of the documents.
dtm_total_corpus <- DocumentTermMatrix(rearrange_total_corpus)
dtm_total_corpus <- removeSparseTerms(dtm_total_corpus, 0.80)  # keep terms present in >= 20% of documents
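A glance at what survives the sparsity cut, using tm's built-in helpers (the 500 threshold is arbitrary):
dim(dtm_total_corpus)                 # documents x retained terms
findFreqTerms(dtm_total_corpus, 500)  # terms occurring at least 500 times overall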
Gather the document-term matrix and its labels into an RTextTools container.
library(RTextTools)

category_label <- unlist(lapply(rearrange_total_corpus, meta, "category"))  # labels exist only for the training documents
n_labeled <- length(category_label)
container <- create_container(dtm_total_corpus, category_label, trainSize = 1:n_labeled,
                              testSize = (n_labeled + 1):length(rearrange_total_corpus),
                              virgin = TRUE)  # the test documents carry no known labels
Train the SVM model and show the output
library(DT)

svm_model <- train_model(container, "SVM")       # fit an SVM on the labeled documents
svm_out <- classify_model(container, svm_model)  # predict labels for the unlabeled ones
datatable(svm_out)  # interactive table: predicted label and probability per document
summary(svm_out)
## SVM_LABEL SVM_PROB
## ham :1899 Min. :0.5000
## spam:1399 1st Qu.:0.8367
## Median :0.9778
## Mean :0.9043
## 3rd Qu.:0.9911
## Max. :1.0000
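Because the unlabeled emails have no ground truth, the summary above reflects the SVM's confidence, not measured accuracy. A sketch of how accuracy could be estimated by cross-validating on the labeled documents alone, via RTextTools' cross_validate (the 4-fold choice and the cv_container built over only the labeled rows are assumptions, not part of the original pipeline):
# Cross-validate on just the labeled rows of the document-term matrix
cv_container <- create_container(dtm_total_corpus[1:n_labeled, ], category_label,
                                 trainSize = 1:n_labeled, virgin = FALSE)
cross_validate(cv_container, 4, "SVM")  # prints out-of-sample accuracy per fold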
Conclusion: The trained model assigns a ham/spam label to every remaining, previously unseen email. Since those emails carry no true labels, the 0.90 figure above is the SVM's mean classification probability, i.e., its average confidence in its own predictions, not a measured accuracy. A larger labeled training corpus (here only roughly 300 emails per class) would likely improve the model further.